def __init__(
    self,
    crawler,
    interval,
    project,
    subscription,
    max_messages=100,
    prevent_rescrape_for=None,
    pull_timeout=10,
):
    """
    Set up the Pub/Sub client and register the periodic pull task.

    ``prevent_rescrape_for`` may already be a ``timedelta``; any other value
    is passed through ``parse_float`` and, if that yields a float, is
    interpreted as a number of seconds. ``pull_timeout`` falls back to 10
    when ``parse_float`` returns a falsy value.

    Raises ``NotConfigured`` if ``pubsub_client()`` returns a falsy client.
    """
    self.client = pubsub_client()

    if not self.client:
        LOGGER.error("Google Cloud Pub/Sub Client could not be initialised")
        raise NotConfigured

    # pylint: disable=no-member
    self.subscription_path = self.client.subscription_path(project, subscription)
    self.max_messages = max_messages

    if isinstance(prevent_rescrape_for, timedelta):
        # already a duration, use it unchanged
        rescrape_window = prevent_rescrape_for
    else:
        rescrape_window = parse_float(prevent_rescrape_for)
        if isinstance(rescrape_window, float):
            # plain numbers are taken as seconds
            rescrape_window = timedelta(seconds=rescrape_window)
    self.prevent_rescrape_for = rescrape_window

    timeout = parse_float(pull_timeout)
    self.pull_timeout = timeout if timeout else 10

    self.last_scraped = {}
    self.setup_looping_task(self._pull_queue, crawler, interval)
def __init__(self, tag_file, date=None, seconds=None):
    """
    Store the tag file location together with the parsed date / seconds.

    ``date`` is parsed with ``parse_date`` (UTC as ``tzinfo``) and ``seconds``
    with ``parse_float``. The parent directory of the resolved tag file is
    created if it does not exist yet.

    Raises ``NotConfigured`` when neither parsed value is truthy.
    """
    parsed_date = parse_date(date, tzinfo=timezone.utc)
    parsed_seconds = parse_float(seconds)

    if not (parsed_date or parsed_seconds):
        raise NotConfigured

    tag_path = Path(tag_file).resolve()
    self.tag_file = tag_path
    tag_path.parent.mkdir(parents=True, exist_ok=True)

    self.date = parsed_date
    self.seconds = parsed_seconds
def _parse_timeout(timeout):
    """
    Convert a timeout setting into a number.

    ``None`` and the empty string yield ``None``. Otherwise the value is first
    tried with ``parse_float``; if that gives ``None``, the optional
    ``pytimeparse`` package is used to parse it, and ``None`` is returned when
    that package is not installed.
    """
    if timeout is None or timeout == "":
        return None

    seconds = parse_float(timeout)

    if seconds is None:
        # not a plain number, fall back to pytimeparse if it is available
        try:
            import pytimeparse
        except ImportError:
            return None
        seconds = pytimeparse.parse(timeout)

    return seconds
def parse_game(self, response):
    # pylint: disable=line-too-long
    """
    Parse an XML API response with one or more game entries.

    For every ``/items/item`` element this generator yields, in order:
    a request for the next page of rating comments (if more remain), then per
    comment either a collection request or a user item/request followed by a
    rating item, and finally the game item itself unless ``skip_game_item``
    is set in ``response.meta``.

    The lines starting with ``@`` below are Scrapy spider contracts and must
    be kept intact.

    @url https://www.boardgamegeek.com/xmlapi2/thing?id=13,822,36218&stats=1&versions=1&videos=1&ratingcomments=1&page=1&pagesize=100
    @returns items 3 3
    @returns requests 303 303
    @scrapes name alt_name year description \
        designer artist publisher \
        url image_url video_url \
        min_players max_players min_players_rec max_players_rec \
        min_players_best max_players_best \
        min_age min_age_rec min_time max_time \
        game_type category mechanic cooperative compilation family expansion \
        rank add_rank num_votes avg_rating stddev_rating \
        bayes_rating worst_rating best_rating \
        complexity easiest_complexity hardest_complexity \
        language_dependency lowest_language_dependency highest_language_dependency \
        bgg_id scraped_at
    """
    profile_url = response.meta.get("profile_url")
    # one timestamp shared by every item produced from this response
    scraped_at = now()
    for game in response.xpath("/items/item"):
        # each value is read from the XML first, falling back to the request meta
        bgg_id = parse_int(
            game.xpath("@id").extract_first() or response.meta.get("bgg_id"))
        page = parse_int(
            game.xpath("comments/@page").extract_first()
            or response.meta.get("page"))
        total_items = parse_int(
            game.xpath("comments/@totalitems").extract_first()
            or response.meta.get("total_items"))
        # rating comments are only processed when scrape_ratings is enabled
        comments = game.xpath(
            "comments/comment") if self.scrape_ratings else ()
        # more comments remain beyond this page: request the next page,
        # telling that request not to emit the game item again
        if (page is not None and total_items is not None and comments
                and page * self.page_size < total_items):
            # pylint: disable=invalid-unary-operand-type
            yield self._game_request(
                bgg_id,
                page=page + 1,
                priority=-page,
                skip_game_item=True,
                profile_url=profile_url,
            )
        for comment in comments:
            user_name = comment.xpath("@username").extract_first()
            if not user_name:
                self.logger.warning(
                    "no user name found, cannot process rating")
                continue
            user_name = user_name.lower()
            # with scrape_collections enabled only the collection request is
            # yielded; no rating item is built from this comment
            if self.scrape_collections:
                yield self.collection_request(user_name)
                continue
            yield self._user_item_or_request(user_name, scraped_at=scraped_at)
            ldr = RatingLoader(
                item=RatingItem(
                    item_id=f"{user_name}:{bgg_id}",
                    bgg_id=bgg_id,
                    bgg_user_name=user_name,
                    scraped_at=scraped_at,
                ),
                selector=comment,
                response=response,
            )
            ldr.add_xpath("bgg_user_rating", "@rating")
            ldr.add_xpath("comment", "@value")
            yield ldr.load_item()
        # follow-up comment pages must not emit the game item a second time
        if response.meta.get("skip_game_item"):
            continue
        # NOTE(review): several fields below receive more than one add_* call;
        # how they are combined depends on the loader's processors, which are
        # defined elsewhere - presumably call order matters, so keep it.
        ldr = GameLoader(
            item=GameItem(
                bgg_id=bgg_id,
                scraped_at=scraped_at,
                worst_rating=1,
                best_rating=10,
                easiest_complexity=1,
                hardest_complexity=5,
                lowest_language_dependency=1,
                highest_language_dependency=5,
            ),
            selector=game,
            response=response,
        )
        ldr.add_xpath("name", 'name[@type = "primary"]/@value')
        ldr.add_xpath("alt_name", "name/@value")
        ldr.add_xpath("year", "yearpublished/@value")
        ldr.add_xpath("description", "description")
        # _value_id is a helper defined elsewhere, applied to the <link> selectors
        ldr.add_value(
            "designer",
            _value_id(game.xpath('link[@type = "boardgamedesigner"]')))
        ldr.add_value(
            "artist",
            _value_id(game.xpath('link[@type = "boardgameartist"]')))
        ldr.add_value(
            "publisher",
            _value_id(game.xpath('link[@type = "boardgamepublisher"]')))
        # the profile URL from the request meta is added before the generic one
        ldr.add_value("url", profile_url)
        ldr.add_value(
            "url", "https://boardgamegeek.com/boardgame/{}".format(bgg_id))
        # full-size images are added before thumbnails
        images = game.xpath("image/text()").extract()
        ldr.add_value("image_url", (response.urljoin(i) for i in images))
        images = game.xpath("thumbnail/text()").extract()
        ldr.add_value("image_url", (response.urljoin(i) for i in images))
        videos = game.xpath("videos/video/@link").extract()
        ldr.add_value("video_url", (response.urljoin(v) for v in videos))
        # _player_count_votes is defined elsewhere; it returns the four
        # values unpacked here
        (
            min_players_rec,
            max_players_rec,
            min_players_best,
            max_players_best,
        ) = self._player_count_votes(game)
        ldr.add_xpath("min_players", "minplayers/@value")
        ldr.add_xpath("max_players", "maxplayers/@value")
        ldr.add_value("min_players_rec", min_players_rec)
        ldr.add_value("max_players_rec", max_players_rec)
        ldr.add_value("min_players_best", min_players_best)
        ldr.add_value("max_players_best", max_players_best)
        ldr.add_xpath("min_age", "minage/@value")
        ldr.add_xpath("max_age", "maxage/@value")
        # _poll is defined elsewhere; statistics.median_grouped is passed as
        # its aggregation function
        ldr.add_value(
            "min_age_rec",
            self._poll(game,
                       "suggested_playerage",
                       func=statistics.median_grouped),
        )
        # playing times: the other play-time attributes are added as
        # additional candidates after the dedicated min/max attribute
        ldr.add_xpath("min_time", "minplaytime/@value")
        ldr.add_xpath("min_time", "playingtime/@value")
        ldr.add_xpath("max_time", "maxplaytime/@value")
        ldr.add_xpath("max_time", "playingtime/@value")
        ldr.add_xpath("max_time", "minplaytime/@value")
        ldr.add_value(
            "game_type",
            _value_id_rank(
                game.xpath(
                    'statistics/ratings/ranks/rank[@type = "family"]')),
        )
        ldr.add_value(
            "category",
            _value_id(game.xpath('link[@type = "boardgamecategory"]')))
        ldr.add_value(
            "mechanic",
            _value_id(game.xpath('link[@type = "boardgamemechanic"]')))
        # look for <link type="boardgamemechanic" id="2023" value="Co-operative Play" />
        ldr.add_value(
            "cooperative",
            bool(
                game.xpath(
                    'link[@type = "boardgamemechanic" and @id = "2023"]')),
        )
        # True when at least one inbound compilation link is present
        ldr.add_value(
            "compilation",
            bool(
                game.xpath(
                    'link[@type = "boardgamecompilation" and @inbound = "true"]'
                )),
        )
        ldr.add_xpath(
            "compilation_of",
            'link[@type = "boardgamecompilation" and @inbound = "true"]/@id',
        )
        ldr.add_value(
            "family",
            _value_id(game.xpath('link[@type = "boardgamefamily"]')))
        ldr.add_value(
            "expansion",
            _value_id(game.xpath('link[@type = "boardgameexpansion"]')))
        ldr.add_xpath(
            "implementation",
            'link[@type = "boardgameimplementation" and @inbound = "true"]/@id',
        )
        ldr.add_xpath("integration",
                      'link[@type = "boardgameintegration"]/@id')
        ldr.add_xpath(
            "rank",
            'statistics/ratings/ranks/rank[@name = "boardgame"]/@value')
        ldr.add_xpath("num_votes", "statistics/ratings/usersrated/@value")
        ldr.add_xpath("avg_rating", "statistics/ratings/average/@value")
        ldr.add_xpath("stddev_rating", "statistics/ratings/stddev/@value")
        ldr.add_xpath("bayes_rating",
                      "statistics/ratings/bayesaverage/@value")
        ldr.add_xpath("complexity",
                      "statistics/ratings/averageweight/@value")
        ldr.add_value(
            "language_dependency",
            self._poll(
                game,
                "language_dependence",
                attr="level",
                enum=True,
                func=statistics.median_grouped,
            ),
        )
        # one add_rank entry per <rank type="family"> element
        for rank in game.xpath(
                'statistics/ratings/ranks/rank[@type = "family"]'):
            add_rank = {
                "game_type":
                rank.xpath("@name").extract_first(),
                "game_type_id":
                parse_int(rank.xpath("@id").extract_first()),
                "name":
                _remove_rank(rank.xpath("@friendlyname").extract_first()),
                "rank":
                parse_int(rank.xpath("@value").extract_first()),
                "bayes_rating":
                parse_float(rank.xpath("@bayesaverage").extract_first()),
            }
            ldr.add_value("add_rank", add_rank)
        yield ldr.load_item()
def abs_comp(field_1, field_2):
    """
    Return the absolute difference of the two fields parsed as floats.

    If either field cannot be parsed (``parse_float`` gives ``None``),
    ``math.inf`` is returned instead.
    """
    left = parse_float(field_1)
    right = parse_float(field_2)

    if left is None or right is None:
        return math.inf

    return abs(left - right)