Beispiel #1
0
    def __init__(
        self,
        crawler,
        interval,
        project,
        subscription,
        max_messages=100,
        prevent_rescrape_for=None,
        pull_timeout=10,
    ):
        """Set up the Pub/Sub client and schedule the recurring queue pull.

        Args:
            crawler: Forwarded to ``setup_looping_task``.
            interval: Forwarded to ``setup_looping_task``.
            project: Used together with ``subscription`` to build the
                subscription path.
            subscription: Used together with ``project`` to build the
                subscription path.
            max_messages: Stored unchanged as ``self.max_messages``.
            prevent_rescrape_for: A ``timedelta`` is stored as is. Anything
                else is passed through ``parse_float``; a float result is
                treated as a number of seconds and wrapped in a ``timedelta``.
            pull_timeout: Passed through ``parse_float``; any falsy result
                (including zero) is replaced by 10.

        Raises:
            NotConfigured: When ``pubsub_client()`` returns a falsy value.
        """
        self.client = pubsub_client()

        if not self.client:
            LOGGER.error("Google Cloud Pub/Sub Client could not be initialised")
            raise NotConfigured

        # pylint: disable=no-member
        self.subscription_path = self.client.subscription_path(project, subscription)
        self.max_messages = max_messages

        # Normalise the re-scrape window: timedeltas pass through untouched,
        # everything else goes through parse_float first.
        window = prevent_rescrape_for
        if not isinstance(window, timedelta):
            window = parse_float(window)
        if isinstance(window, float):
            window = timedelta(seconds=window)
        self.prevent_rescrape_for = window

        timeout = parse_float(pull_timeout)
        self.pull_timeout = timeout if timeout else 10
        self.last_scraped = {}

        self.setup_looping_task(self._pull_queue, crawler, interval)
Beispiel #2
0
    def __init__(self, tag_file, date=None, seconds=None):
        """Store the tag file location together with the parsed date/seconds.

        Args:
            tag_file: Path of the tag file; it is resolved and its parent
                directory is created if missing.
            date: Passed through ``parse_date`` with ``tzinfo=timezone.utc``.
            seconds: Passed through ``parse_float``.

        Raises:
            NotConfigured: When both the parsed date and the parsed seconds
                are falsy.
        """
        parsed_date = parse_date(date, tzinfo=timezone.utc)
        parsed_seconds = parse_float(seconds)

        # At least one of the two settings has to be usable.
        if not (parsed_date or parsed_seconds):
            raise NotConfigured

        self.tag_file = Path(tag_file).resolve()
        self.tag_file.parent.mkdir(parents=True, exist_ok=True)
        self.date = parsed_date
        self.seconds = parsed_seconds
Beispiel #3
0
def _parse_timeout(timeout):
    if timeout is None or timeout == "":
        return None

    timeout_float = parse_float(timeout)
    if timeout_float is not None:
        return timeout_float

    try:
        import pytimeparse
    except ImportError:
        return None

    return pytimeparse.parse(timeout)
Beispiel #4
0
    def parse_game(self, response):
        # pylint: disable=line-too-long
        """
        Parse an XML "thing" response into requests, rating items and game items.

        For each ``/items/item`` element the generator yields, in this order:
        a request for the next comments page when more comments remain, then
        per comment either a collection request or a user item/request
        followed by a rating item, and finally the game item itself unless the
        response meta carries a truthy ``skip_game_item``.

        @url https://www.boardgamegeek.com/xmlapi2/thing?id=13,822,36218&stats=1&versions=1&videos=1&ratingcomments=1&page=1&pagesize=100
        @returns items 3 3
        @returns requests 303 303
        @scrapes name alt_name year description \
            designer artist publisher \
            url image_url video_url \
            min_players max_players min_players_rec max_players_rec \
            min_players_best max_players_best \
            min_age min_age_rec min_time max_time \
            game_type category mechanic cooperative compilation family expansion \
            rank add_rank num_votes avg_rating stddev_rating \
            bayes_rating worst_rating best_rating \
            complexity easiest_complexity hardest_complexity \
            language_dependency lowest_language_dependency highest_language_dependency \
            bgg_id scraped_at
        """

        source_url = response.meta.get("profile_url")
        timestamp = now()

        for node in response.xpath("/items/item"):
            # Attributes in the XML win; request meta is the fallback.
            bgg_id = parse_int(
                node.xpath("@id").extract_first() or response.meta.get("bgg_id")
            )
            current_page = parse_int(
                node.xpath("comments/@page").extract_first()
                or response.meta.get("page")
            )
            comment_total = parse_int(
                node.xpath("comments/@totalitems").extract_first()
                or response.meta.get("total_items")
            )

            if self.scrape_ratings:
                rating_nodes = node.xpath("comments/comment")
            else:
                rating_nodes = ()

            # Ask for the following page only while this page had comments
            # and the pages seen so far do not cover the reported total.
            more_pages = (
                current_page is not None
                and comment_total is not None
                and rating_nodes
                and current_page * self.page_size < comment_total
            )
            if more_pages:
                # pylint: disable=invalid-unary-operand-type
                yield self._game_request(
                    bgg_id,
                    page=current_page + 1,
                    priority=-current_page,
                    skip_game_item=True,
                    profile_url=source_url,
                )

            for rating_node in rating_nodes:
                raw_user_name = rating_node.xpath("@username").extract_first()

                if not raw_user_name:
                    self.logger.warning("no user name found, cannot process rating")
                    continue

                user_name = raw_user_name.lower()

                if self.scrape_collections:
                    # Only the collection request is emitted for this comment.
                    yield self.collection_request(user_name)
                    continue

                yield self._user_item_or_request(user_name, scraped_at=timestamp)

                rating_loader = RatingLoader(
                    item=RatingItem(
                        item_id=f"{user_name}:{bgg_id}",
                        bgg_id=bgg_id,
                        bgg_user_name=user_name,
                        scraped_at=timestamp,
                    ),
                    selector=rating_node,
                    response=response,
                )
                rating_loader.add_xpath("bgg_user_rating", "@rating")
                rating_loader.add_xpath("comment", "@value")
                yield rating_loader.load_item()

            if response.meta.get("skip_game_item"):
                continue

            game_loader = GameLoader(
                item=GameItem(
                    bgg_id=bgg_id,
                    scraped_at=timestamp,
                    worst_rating=1,
                    best_rating=10,
                    easiest_complexity=1,
                    hardest_complexity=5,
                    lowest_language_dependency=1,
                    highest_language_dependency=5,
                ),
                selector=node,
                response=response,
            )

            # Basic descriptive fields.
            for field, path in (
                ("name", 'name[@type = "primary"]/@value'),
                ("alt_name", "name/@value"),
                ("year", "yearpublished/@value"),
                ("description", "description"),
            ):
                game_loader.add_xpath(field, path)

            # People and companies behind the game.
            for field, path in (
                ("designer", 'link[@type = "boardgamedesigner"]'),
                ("artist", 'link[@type = "boardgameartist"]'),
                ("publisher", 'link[@type = "boardgamepublisher"]'),
            ):
                game_loader.add_value(field, _value_id(node.xpath(path)))

            # Links: the profile URL from meta first, then the canonical one.
            game_loader.add_value("url", source_url)
            game_loader.add_value(
                "url", "https://boardgamegeek.com/boardgame/{}".format(bgg_id)
            )
            image_links = node.xpath("image/text()").extract()
            game_loader.add_value(
                "image_url", (response.urljoin(link) for link in image_links)
            )
            thumbnail_links = node.xpath("thumbnail/text()").extract()
            game_loader.add_value(
                "image_url", (response.urljoin(link) for link in thumbnail_links)
            )
            video_links = node.xpath("videos/video/@link").extract()
            game_loader.add_value(
                "video_url", (response.urljoin(link) for link in video_links)
            )

            # Player counts: stated range plus the poll-derived ranges.
            rec_low, rec_high, best_low, best_high = self._player_count_votes(node)

            game_loader.add_xpath("min_players", "minplayers/@value")
            game_loader.add_xpath("max_players", "maxplayers/@value")
            game_loader.add_value("min_players_rec", rec_low)
            game_loader.add_value("max_players_rec", rec_high)
            game_loader.add_value("min_players_best", best_low)
            game_loader.add_value("max_players_best", best_high)

            # Age and play time.
            game_loader.add_xpath("min_age", "minage/@value")
            game_loader.add_xpath("max_age", "maxage/@value")
            game_loader.add_value(
                "min_age_rec",
                self._poll(
                    node,
                    "suggested_playerage",
                    func=statistics.median_grouped,
                ),
            )
            # Several candidate sources are offered per time field, in this order.
            for field, path in (
                ("min_time", "minplaytime/@value"),
                ("min_time", "playingtime/@value"),
                ("max_time", "maxplaytime/@value"),
                ("max_time", "playingtime/@value"),
                ("max_time", "minplaytime/@value"),
            ):
                game_loader.add_xpath(field, path)

            # Classification.
            family_ranks_xpath = 'statistics/ratings/ranks/rank[@type = "family"]'
            game_loader.add_value(
                "game_type", _value_id_rank(node.xpath(family_ranks_xpath))
            )
            game_loader.add_value(
                "category",
                _value_id(node.xpath('link[@type = "boardgamecategory"]')),
            )
            game_loader.add_value(
                "mechanic",
                _value_id(node.xpath('link[@type = "boardgamemechanic"]')),
            )
            # A game counts as cooperative when it carries
            # <link type="boardgamemechanic" id="2023" value="Co-operative Play" />
            game_loader.add_value(
                "cooperative",
                bool(node.xpath('link[@type = "boardgamemechanic" and @id = "2023"]')),
            )
            game_loader.add_value(
                "compilation",
                bool(
                    node.xpath(
                        'link[@type = "boardgamecompilation" and @inbound = "true"]'
                    )
                ),
            )
            game_loader.add_xpath(
                "compilation_of",
                'link[@type = "boardgamecompilation" and @inbound = "true"]/@id',
            )
            game_loader.add_value(
                "family",
                _value_id(node.xpath('link[@type = "boardgamefamily"]')),
            )
            game_loader.add_value(
                "expansion",
                _value_id(node.xpath('link[@type = "boardgameexpansion"]')),
            )
            game_loader.add_xpath(
                "implementation",
                'link[@type = "boardgameimplementation" and @inbound = "true"]/@id',
            )
            game_loader.add_xpath(
                "integration", 'link[@type = "boardgameintegration"]/@id'
            )

            # Rating statistics.
            for field, path in (
                ("rank", 'statistics/ratings/ranks/rank[@name = "boardgame"]/@value'),
                ("num_votes", "statistics/ratings/usersrated/@value"),
                ("avg_rating", "statistics/ratings/average/@value"),
                ("stddev_rating", "statistics/ratings/stddev/@value"),
                ("bayes_rating", "statistics/ratings/bayesaverage/@value"),
                ("complexity", "statistics/ratings/averageweight/@value"),
            ):
                game_loader.add_xpath(field, path)

            game_loader.add_value(
                "language_dependency",
                self._poll(
                    node,
                    "language_dependence",
                    attr="level",
                    enum=True,
                    func=statistics.median_grouped,
                ),
            )

            # One additional rank entry per "family" rank element.
            for rank_node in node.xpath(family_ranks_xpath):
                game_loader.add_value(
                    "add_rank",
                    {
                        "game_type": rank_node.xpath("@name").extract_first(),
                        "game_type_id": parse_int(
                            rank_node.xpath("@id").extract_first()
                        ),
                        "name": _remove_rank(
                            rank_node.xpath("@friendlyname").extract_first()
                        ),
                        "rank": parse_int(rank_node.xpath("@value").extract_first()),
                        "bayes_rating": parse_float(
                            rank_node.xpath("@bayesaverage").extract_first()
                        ),
                    },
                )

            yield game_loader.load_item()
def abs_comp(field_1, field_2):
    """Return the absolute difference of two values, or infinity if unusable.

    Both arguments are passed through ``parse_float``. If either result is
    ``None`` the function returns ``math.inf``; otherwise it returns the
    absolute value of their difference.
    """
    left = parse_float(field_1)
    right = parse_float(field_2)

    if left is None or right is None:
        return math.inf

    return abs(left - right)