Code Example #1
def parse_version(version):
    """Parse a version string to strip leading "v" etc."""
    version = normalize_space(version)
    if not version:
        return None
    match = VERSION_REGEX.match(version)
    return match.group(1) if match else None
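
Every example on this page depends on a normalize_space helper. For reference, here is a minimal sketch of what it and the VERSION_REGEX above might look like; both are assumptions, not the project's actual definitions:

import re

def normalize_space(text):
    """Hypothetical: collapse runs of whitespace into single spaces and trim."""
    return " ".join(text.split()) if text else ""

# Hypothetical: capture a version number, with an optional leading "v".
VERSION_REGEX = re.compile(r"v?(\d+(?:\.\d+)*)")

With these definitions, parse_version("  v1.2.3 ") returns "1.2.3".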
Code Example #2
    def parse_games(self, response):
        # pylint: disable=line-too-long
        """
        @url http://dbpedia.org/sparql?query=SELECT+DISTINCT+%3Fgame+WHERE+%7B+%3Fgame+a+%3Chttp%3A%2F%2Fdbpedia.org%2Fclass%2Fyago%2FBoardGame100502415%3E+.+%7D&format=text%2Fxml
        @returns items 0 0
        @returns requests 1200
        """

        response.selector.register_namespace(
            "s", "http://www.w3.org/2005/sparql-results#")

        games = response.xpath(
            '//s:binding[@name = "game"]/s:uri/text()').extract()

        self.logger.info("received %d games", len(games))

        query_tmpl = normalize_space("""
            SELECT ?property ?value ?label WHERE {{
                <{game}> ?property ?value .
                OPTIONAL {{ ?value <http://www.w3.org/2000/01/rdf-schema#label> ?label . }}
            }}""")

        for game in games:
            # dbpedia_id = game.split('/')[-1]
            # http://dbpedia.org/resource/{dbpedia_id}
            query = query_tmpl.format(game=game)
            # self.logger.debug(query)
            yield Request(
                self._api_url(query),
                callback=self.parse_game,
                meta={"dbpedia_uri": game},
            )
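
The spider's _api_url method is not shown in this example. A plausible sketch, assuming it appends the query to the SPARQL endpoint as a GET parameter and requests XML results (the parameter names are assumptions):

from urllib.parse import urlencode

def _api_url(self, query):
    # Hypothetical: build a GET URL for the SPARQL endpoint; text/xml
    # results are what the s: namespace XPaths above expect to parse.
    return "{}?{}".format(
        self.sparql_api_url,
        urlencode({"query": query, "format": "text/xml"}),
    )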
Code Example #3
    def _type_requests(self, types, batch_size=10):
        query_tmpl = normalize_space("""
            SELECT DISTINCT ?game WHERE {{
                ?game <http://www.wikidata.org/prop/direct/P31> ?type .
                VALUES ?type {{ {} }}
            }}""")

        if not batch_size:
            query = query_tmpl.format(" ".join(types))
            # self.logger.debug(query)
            yield Request(
                self.sparql_api_url,
                method="POST",
                body=urlencode({"query": query}),
                callback=self.parse_games,
                priority=1,
            )
            return

        for batch in batchify(types, batch_size):
            query = query_tmpl.format(" ".join(batch))
            # self.logger.debug(query)
            yield Request(self._api_url(query),
                          callback=self.parse_games,
                          priority=1)
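
batchify is a project utility for chunking an iterable; a compatible sketch (an assumption, not the project's actual code):

from itertools import islice

def batchify(iterable, batch_size):
    """Hypothetical: yield tuples of up to batch_size consecutive items."""
    iterator = iter(iterable)
    batch = tuple(islice(iterator, batch_size))
    while batch:
        yield batch
        batch = tuple(islice(iterator, batch_size))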
Code Example #4
def model_updated_at(file_path=settings.MODEL_UPDATED_FILE):
    """latest model update"""
    try:
        with open(file_path) as file_obj:
            updated_at = file_obj.read()
        updated_at = normalize_space(updated_at)
        return parse_date(updated_at, tzinfo=timezone.utc)
    except Exception:
        pass
    return None
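
The broad except Exception means a missing, unreadable, or malformed timestamp file degrades silently to None instead of raising. A usage sketch (the file contents shown are hypothetical):

# If MODEL_UPDATED_FILE contains "2020-04-01T12:00:00", this returns
# a timezone-aware datetime; on any failure it returns None.
updated_at = model_updated_at()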
Code Example #5
def _parse_int(element, xpath, default=None, lenient=False):
    if not element or not xpath:
        return default

    string = normalize_space(element.xpath(xpath).extract_first())

    if not string:
        return default

    result = parse_int(string)

    if result is None and lenient:
        match = DIGITS_REGEX.match(string)
        result = parse_int(match.group(1)) if match else None

    return result if result is not None else default
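
parse_int and DIGITS_REGEX come from the project's utilities. Plausible sketches that fit how they are used here (both are assumptions):

import re

def parse_int(string):
    """Hypothetical: int(string) that returns None instead of raising."""
    try:
        return int(string)
    except (TypeError, ValueError):
        return None

# Hypothetical: capture the first run of digits, e.g. "approx. 42 users" -> "42".
DIGITS_REGEX = re.compile(r"\D*(\d+)")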
Code Example #6
def _parse_int(element, xpath=None, css=None, default=None, lenient=False):
    if not element or (not xpath and not css):
        return default

    selected = element.xpath(xpath) if xpath else element.css(css)
    string = normalize_space(selected.extract_first())

    if not string:
        return default

    result = parse_int(string)

    if result is None and lenient:
        match = DIGITS_REGEX.match(string)
        result = parse_int(match.group(1)) if match else None

    return result if result is not None else default
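
This variant extends Code Example #5 to accept either an XPath or a CSS selector. A usage sketch against a Scrapy response (the selectors are hypothetical):

players = _parse_int(response, xpath="//span[@id='players']/text()")
rating = _parse_int(response, css=".rating::text", default=0, lenient=True)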
Code Example #7
    def start_requests(self):
        """ generate start requests """

        types = getattr(self, "game_types", None)

        if types:
            yield from self._type_requests(
                "<http://www.wikidata.org/entity/{}>".format(t) for t in types)
            return

        query = normalize_space("""
            SELECT DISTINCT ?type WHERE {
                ?game <http://www.wikidata.org/prop/direct/P2339> ?bgg ;
                      <http://www.wikidata.org/prop/direct/P31> ?type .
            }""")
        # self.logger.debug(query)
        yield Request(self._api_url(query), callback=self.parse, priority=2)
Code Example #8
File: bgg.py Project: prabhjotSL/board-game-scraper
def _parse_player_count(poll):
    for result in poll.xpath("results"):
        numplayers = normalize_space(result.xpath("@numplayers").extract_first())
        players = parse_int(numplayers)

        if not players and numplayers.endswith("+"):
            players = parse_int(numplayers[:-1]) or -1
            players += 1

        if not players:
            continue

        votes_best = _parse_int(result, 'result[@value = "Best"]/@numvotes', 0)
        votes_rec = _parse_int(result, 'result[@value = "Recommended"]/@numvotes', 0)
        votes_not = _parse_int(
            result, 'result[@value = "Not Recommended"]/@numvotes', 0
        )

        yield players, votes_best, votes_rec, votes_not
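
Note how open-ended counts are handled: for a value such as "4+", parse_int fails on the raw string, so the code parses the "4" and adds one, yielding 5; a bare "+" works out to -1 + 1 = 0 and is skipped by the `if not players` guard.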
Code Example #9
    def start_requests(self):
        """ generate start requests """

        dbp_types = getattr(self, "game_types", None) or ()
        wd_types = getattr(WikidataSpider, "game_types", None) or ()
        types = tuple("<{}>".format(t) for t in dbp_types) + tuple(
            "<http://www.wikidata.org/entity/{}>".format(t) for t in wd_types)

        if types:
            yield from self._type_requests(types)
            return

        query = normalize_space("""
            SELECT DISTINCT ?type WHERE {
                ?game <http://dbpedia.org/property/bggid> ?bgg;
                      a ?type .
            }""")
        # self.logger.debug(query)
        yield Request(self._api_url(query), callback=self.parse, priority=2)
Code Example #10
def date_from_file(
    path: Union[bytes, str, os.PathLike, None],
    tzinfo: Optional[timezone] = None,
    format_str: Optional[str] = None,
) -> Optional[datetime]:
    """Parse a date from a file."""

    if not path:
        return None

    path = Path(path).resolve()
    LOGGER.info("Reading date from path <%s>", path)

    try:
        with path.open() as file_obj:
            date = normalize_space(next(file_obj, None))
    except Exception:
        date = None

    return parse_date(date=date, tzinfo=tzinfo, format_str=format_str)
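
parse_date (also used in Code Examples #4 and #13) is another project utility. A compatible sketch, assuming python-dateutil is available for free-form parsing (an assumption):

from datetime import datetime

import dateutil.parser

def parse_date(date, tzinfo=None, format_str=None):
    """Hypothetical: parse a date string, returning None on failure."""
    try:
        parsed = (datetime.strptime(date, format_str) if format_str
                  else dateutil.parser.parse(date))
    except (TypeError, ValueError):
        return None
    # Only attach the given timezone if the parsed value is naive.
    if tzinfo is not None and parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=tzinfo)
    return parsed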
Code Example #11
def _process_messages(messages, output, header=False, encoding="utf-8"):
    writer = csv.writer(output)

    if header:
        writer.writerow(("date", "user"))

    for message in messages:
        try:
            date = message.message.publish_time.replace(
                nanosecond=0).isoformat()
            user = normalize_space(
                message.message.data.decode(encoding)).lower()
            if date and user:
                writer.writerow((date, user))
                yield message.ack_id
            else:
                LOGGER.error("there was a problem processing message %r",
                             message)

        except Exception:
            LOGGER.exception("unable to process message %r", message)
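
A usage sketch, assuming the messages come from a synchronous Google Pub/Sub pull with the google-cloud-pubsub v2 client (the project and subscription names are hypothetical):

import sys

from google.cloud import pubsub_v1

subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path("my-project", "my-subscription")

response = subscriber.pull(subscription=subscription_path, max_messages=100)
ack_ids = list(_process_messages(response.received_messages, sys.stdout))
if ack_ids:
    subscriber.acknowledge(subscription=subscription_path, ack_ids=ack_ids)

Because _process_messages is a generator, consuming it writes the CSV rows and collects the ack IDs of successfully processed messages in a single pass.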
Code Example #12
def _find_states(
    path_dir,
    state_file=".state",
    delete="finished",
    delete_non_state=False,
):
    path_dir = Path(path_dir).resolve()
    delete = frozenset(arg_to_iter(delete))
    result = {}

    if not path_dir.is_dir():
        LOGGER.warning("<%s> is not an existing dir", path_dir)
        return result

    LOGGER.info("Finding jobs and their states in <%s>", path_dir)

    for sub_dir in path_dir.iterdir():
        state_path = sub_dir / state_file

        if not sub_dir.is_dir() or not state_path.is_file():
            continue

        try:
            with state_path.open() as file_obj:
                state = normalize_space(next(file_obj, None))
        except Exception:
            LOGGER.exception("Unable to read a state from <%s>", state_path)
            state = None

        if not state:
            LOGGER.warning("No valid state file in <%s>", sub_dir)

        if state in delete or (delete_non_state and not state):
            LOGGER.info("Deleting <%s> with state <%s>", sub_dir, state)
            rmtree(sub_dir, ignore_errors=True)
        elif state:
            result[sub_dir.name] = state

    return result
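
A usage sketch: each job directory under the given path is expected to contain a one-line state file; "finished" jobs are deleted and the remaining states are returned (the path and values shown are hypothetical):

states = _find_states("jobs/bgg", state_file=".state", delete="finished")
# e.g. {"2020-04-01T120000": "running", "2020-03-30T120000": "shut_down"}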
Code Example #13
def main():
    """Command line entry point."""

    settings = get_project_settings()
    configure_logging(settings)

    args = _parse_args()
    LOGGER.info(args)

    base_dir = Path(settings["BASE_DIR"]).resolve()
    cache_dir = base_dir / ".scrapy" / "httpcache"
    feeds_dir = Path(args.feeds_dir) if args.feeds_dir else base_dir / "feeds"
    feeds_dir = feeds_dir.resolve()
    feeds_dir_scraper = (
        feeds_dir / args.feeds_subdir if args.feeds_subdir else feeds_dir / args.spider
    )
    file_tag = normalize_space(args.file_tag)
    out_file = feeds_dir_scraper / "%(class)s" / f"%(time)s{file_tag}.jl"

    LOGGER.info("Output file will be <%s>", out_file)

    from_settings = job_dir_from_settings(settings)
    job_dir = (
        Path(args.job_dir)
        if args.job_dir
        else Path(from_settings)
        if from_settings
        else base_dir / "jobs" / args.spider
    )
    job_dir = job_dir.resolve()

    cache_dir.mkdir(parents=True, exist_ok=True)
    feeds_dir_scraper.mkdir(parents=True, exist_ok=True)
    job_dir.mkdir(parents=True, exist_ok=True)

    dont_run_before_file = job_dir / ".dont_run_before"
    dont_run_before = parse_date(
        args.dont_run_before, tzinfo=timezone.utc
    ) or date_from_file(dont_run_before_file, tzinfo=timezone.utc)

    if dont_run_before:
        LOGGER.info("Don't run before %s", dont_run_before.isoformat())
        sleep_seconds = dont_run_before.timestamp() - now().timestamp()
        if sleep_seconds > 0:
            LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds)
            sleep(sleep_seconds)

    states = _find_states(
        job_dir, state_file=settings.get("STATE_TAG_FILE") or ".state"
    )

    running = sorted(sub_dir for sub_dir, state in states.items() if state == "running")

    if len(running) > 1:
        LOGGER.warning(
            "Found %d running jobs %s, please check and fix!", len(running), running
        )
        return

    if running:
        LOGGER.info("Found a running job <%s>, skipping...", running[0])
        return

    resumable = sorted(
        sub_dir for sub_dir, state in states.items() if state in RESUMABLE_STATES
    )

    if len(resumable) > 1:
        LOGGER.warning(
            "Found %d resumable jobs %s, please check and fix!",
            len(resumable),
            resumable,
        )
        return

    if resumable:
        LOGGER.info("Resuming previous job <%s>", resumable[0])

    job_tag = resumable[0] if resumable else now().strftime(DATE_FORMAT)
    curr_job = job_dir / job_tag

    command = [
        "scrapy",
        "crawl",
        args.spider,
        "--output",
        str(out_file),
        "--set",
        f"JOBDIR={curr_job}",
        "--set",
        f"DONT_RUN_BEFORE_FILE={dont_run_before_file}",
    ]

    try:
        execute(argv=command)
    finally:
        garbage_collect()
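
The command list mirrors the scrapy crawl CLI, and execute is presumably scrapy.cmdline.execute, which exits the process via sys.exit, hence the cleanup in a finally block. A minimal sketch of what garbage_collect might be (an assumption, not the project's actual helper):

import gc

def garbage_collect():
    """Hypothetical: force a full garbage-collection pass before exit."""
    gc.collect()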