def parse_version(version):
    """Parse a version string to strip leading "v" etc."""
    version = normalize_space(version)
    if not version:
        return None
    match = VERSION_REGEX.match(version)
    return match.group(1) if match else None
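# `VERSION_REGEX` and `normalize_space` are assumed helpers that are not shown in this
# snippet. Below is a minimal sketch of a pattern that would make `parse_version` behave
# as documented (strip a leading "v" and return the bare version number); `normalize_space`
# is assumed to collapse whitespace and turn None into an empty string.
import re

VERSION_REGEX = re.compile(r"v?(\d+(?:\.\d+)*)", re.IGNORECASE)  # hypothetical pattern

# Under that assumption:
#     parse_version(" v1.2.3 ")  ->  "1.2.3"
#     parse_version("")          ->  None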
def parse_games(self, response):  # pylint: disable=line-too-long
    """
    @url http://dbpedia.org/sparql?query=SELECT+DISTINCT+%3Fgame+WHERE+%7B+%3Fgame+a+%3Chttp%3A%2F%2Fdbpedia.org%2Fclass%2Fyago%2FBoardGame100502415%3E+.+%7D&format=text%2Fxml
    @returns items 0 0
    @returns requests 1200
    """

    response.selector.register_namespace(
        "s", "http://www.w3.org/2005/sparql-results#")
    games = response.xpath(
        '//s:binding[@name = "game"]/s:uri/text()').extract()
    self.logger.info("received %d games", len(games))

    query_tmpl = normalize_space("""
        SELECT ?property ?value ?label WHERE {{
            <{game}> ?property ?value .
            OPTIONAL {{ ?value <http://www.w3.org/2000/01/rdf-schema#label> ?label . }}
        }}""")

    for game in games:
        # dbpedia_id = game.split('/')[-1]
        # http://dbpedia.org/resource/{dbpedia_id}
        query = query_tmpl.format(game=game)
        # self.logger.debug(query)
        yield Request(
            self._api_url(query),
            callback=self.parse_game,
            meta={"dbpedia_uri": game},
        )
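# `self._api_url` is called above but not shown in this snippet. A plausible sketch of
# such a spider method, assuming the spider has a `sparql_api_url` attribute and that
# XML results are requested (which matches how the responses are parsed above):
from urllib.parse import urlencode


def _api_url(self, query):
    """ build a GET URL for the SPARQL endpoint (hypothetical sketch of a spider method) """
    args = {"format": "text/xml", "query": query}
    return "{}?{}".format(self.sparql_api_url, urlencode(args))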
def _type_requests(self, types, batch_size=10):
    """ yield SPARQL requests for the given game types, in batches """

    query_tmpl = normalize_space("""
        SELECT DISTINCT ?game WHERE {{
            ?game <http://www.wikidata.org/prop/direct/P31> ?type .
            VALUES ?type {{ {} }}
        }}""")

    if not batch_size:
        # no batching: send a single (potentially large) query via POST
        query = query_tmpl.format(" ".join(types))
        # self.logger.debug(query)
        yield Request(
            self.sparql_api_url,
            method="POST",
            body=urlencode({"query": query}),
            callback=self.parse_games,
            priority=1,
        )
        return

    for batch in batchify(types, batch_size):
        query = query_tmpl.format(" ".join(batch))
        # self.logger.debug(query)
        yield Request(self._api_url(query), callback=self.parse_games, priority=1)
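# `batchify` above is an assumed helper that is not shown in this snippet: it should
# split the types into chunks of at most `batch_size` items. A minimal sketch that
# would work with the generator above:
from itertools import islice


def batchify(iterable, size):
    """ yield tuples of at most `size` items from `iterable` (sketch) """
    iterator = iter(iterable)
    batch = tuple(islice(iterator, size))
    while batch:
        yield batch
        batch = tuple(islice(iterator, size))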
def model_updated_at(file_path=settings.MODEL_UPDATED_FILE):
    """latest model update"""
    try:
        with open(file_path) as file_obj:
            updated_at = file_obj.read()
        updated_at = normalize_space(updated_at)
        return parse_date(updated_at, tzinfo=timezone.utc)
    except Exception:
        pass
    return None
def _parse_int(element, xpath, default=None, lenient=False):
    """ parse an integer from the given XPath; optionally fall back to leading digits """

    if not element or not xpath:
        return default

    string = normalize_space(element.xpath(xpath).extract_first())

    if not string:
        return default

    result = parse_int(string)

    if result is None and lenient:
        match = DIGITS_REGEX.match(string)
        result = parse_int(match.group(1)) if match else None

    return result if result is not None else default
def _parse_int(element, xpath=None, css=None, default=None, lenient=False):
    """ parse an integer from the given XPath or CSS selector; optionally fall back to leading digits """

    if not element or (not xpath and not css):
        return default

    selected = element.xpath(xpath) if xpath else element.css(css)
    string = normalize_space(selected.extract_first())

    if not string:
        return default

    result = parse_int(string)

    if result is None and lenient:
        match = DIGITS_REGEX.match(string)
        result = parse_int(match.group(1)) if match else None

    return result if result is not None else default
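# Usage sketch for `_parse_int`, assuming a parsel/Scrapy selector and that
# `DIGITS_REGEX` skips leading non-digits and captures the first run of digits,
# e.g. re.compile(r"\D*(\d+)") -- an assumption, the real pattern is not shown here:
from parsel import Selector

_RANK = Selector(text="<item><rank>Board Game Rank: 42</rank></item>", type="xml")

_parse_int(_RANK, xpath="//rank/text()", default=-1)      # -1: not a plain integer
_parse_int(_RANK, xpath="//rank/text()", lenient=True)    # 42 via the assumed DIGITS_REGEX
_parse_int(_RANK, css="item > missing::text", default=0)  # 0: nothing selected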
def start_requests(self):
    """ generate start requests """

    types = getattr(self, "game_types", None)

    if types:
        yield from self._type_requests(
            "<http://www.wikidata.org/entity/{}>".format(t) for t in types)
        return

    query = normalize_space("""
        SELECT DISTINCT ?type WHERE {
            ?game <http://www.wikidata.org/prop/direct/P2339> ?bgg ;
                  <http://www.wikidata.org/prop/direct/P31> ?type .
        }""")
    # self.logger.debug(query)

    yield Request(self._api_url(query), callback=self.parse, priority=2)
def _parse_player_count(poll):
    """ yield (players, best, recommended, not recommended) votes from a player count poll """

    for result in poll.xpath("results"):
        numplayers = normalize_space(result.xpath("@numplayers").extract_first())
        players = parse_int(numplayers)

        # "4+"-style entries mean "more than N players"; count them as N + 1
        if not players and numplayers.endswith("+"):
            players = parse_int(numplayers[:-1]) or -1
            players += 1

        if not players:
            continue

        votes_best = _parse_int(result, 'result[@value = "Best"]/@numvotes', 0)
        votes_rec = _parse_int(result, 'result[@value = "Recommended"]/@numvotes', 0)
        votes_not = _parse_int(
            result, 'result[@value = "Not Recommended"]/@numvotes', 0
        )

        yield players, votes_best, votes_rec, votes_not
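# A hedged end-to-end sketch, assuming `poll` is a parsel/Scrapy selector over a
# BoardGameGeek-style "suggested_numplayers" poll:
from parsel import Selector

_POLL_XML = """
<poll name="suggested_numplayers">
    <results numplayers="2">
        <result value="Best" numvotes="10"/>
        <result value="Recommended" numvotes="5"/>
        <result value="Not Recommended" numvotes="1"/>
    </results>
    <results numplayers="4+">
        <result value="Best" numvotes="2"/>
        <result value="Recommended" numvotes="7"/>
        <result value="Not Recommended" numvotes="3"/>
    </results>
</poll>
"""

_POLL = Selector(text=_POLL_XML, type="xml").xpath("//poll")[0]
# "4+" carries no exact count, so it is parsed as 4 + 1 = 5 players:
#     list(_parse_player_count(_POLL)) -> [(2, 10, 5, 1), (5, 2, 7, 3)]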
def start_requests(self):
    """ generate start requests """

    dbp_types = getattr(self, "game_types", None) or ()
    wd_types = getattr(WikidataSpider, "game_types", None) or ()
    types = tuple("<{}>".format(t) for t in dbp_types) + tuple(
        "<http://www.wikidata.org/entity/{}>".format(t) for t in wd_types)

    if types:
        yield from self._type_requests(types)
        return

    query = normalize_space("""
        SELECT DISTINCT ?type WHERE {
            ?game <http://dbpedia.org/property/bggid> ?bgg;
                  a ?type .
        }""")
    # self.logger.debug(query)

    yield Request(self._api_url(query), callback=self.parse, priority=2)
def date_from_file(
    path: Union[bytes, str, os.PathLike, None],
    tzinfo: Optional[timezone] = None,
    format_str: Optional[str] = None,
) -> Optional[datetime]:
    """Parse a date from a file."""

    if not path:
        return None

    path = Path(path).resolve()
    LOGGER.info("Reading date from path <%s>", path)

    try:
        with path.open() as file_obj:
            date = normalize_space(next(file_obj, None))
    except Exception:
        date = None

    return parse_date(date=date, tzinfo=tzinfo, format_str=format_str)
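# Usage sketch with a hypothetical file: assuming `parse_date` understands ISO-8601
# strings, a one-line timestamp file resolves to an aware datetime.
from tempfile import NamedTemporaryFile

with NamedTemporaryFile("w", suffix=".txt", delete=False) as _temp:
    _temp.write("2021-01-31T12:34:56\n")

date_from_file(_temp.name, tzinfo=timezone.utc)
# -> 2021-01-31 12:34:56+00:00 under that assumption; a missing, empty or
#    unparseable file yields None instead.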
def _process_messages(messages, output, header=False, encoding="utf-8"):
    """ write one (date, user) CSV row per message and yield the ack IDs of processed messages """

    writer = csv.writer(output)

    if header:
        writer.writerow(("date", "user"))

    for message in messages:
        try:
            date = message.message.publish_time.replace(
                nanosecond=0).isoformat()
            user = normalize_space(
                message.message.data.decode(encoding)).lower()

            if date and user:
                writer.writerow((date, user))
                yield message.ack_id
            else:
                LOGGER.error("there was a problem processing message %r", message)

        except Exception:
            LOGGER.exception("unable to process message %r", message)
def _find_states(
    path_dir,
    state_file=".state",
    delete="finished",
    delete_non_state=False,
):
    """ find job sub-directories and their states; delete those whose state is marked for deletion """

    path_dir = Path(path_dir).resolve()
    delete = frozenset(arg_to_iter(delete))
    result = {}

    if not path_dir.is_dir():
        LOGGER.warning("<%s> is not an existing dir", path_dir)
        return result

    LOGGER.info("Finding jobs and their states in <%s>", path_dir)

    for sub_dir in path_dir.iterdir():
        state_path = sub_dir / state_file

        if not sub_dir.is_dir() or not state_path.is_file():
            continue

        try:
            with state_path.open() as file_obj:
                state = normalize_space(next(file_obj, None))
        except Exception:
            LOGGER.exception("Unable to read a state from <%s>", state_path)
            state = None

        if not state:
            LOGGER.warning("No valid state file in <%s>", sub_dir)

        if state in delete or (delete_non_state and not state):
            LOGGER.info("Deleting <%s> with state <%s>", sub_dir, state)
            rmtree(sub_dir, ignore_errors=True)
        elif state:
            result[sub_dir.name] = state

    return result
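# A hedged sketch of the expected layout: every job lives in its own sub-directory of
# `path_dir` and records its state in a one-line `.state` file, e.g.
#
#     jobs/bgg/
#     |-- 2021-01-30T12-00-00/.state   -> "finished"
#     `-- 2021-01-31T12-00-00/.state   -> "running"
#
# _find_states("jobs/bgg") would then delete the "finished" job directory and return
# {"2021-01-31T12-00-00": "running"} (directory names and states are illustrative).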
def main():
    """Command line entry point."""
    settings = get_project_settings()
    configure_logging(settings)

    args = _parse_args()
    LOGGER.info(args)

    base_dir = Path(settings["BASE_DIR"]).resolve()
    cache_dir = base_dir / ".scrapy" / "httpcache"
    feeds_dir = Path(args.feeds_dir) if args.feeds_dir else base_dir / "feeds"
    feeds_dir = feeds_dir.resolve()
    feeds_dir_scraper = (
        feeds_dir / args.feeds_subdir if args.feeds_subdir else feeds_dir / args.spider
    )
    file_tag = normalize_space(args.file_tag)
    out_file = feeds_dir_scraper / "%(class)s" / f"%(time)s{file_tag}.jl"

    LOGGER.info("Output file will be <%s>", out_file)

    from_settings = job_dir_from_settings(settings)
    job_dir = (
        Path(args.job_dir)
        if args.job_dir
        else Path(from_settings)
        if from_settings
        else base_dir / "jobs" / args.spider
    )
    job_dir = job_dir.resolve()

    cache_dir.mkdir(parents=True, exist_ok=True)
    feeds_dir_scraper.mkdir(parents=True, exist_ok=True)
    job_dir.mkdir(parents=True, exist_ok=True)

    dont_run_before_file = job_dir / ".dont_run_before"
    dont_run_before = parse_date(
        args.dont_run_before, tzinfo=timezone.utc
    ) or date_from_file(dont_run_before_file, tzinfo=timezone.utc)

    if dont_run_before:
        LOGGER.info("Don't run before %s", dont_run_before.isoformat())
        sleep_seconds = dont_run_before.timestamp() - now().timestamp()
        if sleep_seconds > 0:
            LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds)
            sleep(sleep_seconds)

    states = _find_states(
        job_dir, state_file=settings.get("STATE_TAG_FILE") or ".state"
    )

    running = sorted(sub_dir for sub_dir, state in states.items() if state == "running")

    if len(running) > 1:
        LOGGER.warning(
            "Found %d running jobs %s, please check and fix!", len(running), running
        )
        return

    if running:
        LOGGER.info("Found a running job <%s>, skipping...", running[0])
        return

    resumable = sorted(
        sub_dir for sub_dir, state in states.items() if state in RESUMABLE_STATES
    )

    if len(resumable) > 1:
        LOGGER.warning(
            "Found %d resumable jobs %s, please check and fix!",
            len(resumable),
            resumable,
        )
        return

    if resumable:
        LOGGER.info("Resuming previous job <%s>", resumable[0])

    job_tag = resumable[0] if resumable else now().strftime(DATE_FORMAT)
    curr_job = job_dir / job_tag

    command = [
        "scrapy",
        "crawl",
        args.spider,
        "--output",
        str(out_file),
        "--set",
        f"JOBDIR={curr_job}",
        "--set",
        f"DONT_RUN_BEFORE_FILE={dont_run_before_file}",
    ]

    try:
        execute(argv=command)
    finally:
        garbage_collect()