def _parse_game(game):
    for field in DEDUPE_FIELDS:
        game.setdefault(field["field"], None)
        if field["type"] == "Set":
            game[field["field"]] = tuple(arg_to_iter(game[field["field"]])) or None

    game["names"] = tuple(
        clear_list(
            chain(arg_to_iter(game.get("name")), arg_to_iter(game.get("alt_name")))
        )
    )

    for field in VALUE_ID_FIELDS:
        game[field] = tuple(
            clear_list(map(_parse_value_id, arg_to_iter(game.get(field))))
        )

    return game
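# Hedged sketch: the snippets in this section lean on a few small utility
# helpers (arg_to_iter, clear_list, parse_int) that are imported from elsewhere
# and not shown here. The implementations below are plausible stand-ins for
# illustration only; the real utility module may differ in details.
from typing import Any, Iterable, Optional


def arg_to_iter(arg: Any) -> Iterable:
    """Wrap a scalar (or None) into an iterable; pass other iterables through."""
    if arg is None:
        return ()
    if isinstance(arg, (str, bytes, dict)):
        return (arg,)
    return arg if hasattr(arg, "__iter__") else (arg,)


def clear_list(items: Any) -> list:
    """Drop falsy values and duplicates while preserving the original order."""
    return list(dict.fromkeys(item for item in arg_to_iter(items) if item))


def parse_int(value: Any) -> Optional[int]:
    """Best-effort conversion to int; return None when conversion fails."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None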
def parse(self, response):
    """
    @url https://boardgamegeek.com/browse/boardgame/
    @returns items 0 0
    @returns requests 11
    """
    next_page = response.xpath('//a[@title = "next page"]/@href').extract_first()
    if next_page:
        yield Request(
            response.urljoin(next_page),
            callback=self.parse,
            priority=1,
            meta={"max_retry_times": 10},
        )

    urls = response.xpath("//@href").extract()

    bgg_ids = filter(None, map(extract_bgg_id, map(response.urljoin, urls)))
    yield from self._game_requests(*bgg_ids)

    user_names = filter(None, map(extract_bgg_user_name, urls))
    scraped_at = now()
    for user_name in clear_list(user_names):
        yield (
            self.collection_request(user_name)
            if self.scrape_collections
            else self._user_item_or_request(user_name, scraped_at=scraped_at)
        )
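# Hedged sketch of extract_bgg_id() as used by the spider above: pull the
# numeric ID out of a BoardGameGeek game URL such as /boardgame/13/catan.
# This is an assumed implementation for illustration only; the real helper may
# also accept pre-parsed URLs and recognise more URL shapes.
import re
from typing import Optional
from urllib.parse import urlparse

BGG_ID_REGEX = re.compile(r"^/(boardgame|boardgameexpansion)/(\d+)")


def extract_bgg_id(url: Optional[str]) -> Optional[int]:
    """Return the BGG ID encoded in a game URL, or None if there is none."""
    if not url:
        return None
    match = BGG_ID_REGEX.match(urlparse(url).path)
    return int(match.group(2)) if match else None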
def process_item(self, item, spider):
    """Resolve resource image URLs to actual file locations."""
    for field in self.fields:
        if item.get(field):
            item[field] = clear_list(map(self._parse_url, arg_to_iter(item[field])))
    return item
def dates(self, request):
    """Find all available dates with rankings."""
    query_set = self.get_queryset().order_by("ranking_type", "date")
    ranking_types = clear_list(_extract_params(request, "ranking_type"))
    if ranking_types:
        query_set = query_set.filter(ranking_type__in=ranking_types)
    return Response(query_set.values("ranking_type", "date").distinct())
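# Hedged sketch of the _extract_params() helper used by the view actions in
# this section: read a (possibly repeated and/or comma-separated) query
# parameter from a Django REST Framework request. Assumed implementation, shown
# only to make the filtering logic above concrete.
def _extract_params(request, key):
    """Yield every value supplied for <key>, splitting comma-separated lists."""
    for value in request.query_params.getlist(key):
        yield from value.split(",")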
def extract_ids(*urls: Optional[str]) -> Dict[str, List[Union[int, str]]]:
    """Extract all possible IDs from all the URLs."""
    urls = tuple(map(urlparse, urls))
    return {
        "bgg_id": clear_list(map(extract_bgg_id, urls)),
        "freebase_id": clear_list(map(extract_freebase_id, urls)),
        "wikidata_id": clear_list(map(extract_wikidata_id, urls)),
        "wikipedia_id": clear_list(map(extract_wikipedia_id, urls)),
        "dbpedia_id": clear_list(map(extract_dbpedia_id, urls)),
        "luding_id": clear_list(map(extract_luding_id, urls)),
        "spielen_id": clear_list(map(extract_spielen_id, urls)),
        "bga_id": clear_list(map(extract_bga_id, urls)),
    }
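# Hedged usage sketch for extract_ids(); the URLs and the concrete ID values
# are assumptions for illustration, and unmatched namespaces come back empty.
ids = extract_ids(
    "https://boardgamegeek.com/boardgame/13/catan",
    "https://www.wikidata.org/wiki/Q17245",
)
# Expected shape: one list per ID namespace, e.g.
# {"bgg_id": [13], "wikidata_id": ["Q17245"], "freebase_id": [], ...}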
def _process_df(data_frame, columns=None, required_columns=None, target_column=None):
    if data_frame is None or data_frame.empty:
        LOGGER.error("DataFrame is empty")
        return None

    columns = clear_list(arg_to_iter(columns))
    required_columns = clear_list(arg_to_iter(required_columns)) or columns
    columns = clear_list(columns + required_columns)

    if not columns:
        LOGGER.error("No columns given")
        return None

    missing_columns = [
        column for column in required_columns if column not in data_frame
    ]
    if missing_columns:
        LOGGER.error(
            "DataFrame does not contain the expected columns %s", missing_columns
        )
        return None

    for column in columns:
        if column not in data_frame:
            data_frame[column] = None

    target_column = target_column or columns[0]

    return (
        data_frame[columns][data_frame[target_column].notna()]
        .sort_values(target_column)
        .rename(columns={"bayes_rating": "score"})
        .astype({"rank": int, "bgg_id": int})
    )
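# Hedged usage sketch: _process_df() appears tailored to BGG ranking frames.
# The column names and values below are assumptions for illustration only.
import pandas as pd

rankings = pd.DataFrame(
    {
        "rank": [2, 1, None],
        "bgg_id": [13, 174430, 822],
        "bayes_rating": [7.1, 8.4, None],
    }
)
processed = _process_df(
    rankings,
    columns=["rank", "bgg_id", "bayes_rating"],
    required_columns=["rank", "bgg_id"],
    target_column="rank",
)
# Rows without a rank are dropped, the remainder is sorted by rank,
# and "bayes_rating" is renamed to "score".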
def _extract_labels(self, response, value):
    json_obj = parse_json(response.text) if hasattr(response, "text") else {}
    labels = take_first(jmespath.search(f"entities.{value}.labels", json_obj)) or {}
    labels = labels.values()
    labels = sorted(
        labels,
        key=lambda label: self.lang_priorities.get(label.get("language"), math.inf),
    )
    labels = clear_list(label.get("value") for label in labels)
    self.labels[value] = labels
    self.logger.debug("resolved labels for %s: %s", value, labels)
    return labels
def _game_requests(self, *bgg_ids, batch_size=10, page=1, priority=0, **kwargs):
    bgg_ids = clear_list(map(parse_int, bgg_ids))
    if not bgg_ids:
        return

    bgg_ids = (
        (bgg_id for bgg_id in bgg_ids if bgg_id not in self._ids_seen)
        if page == 1
        else bgg_ids
    )

    for batch in batchify(bgg_ids, batch_size):
        batch = tuple(batch)
        ids = ",".join(map(str, batch))

        url = (
            self._api_url(
                action="thing",
                id=ids,
                stats=1,
                videos=1,
                versions=int(self.scrape_ratings),
                ratingcomments=int(self.scrape_ratings),
                page=1,
            )
            if page == 1
            else self._api_url(
                action="thing", id=ids, versions=1, ratingcomments=1, page=page
            )
        )

        request = Request(url, callback=self.parse_game, priority=priority)
        if len(batch) == 1:
            request.meta["bgg_id"] = batch[0]
        request.meta["page"] = page
        request.meta.update(kwargs)
        yield request

        if page == 1:
            self._ids_seen.update(batch)
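# Hedged sketch of the batchify() helper used above: yield chunks of at most
# batch_size items from an iterable. Assumed implementation, not necessarily
# the one the spider actually imports.
from itertools import islice


def batchify(iterable, batch_size):
    """Yield tuples of up to batch_size consecutive items."""
    iterator = iter(iterable)
    while True:
        batch = tuple(islice(iterator, batch_size))
        if not batch:
            return
        yield batch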
def rankings(self, request, pk=None):
    """Find historical rankings of a game."""
    filters = {
        "game": pk,
        "ranking_type__in": clear_list(_extract_params(request, "ranking_type")),
        "date__gte": parse_date(
            request.query_params.get("date__gte"), tzinfo=timezone.utc
        ),
        "date__lte": parse_date(
            request.query_params.get("date__lte"), tzinfo=timezone.utc
        ),
    }
    filters = {k: v for k, v in filters.items() if v}
    queryset = Ranking.objects.filter(**filters)
    serializer = RankingSerializer(
        queryset, many=True, context=self.get_serializer_context()
    )
    return Response(serializer.data)
def _add_value(self, result, field, item):
    labels = clear_list(flatten(r[1] for r in arg_to_iter(result))) or None
    self.logger.debug("resolved labels for %s: %s", item.get(field), labels)
    item[field] = labels
    return item
def merge_files(
    in_paths,
    out_path,
    keys="id",
    key_types=None,
    latest=None,
    latest_types=None,
    latest_min=None,
    latest_required=False,
    fieldnames=None,
    fieldnames_exclude=None,
    sort_keys=False,
    sort_latest=False,
    sort_fields=None,
    sort_descending=False,
    concat_output=False,
    log_level=None,
):
    """Merge files into one."""
    spark = _spark_session(log_level=log_level)
    if spark is None:
        raise RuntimeError(
            "Please make sure Spark is installed and configured correctly!"
        )

    in_paths = list(map(str, arg_to_iter(in_paths)))
    LOGGER.info(
        "Merging items from %s into <%s> with Spark session %r",
        f"[{len(in_paths)} paths]" if len(in_paths) > 10 else in_paths,
        out_path,
        spark,
    )

    fieldnames = clear_list(arg_to_iter(fieldnames))
    fieldnames_exclude = frozenset(arg_to_iter(fieldnames_exclude))
    if fieldnames and fieldnames_exclude:
        LOGGER.warning(
            "Both <fieldnames> and <fieldnames_exclude> were specified, please choose one"
        )

    sort_fields = tuple(arg_to_iter(sort_fields))
    if sum(map(bool, (sort_keys, sort_latest, sort_fields))) > 1:
        LOGGER.warning(
            "Only use at most one of <sort_keys>, <sort_latest>, and <sort_fields>"
        )

    keys = tuple(arg_to_iter(keys))
    key_types = tuple(arg_to_iter(key_types))
    key_types += (None,) * (len(keys) - len(key_types))
    assert len(keys) == len(key_types)
    LOGGER.info("Using keys %s with types %s", keys, key_types)

    latest = tuple(arg_to_iter(latest))
    latest_types = tuple(arg_to_iter(latest_types))
    latest_types += (None,) * (len(latest) - len(latest_types))
    assert len(latest) == len(latest_types)
    LOGGER.info("Using latest %s with types %s", latest, latest_types)

    data = spark.read.json(path=in_paths, mode="DROPMALFORMED", dropFieldIfAllNull=True)

    key_column_names = [f"_key_{i}" for i in range(len(keys))]
    key_columns = [
        _column_type(data[column], column_type).alias(name)
        for column, column_type, name in zip(keys, key_types, key_column_names)
    ]
    key_columns_str = (column.cast("string") for column in key_columns)

    latest_column_names = [f"_latest_{i}" for i in range(len(latest))]
    latest_columns = [
        _column_type(data[column], column_type).alias(name)
        for column, column_type, name in zip(latest, latest_types, latest_column_names)
    ]
    latest_columns_str = (column.cast("string") for column in latest_columns)

    drop_subset = keys + tuple(key_column_names)
    if latest_required:
        drop_subset += latest + tuple(latest_column_names)
    LOGGER.info("Dropping rows without values in columns %s", drop_subset)

    data = data.select(
        "*",
        *key_columns,
        array(*key_columns_str).alias("_key"),
        *latest_columns,
        array(*latest_columns_str).alias("_latest"),
    ).dropna(how="any", subset=drop_subset)

    if latest_min is not None:
        LOGGER.info("Filter out items before %s", latest_min)
        data = data.filter(latest_columns[0] >= latest_min)

    rdd = (
        data.rdd.keyBy(lambda row: tuple(arg_to_iter(row["_key"])))
        .reduceByKey(_compare)
        .values()
    )
    data = rdd.toDF(schema=data.schema)

    if sort_keys:
        LOGGER.info(
            "Sorting %s by columns %s",
            "descending" if sort_descending else "ascending",
            keys,
        )
        data = data.sort(*key_column_names, ascending=not sort_descending)
    elif sort_latest:
        LOGGER.info(
            "Sorting %s by columns %s",
            "descending" if sort_descending else "ascending",
            latest,
        )
        data = data.sort(*latest_column_names, ascending=not sort_descending)
    elif sort_fields:
        LOGGER.info(
            "Sorting %s by columns %s",
            "descending" if sort_descending else "ascending",
            sort_fields,
        )
        data = data.sort(*sort_fields, ascending=not sort_descending)

    data = data.drop("_key", *key_column_names, "_latest", *latest_column_names)

    columns = frozenset(data.columns) - fieldnames_exclude
    if fieldnames:
        fieldnames = [column for column in fieldnames if column in columns]
        LOGGER.info("Only use columns: %s", fieldnames)
    else:
        fieldnames = sorted(columns)
        LOGGER.info("Use sorted column names: %s", fieldnames)
    data = data.select(*fieldnames)

    data = _remove_empty(data)

    if concat_output:
        with tempfile.TemporaryDirectory() as temp_path:
            path = Path(temp_path) / "out"
            LOGGER.info("Saving temporary output to <%s>", path)
            data.write.json(path=str(path))
            LOGGER.info("Concatenate temporary files to <%s>", out_path)
            files = path.glob("part-*")
            concat_files(dst=out_path, srcs=sorted(files), ensure_newline=True)
    else:
        LOGGER.info("Saving output to <%s>", out_path)
        data.write.json(path=str(out_path))

    LOGGER.info("Done merging.")
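# Hedged sketch of the _compare() reducer passed to reduceByKey() above: given
# two rows that share the same "_key", keep the one whose "_latest" array (cast
# to strings earlier) compares greater, i.e. the more recent record. Assumed
# implementation for illustration; ties keep the first row seen.
def _compare(first, second):
    """Return whichever of the two rows carries the larger "_latest" value."""
    latest_first = tuple(arg_to_iter(first["_latest"]))
    latest_second = tuple(arg_to_iter(second["_latest"]))
    return second if latest_second > latest_first else first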