def _create_from_items(
    model,
    items,
    fields=None,
    fields_mapping=None,
    item_mapping=None,
    add_data=None,
    batch_size=None,
    dry_run=False,
):
    LOGGER.info("creating instances of %r", model)

    instances = _make_instances(
        model=model,
        items=items,
        fields=fields,
        fields_mapping=fields_mapping,
        item_mapping=item_mapping,
        add_data=add_data,
    )

    batches = batchify(instances, batch_size) if batch_size else (instances,)

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)
        if not dry_run:
            model.objects.bulk_create(batch)

    LOGGER.info("done processing")
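# All of the snippets in this section rely on a `batchify` helper that is not
# shown here. From its usage (an iterable plus a batch size, with each yielded
# batch later materialised via `tuple()` or `list()`), it presumably behaves
# like the minimal sketch below -- an assumption for illustration, not the
# project's actual implementation.
from itertools import islice


def batchify(iterable, size):
    """Yield tuples of at most `size` items from `iterable`."""
    iterator = iter(iterable)
    while True:
        batch = tuple(islice(iterator, size))
        if not batch:
            return
        yield batch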
def handle(self, *args, **kwargs):
    logging.basicConfig(
        stream=sys.stderr,
        level=logging.DEBUG if kwargs["verbosity"] > 1 else logging.INFO,
        format="%(asctime)s %(levelname)-8.8s [%(name)s:%(lineno)s] %(message)s",
    )

    LOGGER.info(kwargs)  # pylint: disable=no-member

    game_ids = frozenset(Game.objects.order_by().values_list("bgg_id", flat=True))

    instances = self._create_all_instances(
        path=kwargs["path"],
        filter_ids=game_ids,
        week_day=kwargs["week_day"],
        types=kwargs["types"],
    )

    batches = (
        batchify(instances, kwargs["batch"]) if kwargs["batch"] else (instances,)
    )

    for count, batch in enumerate(batches):
        LOGGER.info("Processing batch #%d...", count + 1)
        if kwargs["dry_run"]:
            for item in batch:
                print(item)
        else:
            Ranking.objects.bulk_create(batch)

    LOGGER.info("Done filling the database.")
def _type_requests(self, types, batch_size=10):
    query_tmpl = normalize_space(
        """SELECT DISTINCT ?game WHERE {{
            ?game <http://www.wikidata.org/prop/direct/P31> ?type .
            VALUES ?type {{ {} }}
        }}"""
    )

    if not batch_size:
        query = query_tmpl.format(" ".join(types))
        # self.logger.debug(query)
        yield Request(
            self.sparql_api_url,
            method="POST",
            body=urlencode({"query": query}),
            callback=self.parse_games,
            priority=1,
        )
        return

    for batch in batchify(types, batch_size):
        query = query_tmpl.format(" ".join(batch))
        # self.logger.debug(query)
        yield Request(self._api_url(query), callback=self.parse_games, priority=1)
def _type_requests(self, types, batch_size=10):
    query_tmpl = (
        "SELECT DISTINCT ?game WHERE {{ ?game a ?type . VALUES ?type {{ {} }} }}"
    )

    for batch in batchify(types, batch_size):
        query = query_tmpl.format(" ".join(batch))
        # self.logger.debug(query)
        yield Request(self._api_url(query), callback=self.parse_games, priority=1)
def _create_secondary_instances(
    model,
    secondary,
    items,
    models_order=(),
    batch_size=None,
    dry_run=False,
    **kwargs,
):
    instances = _make_secondary_instances(
        model=model,
        secondary=secondary,
        items=items,
        **kwargs,
    )
    del items

    batches = batchify(instances, batch_size) if batch_size else (instances,)
    del instances

    models_order = tuple(arg_to_iter(models_order))

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)

        models = defaultdict(list)
        for instance in batch:
            models[type(instance)].append(instance)
        order = models_order or tuple(models.keys())
        del batch

        for mdl in order:
            instances = models.pop(mdl, ())
            if not dry_run and instances:
                LOGGER.info("creating %d instances of %r", len(instances), mdl)
                mdl.objects.bulk_create(instances)

        if any(models.values()):
            LOGGER.warning(
                "some models have not been processed properly: %r",
                tuple(models.keys()),
            )
        del models

    del batches
    LOGGER.info("done processing")
def split_files(path_in, path_out=None, size=None, fields=FIELDS, exclude_empty=False):
    """Split a JSON lines file into JSON files of a given size."""

    path_in = Path(path_in).resolve()
    path_out = (
        "-" if path_out is None or path_out == "-" else Path(path_out).resolve()
    )

    LOGGER.info("Reading items from <%s> splitting them into <%s>", path_in, path_out)

    if path_out != "-":
        path_out.parent.mkdir(parents=True, exist_ok=True)

    items = tuple(_load_items(path_in, fields=fields, exclude_empty=exclude_empty))
    batches = batchify(items, size) if size else (items,)
    total = len(items)
    count = 0

    LOGGER.info("Read %d items from <%s>", total, path_in)

    for i, batch in enumerate(batches):
        batch = list(batch)
        count += len(batch)
        result = {
            "count": total,
            "previous": i - 1 if i else None,
            "next": i + 1 if count < total else None,
            "results": batch,
        }

        if path_out == "-":
            json.dump(result, sys.stdout, sort_keys=True)
            print()
        else:
            out_path = str(path_out).format(number=i)
            LOGGER.info("Writing batch #%d to <%s>", i, out_path)
            with open(out_path, "w") as out_file:
                json.dump(result, out_file, sort_keys=True)

    LOGGER.info("Done splitting.")
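# A hypothetical invocation of `split_files` above; the input path, output
# pattern and batch size are made up for illustration. The `{number}`
# placeholder in `path_out` is filled per batch via `str(path_out).format(number=i)`.
split_files(
    path_in="scraped/items.jl",
    path_out="batches/items_{number}.json",
    size=1000,
)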
def _game_requests(self, *bgg_ids, batch_size=10, page=1, priority=0, **kwargs):
    # drop invalid IDs and duplicates
    bgg_ids = clear_list(map(parse_int, bgg_ids))

    if not bgg_ids:
        return

    # on the first page, skip IDs that have already been requested
    bgg_ids = (
        (bgg_id for bgg_id in bgg_ids if bgg_id not in self._ids_seen)
        if page == 1
        else bgg_ids
    )

    for batch in batchify(bgg_ids, batch_size):
        batch = tuple(batch)
        ids = ",".join(map(str, batch))

        # page 1 requests full details; later pages only versions and rating comments
        url = (
            self._api_url(
                action="thing",
                id=ids,
                stats=1,
                videos=1,
                versions=int(self.scrape_ratings),
                ratingcomments=int(self.scrape_ratings),
                page=1,
            )
            if page == 1
            else self._api_url(
                action="thing", id=ids, versions=1, ratingcomments=1, page=page
            )
        )

        request = Request(url, callback=self.parse_game, priority=priority)

        if len(batch) == 1:
            request.meta["bgg_id"] = batch[0]
        request.meta["page"] = page
        request.meta.update(kwargs)

        yield request

        if page == 1:
            self._ids_seen.update(batch)
def _create_references(
    model,
    items,
    foreign=None,
    recursive=None,
    batch_size=None,
    dry_run=False,
):
    # normalise the foreign and recursive reference specs
    foreign = foreign or {}
    foreign = {k: tuple(arg_to_iter(v)) for k, v in foreign.items()}
    foreign = {k: v for k, v in foreign.items() if len(v) == 2}
    recursive = (
        {r: r for r in arg_to_iter(recursive)}
        if not isinstance(recursive, dict)
        else recursive
    )

    if not foreign and not recursive:
        LOGGER.warning(
            "neither foreign nor recursive references given, got nothing to do..."
        )
        return

    LOGGER.info("creating foreign references: %r", foreign)
    LOGGER.info("creating recursive references: %r", recursive)

    count = -1
    foreign_values = {f[0]: defaultdict(set) for f in foreign.values()}
    updates = {}

    # first pass: collect referenced values and the per-item updates to apply
    for count, item in enumerate(items):
        update = defaultdict(list)

        for field, (fmodel, _) in foreign.items():
            for value in filter(
                None, map(_parse_value_id, arg_to_iter(item.get(field)))
            ):
                id_ = value.get("id")
                value = value.get("value")
                if id_ and value:
                    foreign_values[fmodel][id_].add(value)
                    update[field].append(id_)

        for rec_from, rec_to in recursive.items():
            rec = {parse_int(r) for r in arg_to_iter(item.get(rec_from)) if r}
            rec = (
                sorted(
                    model.objects.filter(pk__in=rec)
                    .values_list("pk", flat=True)
                    .distinct()
                )
                if rec
                else None
            )
            if rec:
                update[rec_to] = rec

        pkey = parse_int(item.get(model._meta.pk.name))
        if pkey and any(update.values()):
            updates[pkey] = update

        if (count + 1) % 1000 == 0:
            LOGGER.info("processed %d items so far", count + 1)

    del items, recursive
    LOGGER.info("processed %d items in total", count + 1)

    # create the referenced (foreign) instances first
    for fmodel, value_field in frozenset(foreign.values()):
        id_field = fmodel._meta.pk.name
        LOGGER.info(
            "found %d items for model %r to create",
            len(foreign_values[fmodel]),
            fmodel,
        )
        values = (
            {id_field: k, value_field: take_first(v)}
            for k, v in foreign_values[fmodel].items()
            if k and v
        )
        _create_from_items(
            model=fmodel,
            items=values,
            batch_size=batch_size,
            dry_run=dry_run,
        )

    del foreign, foreign_values

    # then apply the collected field updates in batches
    LOGGER.info("found %d items for model %r to update", len(updates), model)
    batches = (
        batchify(updates.items(), batch_size) if batch_size else (updates.items(),)
    )

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)
        if not dry_run:
            with atomic():
                for pkey, update in batch:
                    try:
                        instance = model.objects.get(pk=pkey)
                        for field, values in update.items():
                            getattr(instance, field).set(values)
                        instance.save()
                    except Exception:
                        LOGGER.exception(
                            "an error occurred when updating <%s> with %r",
                            pkey,
                            update,
                        )

    del batches, updates
    LOGGER.info("done updating")