def _create_from_items(
    model,
    items,
    fields=None,
    fields_mapping=None,
    item_mapping=None,
    add_data=None,
    batch_size=None,
    dry_run=False,
):
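    """Create instances of ``model`` from ``items`` and bulk-insert them,
    optionally in batches of ``batch_size``; skip writes when ``dry_run``."""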
    LOGGER.info("creating instances of %r", model)

    instances = _make_instances(
        model=model,
        items=items,
        fields=fields,
        fields_mapping=fields_mapping,
        item_mapping=item_mapping,
        add_data=add_data,
    )

    batches = batchify(instances, batch_size) if batch_size else (instances, )

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)
        if not dry_run:
            model.objects.bulk_create(batch)

    LOGGER.info("done processing")
Example #2
    def handle(self, *args, **kwargs):
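        """Entry point of the management command: configure logging, build
        ranking instances from ``path``, and bulk-insert them in batches."""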
        logging.basicConfig(
            stream=sys.stderr,
            level=logging.DEBUG if kwargs["verbosity"] > 1 else logging.INFO,
            format="%(asctime)s %(levelname)-8.8s [%(name)s:%(lineno)s] %(message)s",
        )

        LOGGER.info(kwargs)

        # pylint: disable=no-member
        game_ids = frozenset(Game.objects.order_by().values_list("bgg_id",
                                                                 flat=True))
        instances = self._create_all_instances(
            path=kwargs["path"],
            filter_ids=game_ids,
            week_day=kwargs["week_day"],
            types=kwargs["types"],
        )
        batches = (batchify(instances, kwargs["batch"]) if kwargs["batch"] else
                   (instances, ))

        for count, batch in enumerate(batches):
            LOGGER.info("Processing batch #%d...", count + 1)
            if kwargs["dry_run"]:
                for item in batch:
                    print(item)
            else:
                Ranking.objects.bulk_create(batch)

        LOGGER.info("Done filling the database.")
Example #3
    def _type_requests(self, types, batch_size=10):
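        """Yield SPARQL requests for all items that are instances of the given
        types, batching the ``VALUES`` list when ``batch_size`` is set."""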
        query_tmpl = normalize_space("""
            SELECT DISTINCT ?game WHERE {{
                ?game <http://www.wikidata.org/prop/direct/P31> ?type .
                VALUES ?type {{ {} }}
            }}""")

        if not batch_size:
            query = query_tmpl.format(" ".join(types))
            # self.logger.debug(query)
            yield Request(
                self.sparql_api_url,
                method="POST",
                body=urlencode({"query": query}),
                callback=self.parse_games,
                priority=1,
            )
            return

        for batch in batchify(types, batch_size):
            query = query_tmpl.format(" ".join(batch))
            # self.logger.debug(query)
            yield Request(self._api_url(query),
                          callback=self.parse_games,
                          priority=1)
Example #4
    def _type_requests(self, types, batch_size=10):
        query_tmpl = (
            "SELECT DISTINCT ?game WHERE {{ ?game a ?type . VALUES ?type {{ {} }} }}"
        )

        for batch in batchify(types, batch_size):
            query = query_tmpl.format(" ".join(batch))
            # self.logger.debug(query)
            yield Request(self._api_url(query),
                          callback=self.parse_games,
                          priority=1)
Example #5
def _create_secondary_instances(
        model,
        secondary,
        items,
        models_order=(),
        batch_size=None,
        dry_run=False,
        **kwargs,
):
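    """Create instances of ``model`` and its ``secondary`` model(s) from
    ``items``, bulk-inserting each batch grouped by model in ``models_order``."""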
    instances = _make_secondary_instances(
        model=model,
        secondary=secondary,
        items=items,
        **kwargs,
    )
    del items
    batches = batchify(instances, batch_size) if batch_size else (instances, )
    del instances
    models_order = tuple(arg_to_iter(models_order))

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)

        models = defaultdict(list)
        for instance in batch:
            models[type(instance)].append(instance)
        order = models_order or tuple(models.keys())
        del batch

        for mdl in order:
            instances = models.pop(mdl, ())
            if not dry_run and instances:
                LOGGER.info("creating %d instances of %r", len(instances), mdl)
                mdl.objects.bulk_create(instances)

        if any(models.values()):
            LOGGER.warning(
                "some models have not been processed properly: %r",
                tuple(models.keys()),
            )

        del models

    del batches

    LOGGER.info("done processing")
Example #6
def split_files(path_in,
                path_out=None,
                size=None,
                fields=FIELDS,
                exclude_empty=False):
    """Split a JSON lines file into JSON files of a given size."""

    path_in = Path(path_in).resolve()
    path_out = "-" if path_out is None or path_out == "-" else Path(
        path_out).resolve()

    LOGGER.info("Reading items from <%s> splitting them into <%s>", path_in,
                path_out)

    if path_out != "-":
        path_out.parent.mkdir(parents=True, exist_ok=True)

    items = tuple(
        _load_items(path_in, fields=fields, exclude_empty=exclude_empty))
    batches = batchify(items, size) if size else (items, )
    total = len(items)
    count = 0

    LOGGER.info("Read %d items from <%s>", total, path_in)

    for i, batch in enumerate(batches):
        batch = list(batch)
        count += len(batch)
        result = {
            "count": total,
            "previous": i - 1 if i else None,
            "next": i + 1 if count < total else None,
            "results": batch,
        }

        if path_out == "-":
            json.dump(result, sys.stdout, sort_keys=True)
            print()

        else:
            out_path = str(path_out).format(number=i)
            LOGGER.info("Writing batch #%d to <%s>", i, out_path)
            with open(out_path, "w") as out_file:
                json.dump(result, out_file, sort_keys=True)

    LOGGER.info("Done splitting.")
Example #7
    def _game_requests(self,
                       *bgg_ids,
                       batch_size=10,
                       page=1,
                       priority=0,
                       **kwargs):
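        """Yield BGG XML API "thing" requests for the given BGG IDs in
        batches; on page 1, skip IDs already seen and request stats/videos."""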
        bgg_ids = clear_list(map(parse_int, bgg_ids))

        if not bgg_ids:
            return

        bgg_ids = ((bgg_id for bgg_id in bgg_ids
                    if bgg_id not in self._ids_seen) if page == 1 else bgg_ids)

        for batch in batchify(bgg_ids, batch_size):
            batch = tuple(batch)

            ids = ",".join(map(str, batch))

            url = (self._api_url(
                action="thing",
                id=ids,
                stats=1,
                videos=1,
                versions=int(self.scrape_ratings),
                ratingcomments=int(self.scrape_ratings),
                page=1,
            ) if page == 1 else self._api_url(action="thing",
                                              id=ids,
                                              versions=1,
                                              ratingcomments=1,
                                              page=page))

            request = Request(url, callback=self.parse_game, priority=priority)

            if len(batch) == 1:
                request.meta["bgg_id"] = batch[0]
            request.meta["page"] = page
            request.meta.update(kwargs)

            yield request

            if page == 1:
                self._ids_seen.update(batch)
Example #8
def _create_references(
    model,
    items,
    foreign=None,
    recursive=None,
    batch_size=None,
    dry_run=False,
):
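    """Resolve foreign and recursive (self-referential) references on
    ``items``: create the referenced foreign instances first, then update
    the corresponding ``model`` instances with the collected values."""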
    foreign = foreign or {}
    foreign = {k: tuple(arg_to_iter(v)) for k, v in foreign.items()}
    foreign = {k: v for k, v in foreign.items() if len(v) == 2}

    recursive = (recursive if isinstance(recursive, dict) else
                 {r: r for r in arg_to_iter(recursive)})

    if not foreign and not recursive:
        LOGGER.warning(
            "neither foreign nor recursive references given, got nothing to do..."
        )
        return

    LOGGER.info("creating foreign references: %r", foreign)
    LOGGER.info("creating recursive references: %r", recursive)

    count = -1
    foreign_values = {f[0]: defaultdict(set) for f in foreign.values()}
    updates = {}

    for count, item in enumerate(items):
        update = defaultdict(list)

        for field, (fmodel, _) in foreign.items():
            for value in filter(
                    None, map(_parse_value_id, arg_to_iter(item.get(field)))):
                id_ = value.get("id")
                value = value.get("value")
                if id_ and value:
                    foreign_values[fmodel][id_].add(value)
                    update[field].append(id_)

        for rec_from, rec_to in recursive.items():
            rec = {parse_int(r) for r in arg_to_iter(item.get(rec_from)) if r}
            rec = (sorted(
                model.objects.filter(pk__in=rec).values_list(
                    "pk", flat=True).distinct()) if rec else None)
            if rec:
                update[rec_to] = rec

        pkey = parse_int(item.get(model._meta.pk.name))
        if pkey and any(update.values()):
            updates[pkey] = update

        if (count + 1) % 1000 == 0:
            LOGGER.info("processed %d items so far", count + 1)

    del items, recursive

    LOGGER.info("processed %d items in total", count + 1)

    for fmodel, value_field in frozenset(foreign.values()):
        id_field = fmodel._meta.pk.name
        LOGGER.info("found %d items for model %r to create",
                    len(foreign_values[fmodel]), fmodel)
        values = ({
            id_field: k,
            value_field: take_first(v)
        } for k, v in foreign_values[fmodel].items() if k and v)
        _create_from_items(
            model=fmodel,
            items=values,
            batch_size=batch_size,
            dry_run=dry_run,
        )

    del foreign, foreign_values

    LOGGER.info("found %d items for model %r to update", len(updates), model)

    batches = (batchify(updates.items(), batch_size) if batch_size else
               (updates.items(), ))

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)
        if not dry_run:
            with atomic():
                for pkey, update in batch:
                    try:
                        instance = model.objects.get(pk=pkey)
                        for field, values in update.items():
                            getattr(instance, field).set(values)
                        instance.save()
                    except Exception:
                        LOGGER.exception(
                            "an error ocurred when updating <%s> with %r",
                            pkey,
                            update,
                        )

    del batches, updates

    LOGGER.info("done updating")