Beispiel #1
0
    def process_batch(self, batch: List[dict]) -> None:
        """Transform a batch of bundle dicts and bulk-index the results.

        Each item is parsed with ``IntermediateBundle.from_json`` and passed
        to ``transform_heavy``. Items whose transform result is falsy are
        skipped; every other result becomes one ``index`` action targeting
        ``self.es_index``, keyed by the document's ``key``.

        ``self.counts["docs-indexed"]`` is bumped as each action is queued
        (i.e. before the bulk request is sent), and
        ``self.counts["batches-indexed"]`` is bumped once after the bulk call
        returns. Nothing is sent when no actions were produced.

        Raises:
            AssertionError: if a parsed bundle has a falsy ``doc_type``.
        """
        actions = []
        for raw in batch:
            heavy = IntermediateBundle.from_json(raw)
            assert heavy.doc_type
            doc = transform_heavy(heavy)
            if doc:
                action = {
                    "_index": self.es_index,
                    "_op_type": "index",
                    "_id": doc.key,
                    "_source": doc.json(exclude_none=True, sort_keys=True),
                }
                actions.append(action)
                self.counts["docs-indexed"] += 1

        # Only talk to the cluster when there is something to index.
        if actions:
            elasticsearch.helpers.bulk(self.es_client, actions, timeout="50s")
            self.counts["batches-indexed"] += 1
def run_refs(infile: Sequence) -> None:
    """Print the references derived from each JSON line of *infile*.

    Every line is decoded with ``json.loads``, parsed into an
    ``IntermediateBundle`` and handed to ``refs_from_heavy``; each resulting
    ref is printed on its own line via
    ``ref.json(exclude_none=True, sort_keys=True)``.

    Raises:
        AssertionError: if a parsed bundle has a falsy ``doc_type``.
    """
    for raw_line in infile:
        bundle = IntermediateBundle.from_json(json.loads(raw_line))
        assert bundle.doc_type
        for ref in refs_from_heavy(bundle):
            print(ref.json(exclude_none=True, sort_keys=True))
def run_transform(infile: Sequence) -> None:
    """Print the transformed document for each JSON line of *infile*.

    Every line is decoded with ``json.loads``, parsed into an
    ``IntermediateBundle`` and passed to ``transform_heavy``. A truthy result
    is printed via ``json(exclude_none=True, sort_keys=True)``; a falsy
    result produces no output for that line.

    Raises:
        AssertionError: if a parsed bundle has a falsy ``doc_type``.
    """
    for raw_line in infile:
        bundle = IntermediateBundle.from_json(json.loads(raw_line))
        assert bundle.doc_type
        doc = transform_heavy(bundle)
        if doc:
            print(doc.json(exclude_none=True, sort_keys=True))