Example no. 1
def run_elasticsearch_changelogs(args):
    for line in args.json_input:
        line = line.strip()
        if not line:
            continue
        entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client)
        args.json_output.write(
            json.dumps(changelog_to_elasticsearch(entity)) + '\n')
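
For context, a minimal sketch of how the args namespace these run_* commands read (json_input, json_output, api) could be wired up from the command line. The flag names, the API URL default, and the public_api() helper are assumptions for illustration, not the project's actual entry point; run_elasticsearch_changelogs() and public_api() are assumed to be defined or imported in the same module.

import argparse
import sys

def main():
    # hypothetical CLI wiring; only the attribute names the function reads are taken from above
    parser = argparse.ArgumentParser()
    parser.add_argument('--fatcat-api-url', default='https://api.fatcat.wiki/v0')  # assumed default
    parser.add_argument('json_input', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('json_output', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    args = parser.parse_args()
    args.api = public_api(args.fatcat_api_url)  # assumed helper exposing .api_client
    run_elasticsearch_changelogs(args)

if __name__ == '__main__':
    main()
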
Example no. 2
def run_elasticsearch_releases(args):
    for line in args.json_input:
        line = line.strip()
        if not line:
            continue
        entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client)
        args.json_output.write(
            json.dumps(release_to_elasticsearch(entity)) + '\n')
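
The command above writes one Elasticsearch document per output line (NDJSON). Below is a sketch of pushing such a file to the Elasticsearch bulk API, loosely mirroring what the Kafka worker in example no. 6 does; the endpoint, index name, and the use of each document's ident field as _id are assumptions here.

import json
import requests

def bulk_index_ndjson(ndjson_path, es_backend='http://localhost:9200', es_index='fatcat_release'):
    # pair every transformed document with a bulk 'index' action line
    actions = []
    with open(ndjson_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            doc = json.loads(line)
            actions.append(json.dumps({'index': {'_id': doc.get('ident')}}))
            actions.append(json.dumps(doc))
    resp = requests.post(
        '{}/{}/_bulk'.format(es_backend, es_index),
        headers={'Content-Type': 'application/x-ndjson'},
        data='\n'.join(actions) + '\n',
    )
    resp.raise_for_status()
    return resp.json()
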
Example no. 3
def run_elasticsearch_files(args):
    for line in args.json_input:
        line = line.strip()
        if not line:
            continue
        entity = entity_from_json(line,
                                  FileEntity,
                                  api_client=args.api.api_client)
        if entity.state != 'active':
            continue
        args.json_output.write(
            json.dumps(file_to_elasticsearch(entity)) + '\n')
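
The three run_elasticsearch_* commands share the same read/transform/write loop; the helper below is a hypothetical generalization (not in the source) that captures that shared shape, parameterized by entity class and transform function, with an optional filter for the active-only case used for files.

def run_elasticsearch_transform(args, entity_cls, transform_func, active_only=False):
    # generic form of the loops above: one JSON entity per input line,
    # one Elasticsearch document per output line
    for line in args.json_input:
        line = line.strip()
        if not line:
            continue
        entity = entity_from_json(line, entity_cls, api_client=args.api.api_client)
        if active_only and entity.state != 'active':
            continue
        args.json_output.write(json.dumps(transform_func(entity)) + '\n')
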
Example no. 4
def run_citeproc_releases(args):
    for line in args.json_input:
        line = line.strip()
        if not line:
            continue
        entity = entity_from_json(line,
                                  ReleaseEntity,
                                  api_client=args.api.api_client)
        csl_json = release_to_csl(entity)
        csl_json['id'] = "release:" + (entity.ident or "unknown")
        out = citeproc_csl(csl_json, args.style, args.html)
        args.json_output.write(out + "\n")
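
citeproc_csl() takes a CSL-JSON item plus a style name and an HTML flag. The concrete fields release_to_csl() emits are not reproduced here; the record below is just a minimal, standard CSL-JSON item for illustration, and 'harvard1' is assumed to be one of the styles resolvable by the styles package.

example_csl = {
    # standard CSL-JSON field names only; not the exact output of release_to_csl()
    'id': 'release:unknown',
    'type': 'article-journal',
    'title': 'An Example Article',
    'author': [{'family': 'Doe', 'given': 'Jane'}],
    'container-title': 'Journal of Examples',
    'issued': {'date-parts': [[2020, 1]]},
}
formatted = citeproc_csl(example_csl, 'harvard1', False)
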
Example no. 5
def run_citeproc_releases(args):
    for line in args.json_input:
        line = line.strip()
        if not line:
            continue
        entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client)
        csl_json = release_to_csl(entity)
        # give the CSL item a stable identifier for citation registration below
        csl_json['id'] = "release:" + (entity.ident or "unknown")
        if args.style == "csl-json":
            args.json_output.write(json.dumps(csl_json) + "\n")
            continue
        bib_src = CiteProcJSON([csl_json])
        form = formatter.plain
        if args.html:
            form = formatter.html
        style_path = get_style_filepath(args.style)
        bib_style = CitationStylesStyle(style_path, validate=False)
        bib = CitationStylesBibliography(bib_style, bib_src, form)
        bib.register(Citation([CitationItem(csl_json['id'])]))
        # render the registered citation into bibliography output lines
        lines = bib.bibliography()[0]
        if args.style == "bibtex":
            # reflow the single-record bibtex rendering into an indented entry
            for bib_line in lines:
                if bib_line.startswith(" @"):
                    args.json_output.write("\n@")
                elif bib_line.startswith(" "):
                    args.json_output.write("\n  " + bib_line)
                else:
                    args.json_output.write(bib_line)
            args.json_output.write("\n")
        else:
            args.json_output.write(''.join(lines) + "\n")
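
The expanded version above calls into citeproc-py and citeproc-styles directly; these are the imports it relies on (module paths as documented by those libraries), plus json from the standard library.

import json

from citeproc import (
    Citation,
    CitationItem,
    CitationStylesBibliography,
    CitationStylesStyle,
    formatter,
)
from citeproc.source.json import CiteProcJSON
from citeproc_styles import get_style_filepath
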
Example no. 6
    def run(self) -> None:
        ac = ApiClient()
        api = public_api(self.api_host)

        # only used by container indexing query_stats code path
        es_client = elasticsearch.Elasticsearch(self.elasticsearch_backend)

        def fail_fast(err: Any, partitions: List[Any]) -> None:
            if err is not None:
                print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
                print("Bailing out...", file=sys.stderr)
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for p in partitions:
                # check for partition-specific commit errors
                if p.error:
                    print("Kafka consumer commit error: {}".format(p.error), file=sys.stderr)
                    print("Bailing out...", file=sys.stderr)
                    # TODO: should it be sys.exit(-1)?
                    raise KafkaException(p.error)
            # commit succeeded; nothing further to do

        def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
            for p in partitions:
                if p.error:
                    raise KafkaException(p.error)
            print(
                "Kafka partitions rebalanced: {} / {}".format(consumer, partitions),
                file=sys.stderr,
            )

        consumer_conf = self.kafka_config.copy()
        consumer_conf.update(
            {
                "group.id": self.consumer_group,
                "on_commit": fail_fast,
                # messages don't have offset marked as stored until pushed to
                # elastic, but we do auto-commit stored offsets to broker
                "enable.auto.commit": True,
                "enable.auto.offset.store": False,
                # user code timeout; if no poll after this long (1 minute here),
                # assume user code hung and trigger a rebalance (librdkafka default: 5min)
                "max.poll.interval.ms": 60000,
                "default.topic.config": {
                    "auto.offset.reset": "latest",
                },
            }
        )
        consumer = Consumer(consumer_conf)
        consumer.subscribe(
            [self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )

        while True:
            batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval)
            if not batch:
                if not consumer.assignment():
                    print("... no Kafka consumer partitions assigned yet", file=sys.stderr)
                print(
                    "... nothing new from kafka, try again (interval: {}".format(
                        self.poll_interval
                    ),
                    file=sys.stderr,
                )
                continue
            print("... got {} kafka messages".format(len(batch)), file=sys.stderr)
            # first check errors on entire batch...
            for msg in batch:
                if msg.error():
                    raise KafkaException(msg.error())
            # ... then process
            bulk_actions = []
            for msg in batch:
                json_str = msg.value().decode("utf-8")
                entity = entity_from_json(json_str, self.entity_type, api_client=ac)
                assert isinstance(entity, self.entity_type)
                if self.entity_type == ChangelogEntry:
                    key = entity.index
                    # might need to fetch from API
                    if not (
                        entity.editgroup  # pylint: disable=no-member # (TODO)
                        and entity.editgroup.editor  # pylint: disable=no-member # (TODO)
                    ):
                        entity = api.get_changelog_entry(entity.index)
                else:
                    key = entity.ident  # pylint: disable=no-member # (TODO)

                if self.entity_type != ChangelogEntry and entity.state == "wip":
                    print(
                        f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}",
                        file=sys.stderr,
                    )
                    continue

                if self.entity_type == ContainerEntity and self.query_stats:
                    stats = query_es_container_stats(
                        entity.ident,
                        es_client=es_client,
                        es_index=self.elasticsearch_release_index,
                        merge_shadows=True,
                    )
                    doc_dict = container_to_elasticsearch(entity, stats=stats)
                else:
                    doc_dict = self.transform_func(entity)

                # TODO: handle deletions from index
                bulk_actions.append(
                    json.dumps(
                        {
                            "index": {
                                "_id": key,
                            },
                        }
                    )
                )
                bulk_actions.append(json.dumps(doc_dict))

            # if only WIP entities, then skip
            if not bulk_actions:
                for msg in batch:
                    consumer.store_offsets(message=msg)
                continue

            print(
                "Upserting, eg, {} (of {} {} in elasticsearch)".format(
                    key, len(batch), self.entity_type.__name__
                ),
                file=sys.stderr,
            )
            elasticsearch_endpoint = "{}/{}/_bulk".format(
                self.elasticsearch_backend, self.elasticsearch_index
            )
            resp = requests.post(
                elasticsearch_endpoint,
                headers={"Content-Type": "application/x-ndjson"},
                data="\n".join(bulk_actions) + "\n",
            )
            resp.raise_for_status()
            if resp.json()["errors"]:
                desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
                print(desc, file=sys.stderr)
                print(resp.content, file=sys.stderr)
                raise Exception(desc)
            for msg in batch:
                # offsets are *committed* (to brokers) automatically, but need
                # to be marked as processed here
                consumer.store_offsets(message=msg)
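
Stripped of the indexing logic, the consumer loop above follows a common confluent-kafka pattern: consume a batch, fail fast on message errors, process, then store_offsets() so the background auto-commit only advances past processed messages. A minimal standalone sketch of that pattern follows; the broker, topic, group id, batch size, and the process() placeholder are assumptions.

import sys

from confluent_kafka import Consumer, KafkaException

def process(raw: bytes) -> None:
    # placeholder for the transform-and-index step
    print('processing {} bytes'.format(len(raw)), file=sys.stderr)

conf = {
    'bootstrap.servers': 'localhost:9092',  # placeholder broker
    'group.id': 'example-group',            # placeholder consumer group
    'enable.auto.commit': True,
    'enable.auto.offset.store': False,      # offsets stored manually after processing
    'auto.offset.reset': 'latest',
}
consumer = Consumer(conf)
consumer.subscribe(['example-topic'])
try:
    while True:
        batch = consumer.consume(num_messages=50, timeout=10.0)
        for msg in batch:
            if msg.error():
                raise KafkaException(msg.error())
            process(msg.value())
            # mark as processed; auto-commit later pushes the stored offset
            consumer.store_offsets(message=msg)
finally:
    consumer.close()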