Beispiel #1
0
def get_sickle():
    """Build the sickle OAI harvester used for PMC.

    ``sickle`` is imported inside the function, so the package is only
    required once a harvester is actually requested.

    Returns:
        A ``Sickle`` client constructed with ``endpoint=endpoint``; the
        ``endpoint`` name is resolved from the enclosing scope.
    """
    from sickle import Sickle

    harvester = Sickle(endpoint=endpoint)
    return harvester
Beispiel #2
0
    def fetch_date(self, date):
        """Harvest one day of OAI-PMH records and publish them to Kafka.

        A ListRecords request is issued against ``self.endpoint_url`` with
        ``from`` and ``until`` both set to ``date`` (ISO formatted). Each
        record's raw payload is produced to the topic named by
        ``self.produce_topic``, partitioned by the record identifier.

        Args:
            date: object exposing ``isoformat()`` (e.g. ``datetime.date``).

        Returns:
            None. When the endpoint reports no matching records a warning
            is printed and nothing is produced.
        """
        client = sickle.Sickle(self.endpoint_url)
        day = date.isoformat()
        topic = self.kafka.topics[self.produce_topic]

        # 'from' is a reserved word in Python, so the request parameters are
        # passed as an unpacked dict (the approach the sickle docs suggest).
        query = {
            'metadataPrefix': self.metadata_prefix,
            'from': day,
            'until': day,
        }
        try:
            harvested = client.ListRecords(**query)
        except sickle.oaiexceptions.NoRecordsMatch:
            print("WARN: no OAI-PMH records for this date: {} (UTC)".format(day))
            return

        with topic.get_producer() as kafka_producer:
            for seen, record in enumerate(harvested, 1):
                # Progress marker every 50 records.
                if seen % 50 == 0:
                    print("... up to {}".format(seen))
                payload = record.raw.encode('utf-8')
                key = record.header.identifier.encode('utf-8')
                kafka_producer.produce(payload, partition_key=key)
Beispiel #3
0
 def __init__(self,
              oai_url="http://export.arxiv.org/oai2",
              metadata_format='arXivRaw'):
     """Create the OAI-PMH client used to pull metadata.

     Args:
         oai_url: base URL of the OAI-PMH endpoint to harvest from.
         metadata_format: metadata prefix stored on the instance as
             ``self.metadata_format``.
     """
     self.metadata_format = metadata_format
     # The attribute holds the Sickle client itself; OAIItemIterator is only
     # the iterator class handed to it, so the annotation must be Sickle.
     self.arxiv: sickle.Sickle = sickle.Sickle(oai_url,
                                               iterator=OAIItemIterator)
     print(
         f"*** extracting metadata from {oai_url} in {metadata_format} format ***"
     )
Beispiel #4
0
def getSickle(url):
    """
    Build an OAI-PMH client for the given service.

    Args:
        url: base URL of the OAI-PMH service

    Returns:
        sickle.Sickle client constructed with encoding=DEFAULT_ENCODING
    """
    client = sickle.Sickle(url, encoding=DEFAULT_ENCODING)
    return client
Beispiel #5
0
    def fetch_date(self, date: datetime.date) -> None:
        """Harvest one day of OAI-PMH records and publish them to Kafka.

        A ListRecords request is issued against ``self.endpoint_url`` with
        ``from`` and ``until`` both set to ``date``. Every record's raw
        payload is produced to ``self.produce_topic``, keyed by the record
        identifier, and the producer is flushed before returning.

        Args:
            date: the day to harvest; sent to the endpoint in ISO format.

        Raises:
            KafkaException: raised from the delivery callback as soon as a
                delivery error is reported for a produced message.
        """

        def fail_fast(err: Any, _msg: Any) -> None:
            # Delivery callback: abort the harvest on the first failed message.
            if err is not None:
                print("Kafka producer delivery error: {}".format(err),
                      file=sys.stderr)
                print("Bailing out...", file=sys.stderr)
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)

        producer_conf = self.kafka_config.copy()
        producer_conf.update({
            "delivery.report.only.error": True,
            "default.topic.config": {
                "request.required.acks": -1,  # all brokers must confirm
            },
        })
        producer = Producer(producer_conf)

        api = sickle.Sickle(self.endpoint_url,
                            max_retries=5,
                            retry_status_codes=[503])
        date_str = date.isoformat()
        # this dict kwargs hack is to work around 'from' as a reserved python keyword
        # recommended by sickle docs
        try:
            records = api.ListRecords(
                **{
                    "metadataPrefix": self.metadata_prefix,
                    "from": date_str,
                    "until": date_str,
                })
        except sickle.oaiexceptions.NoRecordsMatch:
            print(
                "WARN: no OAI-PMH records for this date: {} (UTC)".format(
                    date_str),
                file=sys.stderr,
            )
            return

        for count, item in enumerate(records, start=1):
            if count % 50 == 0:
                print("... up to {}".format(count), file=sys.stderr)
            producer.produce(
                self.produce_topic,
                item.raw.encode("utf-8"),
                key=item.header.identifier.encode("utf-8"),
                on_delivery=fail_fast,
            )
            # Delivery callbacks only run from poll()/flush(). Without this
            # non-blocking poll, fail_fast could not fire until the final
            # flush(), i.e. after the whole day had already been harvested.
            producer.poll(0)
        # Block until every queued message is delivered (or reported failed).
        producer.flush()
def oaipmh_to_elastic(start_date, end_date=None, threads=0, chunk_size=None, url=None):
    """Harvest BASE OAI-PMH records and save the complete ones to Elasticsearch.

    Records are requested from http://oai.base-search.net/oai with the
    'base_dc' metadata prefix, converted to plain dicts, and written in
    batches through save_records_in_es.

    NOTE: this function uses Python 2 print statements.

    Args:
        start_date: sent as the OAI-PMH 'from' argument.
        end_date: if truthy, sent as the OAI-PMH 'until' argument.
        threads: forwarded to save_records_in_es for full batches.
        chunk_size: forwarded to save_records_in_es for every batch.
        url: forwarded to set_up_elastic.
    """
    es = set_up_elastic(url)
    # os.getenv returns None when STATIC_IP_PROXY is unset; the same value is
    # used for both schemes.
    proxy_url = os.getenv("STATIC_IP_PROXY")
    proxies = {"https": proxy_url, "http": proxy_url}
    base_sickle = sickle.Sickle("http://oai.base-search.net/oai", proxies=proxies)
    # 'from' is a reserved word in Python, so the request arguments are built
    # as a dict and unpacked into ListRecords.
    args = {'metadataPrefix': 'base_dc', 'from': start_date}
    if end_date:
        args["until"] = end_date
    oai_records = base_sickle.ListRecords(ignore_deleted=True, **args)

    # Batch of records waiting to be written to Elasticsearch.
    records_to_save = []
    print 'chunk_size', chunk_size
    # The loop below ends on the first falsy value from safe_get_next_record
    # (presumably None once the iterator is exhausted -- confirm in the helper).
    oai_record = safe_get_next_record(oai_records)
    while oai_record:
        record = {}
        record["id"] = oai_record.header.identifier
        record["base_timestamp"] = oai_record.header.datestamp
        record["added_timestamp"] = datetime.datetime.utcnow().isoformat()

        record["title"] = oai_tag_match("title", oai_record)
        record["license"] = oai_tag_match("rights", oai_record)
        # Default "oa" to 0 when int() raises TypeError (e.g. on None).
        # NOTE(review): a non-numeric string would raise ValueError, which is
        # not caught here -- confirm oai_tag_match cannot return one.
        try:
            record["oa"] = int(oai_tag_match("oa", oai_record))
        except TypeError:
            record["oa"] = 0

        record["urls"] = oai_tag_match("identifier", oai_record, return_list=True)
        record["authors"] = oai_tag_match("creator", oai_record, return_list=True)
        record["relations"] = oai_tag_match("relation", oai_record, return_list=True)
        record["sources"] = oai_tag_match("collname", oai_record, return_list=True)

        # Progress output: ':' for a record queued for saving, '.' for one
        # skipped as incomplete.
        if is_complete(record):
            action_record = make_record_for_es(record)
            records_to_save.append(action_record)
            print ":",
        else:
            print ".",

        # Write the batch once it reaches 1000 records, then start a new one.
        if len(records_to_save) >= 1000:
            save_records_in_es(es, records_to_save, threads, chunk_size)
            print "last record saved:", records_to_save[-1]
            print "last timestamp saved:", records_to_save[-1]["base_timestamp"]
            records_to_save = []

        oai_record = safe_get_next_record(oai_records)

    # make sure to get the last ones
    # NOTE(review): the final partial batch is saved with a hard-coded 1 where
    # full batches pass `threads` -- confirm this is intentional.
    if records_to_save:
        save_records_in_es(es, records_to_save, 1, chunk_size)
        print "last record saved:", records_to_save[-1]
Beispiel #7
0
    def fetch_date(self, date):
        """Harvest one day of OAI-PMH records and publish them to Kafka.

        Issues a ListRecords request against ``self.endpoint_url`` with
        ``from`` and ``until`` both set to ``date`` (ISO formatted), then
        produces each record's raw payload to ``self.produce_topic`` keyed by
        the record identifier. The producer is flushed before returning.

        Args:
            date: object exposing ``isoformat()`` (e.g. ``datetime.date``).

        Returns:
            None. If the endpoint reports no matching records, a warning is
            printed and nothing is produced.

        Raises:
            KafkaException: raised from the delivery callback when a
                delivery error is reported for a produced message.
        """

        def fail_fast(err, msg):
            # Delivery callback: stop the harvest on the first failed message.
            if err is not None:
                print("Kafka producer delivery error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)

        producer_conf = self.kafka_config.copy()
        producer_conf.update({
            'delivery.report.only.error': True,
            'default.topic.config': {
                'request.required.acks': -1,  # all brokers must confirm
            },
        })
        producer = Producer(producer_conf)

        api = sickle.Sickle(self.endpoint_url)
        date_str = date.isoformat()
        # this dict kwargs hack is to work around 'from' as a reserved python keyword
        # recommended by sickle docs
        try:
            records = api.ListRecords(
                **{
                    'metadataPrefix': self.metadata_prefix,
                    'from': date_str,
                    'until': date_str,
                })
        except sickle.oaiexceptions.NoRecordsMatch:
            print("WARN: no OAI-PMH records for this date: {} (UTC)".format(
                date_str))
            return

        for count, item in enumerate(records, start=1):
            if count % 50 == 0:
                print("... up to {}".format(count))
            producer.produce(self.produce_topic,
                             item.raw.encode('utf-8'),
                             key=item.header.identifier.encode('utf-8'),
                             on_delivery=fail_fast)
            # Callbacks are only served from poll()/flush(); polling without
            # blocking lets fail_fast abort mid-harvest instead of only at
            # the final flush().
            producer.poll(0)
        # Wait for all outstanding messages before returning.
        producer.flush()