Code Example #1
File: views.py Project: Huchikoma/internship_MMQ
def getRecord(request):
    """
    POST http://localhost/oai_pmh/api/rest/getrecord
    POST data query='{"url":"value", "identifier":"value", "metadataprefix":"value"}'
    """
    try:
        serializer = GetRecordSerializer(data=request.DATA)
        if serializer.is_valid():
            url = request.DATA['url']
            identifier = request.DATA['identifier']
            metadataprefix = request.DATA['metadataprefix']
            sickle = Sickle(url)
            grResponse = sickle.GetRecord(metadataPrefix=metadataprefix, identifier=identifier)
            record = Record(grResponse.xml)
            rtn = []
            rtn.append({"identifier": record.header.identifier,
                      "datestamp": record.header.datestamp,
                      "deleted": record.deleted,
                      "sets": record.header.setSpecs,
                      "metadataPrefix": metadataprefix,
                      "metadata": etree.tostring(record.xml.find('.//' + '{http://www.openarchives.org/OAI/2.0/}' +
                                                                 'metadata/')) if not record.deleted else None,
                      "raw": record.raw})

            serializer = RecordSerializer(rtn)
            return Response(serializer.data, status=status.HTTP_200_OK)
        else:
            raise OAIAPISerializeLabelledException(errors=serializer.errors, status=status.HTTP_400_BAD_REQUEST)
    except OAIAPIException as e:
        return e.response()
    except Exception as e:
        content = APIMessage.getMessageLabelled('An error occurred when attempting to retrieve record. %s' % e)
        return Response(content, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
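A minimal client-side call against this view might look like the sketch below; the URL path is taken from the docstring, while the payload shape (flat fields vs. a JSON-encoded "query" string) depends on GetRecordSerializer and is an assumption here, as are the endpoint and identifier values.

import requests

payload = {
    "url": "https://repo.example.org/oai2d",       # hypothetical OAI base URL
    "identifier": "oai:example.org:record-1",      # hypothetical identifier
    "metadataprefix": "oai_dc",
}
resp = requests.post("http://localhost/oai_pmh/api/rest/getrecord", data=payload)
print(resp.status_code, resp.json())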
Code Example #2
File: utils.py Project: llcit/llt
def get_bitstream_url(collection, record_in):
    """ Harvests an href pointing to the bitstream urls for the record in repository.
    E.g., https://scholarspace.manoa.hawaii.edu/bitstream/10125/25006/1/editor.pdf
    """

    sickle = Sickle(collection.community.repository.base_url)
    sickle.class_mapping['GetRecord'] = LltRecordBitstream
    record = sickle.GetRecord(metadataPrefix='ore',
                              identifier=record_in.header.identifier)

    bitstreams = {'bitstream': None, 'bitstream_txt': None}

    try:
        bitstreams['bitstream'] = record.metadata['bitstream']
    except Exception as e:
        print(e, 'Unable to construct bitstream url for',
              record_in.header.identifier)

    try:
        bitstreams['bitstream_txt'] = record.metadata['bitstream_txt'][
            0].replace('+', '%20')
    except Exception as e:
        print(e, 'Unable to construct bitstream_txt url for',
              record_in.header.identifier)

    return bitstreams
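LltRecordBitstream is defined elsewhere in the llt project; as a rough, hedged sketch, a Sickle Record subclass registered via class_mapping could pull bitstream hrefs out of the ORE/Atom payload along these lines (the Atom namespace and the '/bitstream/' filter are assumptions, and the class name is illustrative):

from sickle.models import Record

class BitstreamRecord(Record):
    # Sketch only: collect hrefs that look like DSpace bitstream links.
    def __init__(self, record_element, strip_ns=True):
        super(BitstreamRecord, self).__init__(record_element, strip_ns=strip_ns)
        if not self.deleted:
            links = self.xml.findall('.//{http://www.w3.org/2005/Atom}link')
            self.metadata['bitstream'] = [
                link.get('href') for link in links
                if '/bitstream/' in (link.get('href') or '')
            ]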
Code Example #3
def get_records(identifiers, metadata_prefix=None, url=None, name=None,
                encoding=None):
    """Harvest specific records from an OAI repo via OAI-PMH identifiers.

    :param metadata_prefix: The prefix for the metadata return
                            (defaults to 'oai_dc').
    :param identifiers: list of unique identifiers for records to be harvested.
    :param url: The URL used to create the endpoint.
    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param encoding: Override the encoding returned by the server. ISO-8859-1
                     if it is not provided by the server.
    :return: request object, list of harvested records
    """
    if name:
        url, _metadata_prefix, _, __ = get_info_by_oai_name(name)

        # In case we provide a prefix, we don't want it to be
        # overwritten by the one we get from the name variable.
        if metadata_prefix is None:
            metadata_prefix = _metadata_prefix
    elif not url:
        raise NameOrUrlMissing(
            "Retry using the parameters -n <name> or -u <url>."
        )

    request = Sickle(url, encoding=encoding)
    records = []
    for identifier in identifiers:
        arguments = {
            'identifier': identifier,
            'metadataPrefix': metadata_prefix or "oai_dc"
        }
        records.append(request.GetRecord(**arguments))
    return request, records
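A hedged usage sketch; the endpoint URL and identifier below are placeholders, not values taken from the source:

request, records = get_records(
    identifiers=["oai:example.org:record-1"],      # hypothetical identifier
    metadata_prefix="oai_dc",
    url="https://repo.example.org/oai2d",          # hypothetical OAI endpoint
)
for record in records:
    print(record.header.identifier, record.header.datestamp)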
Code Example #4
def get_record_metadata(repository_url, identifier):
    sickle = Sickle(repository_url)
    rec = sickle.GetRecord(
        identifier=identifier,
        metadataPrefix='oai_dc'
    )
    return rec.metadata
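With the oai_dc prefix, Sickle exposes record.metadata as a dict mapping Dublin Core field names to lists of values, so the helper can be used roughly like this (URL and identifier are placeholders):

metadata = get_record_metadata(
    "https://repo.example.org/oai2d",              # hypothetical repository URL
    "oai:example.org:record-1",                    # hypothetical identifier
)
print(metadata.get("title"), metadata.get("creator"))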
Code Example #5
    def parse_single(self, response):
        sickle = Sickle(self.url)
        params = {
            'metadataPrefix': self.format,
            'identifier': response.meta['identifier'],
        }
        record = sickle.GetRecord(**params)
        self._crawled_records[params['identifier']] = record
        response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
        selector = Selector(response, type='xml')
        return self.parse_record(selector)
Code Example #6
def get_xml_1(oai_identifier):
    sickle = Sickle("https://dspace.cuni.cz/oai/nusl")
    record = sickle.GetRecord(metadataPrefix="xoai", identifier=oai_identifier)
    file_directory = Path(__file__).parent
    target_directory = file_directory / ".." / "tests" / "data"
    oai_identifier_array = oai_identifier.split(":")
    oai_identifier_fixed = oai_identifier_array[-1]
    oai_identifier_fixed = oai_identifier_fixed.replace(".", "_")
    oai_identifier_fixed = oai_identifier_fixed.replace("/", "-")
    filename = str(target_directory / f"{oai_identifier_fixed}.xml")
    with open(filename, "w+") as f:
        f.write(record.raw)
    print(filename, "created")
Code Example #7
File: cli.py Project: unm-art/oai-pmh-harvester
def harvest(host, from_date, until, format, out, set, verbose):
    counter = 0

    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    logging.info("OAI-PMH harvesting from %s", host)
    logging.info("From date = %s", from_date)
    logging.info("Until date = %s", until)
    logging.info("Metadata format = %s", format)
    logging.info("Outfile = %s", out)

    mysickle = Sickle(host, iterator=OAIItemIterator)
    params = {'metadataPrefix': format, 'from': from_date, 'until': until}
    if set is not None:
        params['set'] = set
    try:
        responses = mysickle.ListIdentifiers(**params)
    except NoRecordsMatch:
        logging.info("No records harvested: the combination of the values of "
                     "the arguments results in an empty list.")
        sys.exit()

    identifier_list = []

    for records in responses:
        identifier_list.append(records.identifier)

    logging.info(f"Identifier count to harvest: {len(identifier_list)}")

    with open(out, 'wb') as f:
        f.write('<records>'.encode())

        for identifier in identifier_list:
            r = mysickle.GetRecord(identifier=identifier,
                                   metadataPrefix=format)
            f.write(r.raw.encode('utf8'))
            logging.debug(counter)
            logging.debug(r.raw)
            counter += 1

        f.write('</records>'.encode())

    logging.info("Total records harvested: %i", counter)
Code Example #8
def oai_get_record(id,
                   name,
                   transformation,
                   record_cls,
                   access_token=None,
                   identifier=None,
                   dbcommit=False,
                   reindex=False,
                   test_md5=False,
                   verbose=False,
                   debug=False,
                   **kwargs):
    """Get record from an OAI repo.

    :param identifier: identifier of record.
    """
    url, metadata_prefix, lastrun, setspecs = get_info_by_oai_name(name)

    request = Sickle(url)

    params = {}
    if access_token:
        params['accessToken'] = access_token

    params['metadataPrefix'] = metadata_prefix
    params['identifier'] = f'{identifier}{id}'
    try:
        record = request.GetRecord(**params)
    except Exception as err:
        if debug:
            raise Exception(err)
        return None
    records = parse_xml_to_array(StringIO(record.raw))
    trans_record = transformation(records[0]).json
    if verbose:
        click.echo(f'OAI-{name} get: {id}')
    return trans_record
Code Example #9
File: test_harvesting.py Project: titabo2k/sickle
class TestCaseWrongEncoding(unittest.TestCase):
    def __init__(self, methodName='runTest'):
        super(TestCaseWrongEncoding, self).__init__(methodName)
        self.patch = mock.patch('sickle.app.requests.get', mock_get)

    def setUp(self):
        self.patch.start()
        self.sickle = Sickle('http://localhost')

    def tearDown(self):
        self.patch.stop()

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)
        self.assertIn(u'某人', record.metadata['creator'])
Code Example #10
class OAISynchronizer:
    """

    """
    def __init__(self,
                 name,
                 provider_code,
                 oai_endpoint,
                 metadata_prefix,
                 set_,
                 constant_fields: dict = None,
                 parser: Callable = None,
                 transformer=None,
                 endpoints=None,
                 default_endpoint: str = "recid",
                 endpoint_mapping=None,
                 pid_field=None,
                 from_: str = None,
                 endpoint_handler: dict = None,
                 bulk: bool = True,
                 pre_processors: dict = None,
                 post_processors: dict = None,
                 index: str = None):

        # Counters
        self.only_fetch = False
        self.deleted = 0
        self.created = 0
        self.modified = 0

        if endpoint_mapping is None:  # pragma: no cover
            endpoint_mapping = {}
        if pid_field is None:
            self.pid_field = current_app.config.get('PIDSTORE_RECID_FIELD',
                                                    "recid")
        else:  # pragma: no cover
            self.pid_field = pid_field
        self.name = name
        self.provider_code = provider_code
        self.metadata_prefix = metadata_prefix
        self.oai_endpoint = oai_endpoint
        self.oai_sync = None
        self.sickle = Sickle(self.oai_endpoint)
        self.parser = parser
        self.transformer = transformer
        self.endpoints = endpoints
        self.default_endpoint = default_endpoint
        self.endpoint_mapping = endpoint_mapping
        self.set_ = set_
        if constant_fields:
            self.constant_fields = constant_fields
        else:
            self.constant_fields = {}
        self._from = None
        if from_:
            self.from_ = from_
        self.endpoint_handler = endpoint_handler
        self.bulk = bulk
        self.pre_processors = pre_processors
        self.post_processors = post_processors
        self.overwrite = False
        self.es_client = current_search_client
        self._index = index

    @property
    def index(self):
        if self._index:
            _index = self._index
        else:
            _index = f"{self.provider_code}_{self.metadata_prefix}"
        if not self.es_client.indices.exists(_index):  # pragma: no cover
            current_search_client.indices.create(index=_index,
                                                 ignore=400,
                                                 body={})
        return _index

    @property
    def from_(self):
        return self._from

    @from_.setter
    def from_(self, value):
        if value == "latest":
            last_sync = OAISync.query.order_by(OAISync.id.desc()).first()
            if last_sync:
                self._from = arrow.get(last_sync)
        elif value is not None:
            if isinstance(value, arrow.Arrow):
                self._from = value
            else:
                self._from = arrow.get(value)
        else:
            self._from = None

    def run(self,
            start_oai: str = None,
            start_id: int = 0,
            break_on_error: bool = True,
            oai_id: Union[str, List[str]] = None,
            overwrite: bool = False,
            only_fetch: bool = False,
            index: str = None):
        """

        :return:
        :rtype:
        """
        if index:
            self._index = index
        self.only_fetch = only_fetch
        self.overwrite = overwrite
        self.restart_counters()
        with db.session.begin_nested():
            self.oai_sync = OAISync(
                provider_code=self.provider_code,
                synchronizer_code=self.name,
                sync_start=arrow.utcnow().datetime,  # datetime.datetime.utcnow()
                status="active",
                purpose="fetch" if only_fetch else "sync")
            db.session.add(self.oai_sync)
        db.session.commit()
        try:
            if oai_id:
                if isinstance(oai_id, str):
                    oai_ids = [oai_id]
                elif isinstance(oai_id, list):
                    oai_ids = oai_id
                else:  # pragma: no cover
                    raise Exception(
                        "OAI identifier must be string or list of strings")
                self.synchronize(identifiers=oai_ids,
                                 break_on_error=break_on_error)
                self.update_oai_sync("ok")
            else:
                self.synchronize(start_oai=start_oai,
                                 start_id=start_id,
                                 break_on_error=break_on_error)
                self.update_oai_sync("ok")
        except:
            self.update_oai_sync("failed")
            raise
        finally:
            db.session.commit()

    def update_oai_sync(self, status):
        with db.session.begin_nested():
            # self.oai_sync = db.session.merge(self.oai_sync)
            self.oai_sync.status = status
            self.oai_sync.sync_end = arrow.utcnow().datetime  # datetime.datetime.utcnow()
            self.oai_sync.records_modified = self.modified
            self.oai_sync.records_created = self.created
            self.oai_sync.records_deleted = self.deleted
            if status == "failed":
                self.oai_sync.logs = traceback.format_exc()
            db.session.add(self.oai_sync)
        db.session.commit()

    def synchronize(self,
                    identifiers=None,
                    start_oai: str = None,
                    start_id: int = 0,
                    break_on_error: bool = True):  # pragma: no cover
        """

        :return:
        :rtype:
        """
        logger.info(
            f"OAI harvester on endpoint: {self.oai_endpoint} has started!")

        if not self.bulk:
            identifiers = self._get_identifiers(identifiers, start_id)
            for idx, identifier in enumerate(identifiers, start=start_id):
                self.record_handling(idx, start_oai, break_on_error,
                                     identifier)
        else:
            records = self._get_records_iterator(start_id,
                                                 list_identifiers=identifiers)
            print("Waiting for server...")
            t0 = datetime.now()
            for idx, record in enumerate(records, start=start_id):
                logger.debug(f"Time for record: {datetime.now()-t0}")
                t0 = datetime.now()
                self.record_handling(idx,
                                     start_oai,
                                     break_on_error,
                                     xml=record.xml)
                dt = datetime.now() - t0
                logger.debug(f"Time for record_handling: {dt}")

    def _get_records_iterator(self,
                              start_id: int = 0,
                              list_identifiers: List[str] = None):
        if self.from_:
            records = self.sickle.ListRecords(
                **{
                    "metadataPrefix": self.metadata_prefix,
                    "set": self.set_,
                    "from": self.from_.format("YYYY-MM-DD")
                })
        else:
            records = self.sickle.ListRecords(
                metadataPrefix=self.metadata_prefix, set=self.set_)
        if list_identifiers:
            return self.record_filter_generator(records, list_identifiers)
        else:
            return islice(records, start_id, None)

    def record_filter_generator(self, iterator, identifiers_list):
        for record in iterator:
            if record.header.identifier in identifiers_list:
                yield record

    def record_handling(self,
                        idx,
                        start_oai: str = None,
                        break_on_error: bool = True,
                        identifier: Header = None,
                        xml: _Element = None,
                        only_fetch: bool = None):
        if not only_fetch:
            only_fetch = self.only_fetch
        if not (identifier or xml):  # pragma: no cover
            raise Exception("Must provide header or xml")
        if identifier and xml:  # pragma: no cover
            raise Exception("You must provide only header or xml")
        if identifier:
            datestamp, deleted, oai_identifier = get_oai_header_data(
                identifier)
        else:
            datestamp, deleted, oai_identifier = get_oai_header_data(xml=xml)
        logger.info(f"{idx}. Record, OAI ID: '{oai_identifier}'")
        oai_rec = OAIRecord.get_record(oai_identifier)
        if not start_oai or oai_identifier == start_oai:  # pragma: no cover
            # TODO: resolve (start_oai is not implemented)
            collect = True
        else:
            collect = False
        if not collect:  # pragma: no cover
            return
        try:
            self.record_crud(oai_rec,
                             timestamp=datestamp,
                             deleted=deleted,
                             idx=idx,
                             oai_identifier=oai_identifier,
                             xml=xml,
                             only_fetch=only_fetch)
        except Exception:  # pragma: no cover
            self.exception_handler(oai_identifier)
            if break_on_error:
                raise
            return

    def exception_handler(self, oai_identifier):
        exc = traceback.format_exc()
        print(exc, "\n\n\n")
        oai_exc = OAIRecordExc.query.filter_by(
            oai_identifier=oai_identifier,
            oai_sync_id=self.oai_sync.id).one_or_none()
        if not oai_exc:
            oai_exc = OAIRecordExc(oai_identifier=oai_identifier,
                                   traceback=exc,
                                   oai_sync_id=self.oai_sync.id)
            db.session.add(oai_exc)
        else:
            oai_exc.traceback = exc
        db.session.commit()

    def record_crud(self,
                    oai_rec: OAIRecord = None,
                    oai_identifier: str = None,
                    timestamp: str = arrow.utcnow().isoformat(),
                    deleted: bool = False,
                    xml: _Element = None,
                    idx: int = 0,
                    only_fetch: bool = False):
        if not (oai_rec or oai_identifier):
            raise Exception("You have to provide oai_rec or oai_identifier")
        if not oai_identifier:
            oai_identifier = oai_rec.oai_identifier
        if only_fetch:
            if deleted:
                self.delete_es(oai_identifier)
            else:
                self.create_or_update_es(oai_identifier, xml=xml)
        else:
            if deleted:
                self._delete(oai_rec)
            else:
                try:
                    self.create_or_update(oai_identifier,
                                          timestamp,
                                          oai_rec=oai_rec,
                                          xml=xml)
                except IdDoesNotExist:  # pragma: no cover
                    self._delete(oai_rec)
            if idx % 100:
                db.session.commit()

    def _get_identifiers(self, identifiers=None, start_id: int = 0):
        if identifiers is None:
            identifiers = self._get_oai_identifiers()
        else:
            identifiers = self._get_oai_identifiers(
                identifiers_list=identifiers)
        identifiers = islice(identifiers, start_id, None)
        return identifiers

    def _delete(self, oai_rec):
        if not oai_rec:
            return
        self.delete_record(oai_rec)
        self.deleted += 1
        logger.info(
            f"Identifier '{oai_rec.oai_identifier}' has been marked as deleted"
        )

    def _get_oai_identifiers(self,
                             sickle=None,
                             metadata_prefix=None,
                             set_=None,
                             identifiers_list: List[str] = None,
                             from_: arrow.Arrow = None):
        if identifiers_list:
            return [
                self.sickle.GetRecord(
                    identifier=identifier,
                    metadataPrefix=self.metadata_prefix).header
                for identifier in identifiers_list
            ]
        if not sickle:
            sickle = self.sickle
        if not metadata_prefix:
            metadata_prefix = self.metadata_prefix
        if not set_:
            set_ = self.set_
        if not from_:
            if self.from_:
                from_ = self.from_
            else:
                return sickle.ListIdentifiers(metadataPrefix=metadata_prefix,
                                              set=set_)
        return sickle.ListIdentifiers(
            **{
                "metadataPrefix": metadata_prefix,
                "set": set_,
                "from": from_.format("YYYY-MM-DD")
            })

    def create_or_update(self,
                         oai_identifier,
                         datestamp: str,
                         oai_rec=None,
                         xml: _Element = None):
        if oai_rec:
            if not self.overwrite:
                our_datestamp = arrow.get(oai_rec.timestamp)
                oai_record_datestamp = arrow.get(datestamp)
                if our_datestamp >= oai_record_datestamp:
                    logger.info(
                        f'Record with oai_identifier "{oai_identifier}" already exists'
                    )
                    return
        if not xml:
            xml = self.get_xml(oai_identifier)
        parsed = self.parse(xml)
        if self.pre_processors:
            for processor in self.pre_processors:
                parsed = processor(parsed)
        transformed = self.transform(parsed)
        if self.post_processors:
            for processor in self.post_processors:
                transformed = processor(transformed)
        transformed.update(self.constant_fields)

        if oai_rec is None:
            record, pid = self.create_record(transformed)
            oai_rec = OAIRecord(
                id=record.id,
                # oai_identifier=oai_identifier,
                creation_sync_id=self.oai_sync.id,
                pid=pid.pid_value)
            oai_identifier = OAIIdentifier(oai_record_id=oai_rec.id,
                                           oai_identifier=oai_identifier)
            self.created += 1
            db.session.add(oai_rec)
            oai_rec.oai_identifiers.append(oai_identifier)
            logger.info(
                f"Identifier '{oai_identifier}' has been created and '{record.id}' has been "
                f"assigned as a UUID")
        else:
            record = self.update_record(oai_rec, transformed)
            self.modified += 1
            oai_rec.modification_sync_id = self.oai_sync.id
            logger.info(
                f"Identifier '{oai_identifier}' has been updated (UUID: {record.id})"
            )
        oai_rec.last_sync_id = self.oai_sync.id
        oai_rec.timestamp = arrow.get(datestamp).datetime
        return record

    def create_or_update_es(self,
                            oai_identifier,
                            xml: _Element = None,
                            index: str = None):
        if not index:
            index = self.index
        if not xml:
            xml = self.get_xml(oai_identifier)
        parsed = transform_to_dict(self.parse(xml))

        try:
            es_record = self.es_client.get(id=oai_identifier, index=index)
        except NotFoundError:
            es_record = None

        if es_record is None:
            self.es_client.create(index, oai_identifier, parsed)
            logger.info(f'Record {oai_identifier} was created in ES')
        else:
            self.es_client.update(index=index,
                                  id=oai_identifier,
                                  body={"doc": parsed})
            logger.info(f'Record {oai_identifier} was updated in ES')
        return parsed

    def transform(self, parsed, handler=None):
        if not handler:
            handler = self.transformer.transform
        return handler(parsed)

    def get_xml(self, oai_identifier, retry=True):
        try:
            original_record = self.sickle.GetRecord(
                identifier=oai_identifier, metadataPrefix=self.metadata_prefix)
        except HTTPError:
            if retry:
                time.sleep(1)
                original_record = self.sickle.GetRecord(
                    identifier=oai_identifier,
                    metadataPrefix=self.metadata_prefix)
            else:
                raise

        return original_record.xml

    def parse(self, xml_etree, parser=None):
        if not parser or not callable(parser):
            if self.parser:
                parser = self.parser
            if parser is None:
                raise ParserNotFoundError(
                    "No parser specified, please check entry points and parser designation by "
                    "decorator @Decorators.parser or specify parser as function parameter."
                )
        return parser(xml_etree)

    def create_record(self, data):
        endpoint_config = self.get_endpoint_config(data)
        minter = self.get_minter(data, endpoint_config=endpoint_config)
        record_class = self.get_record_class(data,
                                             endpoint_config=endpoint_config)
        indexer_class = self.get_indexer_class(data,
                                               endpoint_config=endpoint_config)

        # Create uuid for record
        record_uuid = uuid.uuid4()
        # Create persistent identifier
        pid = minter(record_uuid, data=data)
        # Create record
        try:
            record = record_class.create(data, id_=pid.object_uuid)
        except:
            db.session.rollback()
            raise
        else:
            db.session.commit()

        # Index the record
        if indexer_class:
            indexer_class().index(record)

        return record, pid

    def update_record(self, oai_rec, data):
        endpoint_config = self.get_endpoint_config(data)
        indexer_class = self.get_indexer_class(data,
                                               endpoint_config=endpoint_config)
        record_class = self.get_record_class(data,
                                             endpoint_config=endpoint_config)
        fetcher = self.get_fetcher(data)
        try:
            record = record_class.get_record(oai_rec.id)
        except NoResultFound:
            record = record_class.get_record(oai_rec.id, with_deleted=True)
            record.revert(-2)
            record.update(record.model.json)
        fetched_pid = fetcher(oai_rec.id, dict(record))
        record.clear()
        record.update(data)
        record[self.pid_field] = fetched_pid.pid_value
        record.commit()
        db.session.commit()
        if indexer_class:
            indexer_class().index(record)
        return record

    def delete_record(self, oai_rec):
        if not oai_rec:
            return
        indexer_class = self.get_indexer_class()

        record = Record.get_record(oai_rec.id)
        record.delete()
        # TODO: decide how to handle PIDs
        # # mark all PIDs as DELETED
        # all_pids = PersistentIdentifier.query.filter(
        #     PersistentIdentifier.object_uuid == record.id,
        # ).all()
        # for rec_pid in all_pids:
        #     if not rec_pid.is_deleted():
        #         rec_pid.delete()

        db.session.commit()
        if indexer_class:
            indexer_class().delete(record)

    def get_endpoint_config(self, data):
        endpoint_name = None
        if not data:
            data = {}
        if self.endpoint_mapping:
            endpoint_name = self.endpoint_mapping["mapping"].get(
                data.get(self.endpoint_mapping["field_name"]))
        if not endpoint_name and self.endpoint_handler:
            provider = self.endpoint_handler.get(self.provider_code)
            if provider:
                handler = provider.get(self.metadata_prefix)
                if handler:
                    endpoint_name = handler(data)
        draft_configs = current_app.config.get("RECORDS_DRAFT_ENDPOINTS")
        if draft_configs:
            draft_endpoint_config = draft_configs.get(endpoint_name)
            if draft_endpoint_config:
                draft_endpoint_name = draft_endpoint_config.get("draft")
                if draft_endpoint_name:
                    endpoint_name = draft_endpoint_name
        endpoint_config = self.endpoints.get(
            endpoint_name) or self.endpoints.get(self.default_endpoint)
        return endpoint_config

    def get_minter(self, data=None, endpoint_config=None):
        if not endpoint_config:
            endpoint_config = self.get_endpoint_config(data)
        minter_name = endpoint_config["pid_minter"]
        return current_pidstore.minters.get(minter_name)

    def get_fetcher(self, data=None, endpoint_config=None):
        if not endpoint_config:
            endpoint_config = self.get_endpoint_config(data)
        fetcher_name = endpoint_config["pid_fetcher"]
        return current_pidstore.fetchers.get(fetcher_name)

    def get_record_class(self, data=None, endpoint_config=None):
        if not endpoint_config:
            endpoint_config = self.get_endpoint_config(data)
        record_class = endpoint_config["record_class"]
        return obj_or_import_string(record_class)

    def get_indexer_class(self, data=None, endpoint_config=None):
        if not endpoint_config:
            endpoint_config = self.get_endpoint_config(data)
        indexer_class = endpoint_config.get(
            "indexer_class", 'invenio_indexer.api.RecordIndexer')
        return obj_or_import_string(indexer_class)

    def restart_counters(self):
        self.deleted = 0
        self.created = 0
        self.modified = 0

    def delete_es(self, oai_identifier):
        try:
            self.es_client.get(id=oai_identifier, index=self.index)
            self.es_client.delete(index=self.index, id=oai_identifier)
        except NotFoundError:
            pass
Code Example #11
File: test_harvesting.py Project: tulibraries/sickle
class TestCase(unittest.TestCase):
    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        self.patch.start()
        self.sickle = Sickle('http://localhost')

    def tearDown(self):
        self.patch.stop()

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       resumptionToken='ListRecordsBroken.xml')
        self.assertEqual(response.xml, None)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        assert len([r for r in records]) == 8

    def test_ListRecords_ignore_deleted(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
        num_records = len([r for r in records])
        assert num_records == 4

    def test_ListSets(self):
        set_iterator = self.sickle.ListSets()
        sets = [s for s in set_iterator]
        self.assertEqual(131, len(sets))
        dict(sets[0])

    def test_ListMetadataFormats(self):
        mdf_iterator = self.sickle.ListMetadataFormats()
        mdfs = [mdf for mdf in mdf_iterator]
        self.assertEqual(5, len(mdfs))
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        self.assertEqual(len([r for r in records]), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                              ignore_deleted=True)
        # There are 2 deleted headers in the test data
        num_records = len([r for r in records])
        self.assertEqual(num_records, 2)

    def test_Identify(self):
        identify = self.sickle.Identify()
        self.assertTrue(hasattr(identify, 'repositoryName'))
        self.assertTrue(hasattr(identify, 'baseURL'))
        self.assertTrue(hasattr(identify, 'adminEmail'))
        self.assertTrue(hasattr(identify, 'earliestDatestamp'))
        self.assertTrue(hasattr(identify, 'deletedRecord'))
        self.assertTrue(hasattr(identify, 'granularity'))
        self.assertTrue(hasattr(identify, 'description'))
        self.assertTrue(hasattr(identify, 'oai_identifier'))
        self.assertTrue(hasattr(identify, 'sampleIdentifier'))
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc', error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(metadataPrefix='oai_dc', error='idDoesNotExist')

    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(metadataPrefix='oai_dc', error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='undefinedError')

    def test_OAIResponseIterator(self):
        sickle = Sickle('fake_url', iterator=OAIResponseIterator)
        records = [r for r in sickle.ListRecords(metadataPrefix='oai_dc')]
        self.assertEqual(len(records), 4)
Code Example #12
    def harvest(self, request):
        ## harvest (Harvester object, request = [community, source, verb, mdprefix, mdsubset])
        # Harvest all records with <mdprefix> and <mdsubset> from <source> via the sickle module and store them on disk.
        #
        # Parameters:
        # -----------
        # (list)  request - A list with following items:
        #                    1. community
        #                    2. source (OAI URL)
        #                    3. verb (ListIdentifiers, ListRecords or JSONAPI)
        #                    4. mdprefix (OAI md format as oai_dc, iso etc.)
        #                    5. mdsubset
        #
        # Return Values:
        # --------------
        # 1. (integer)  is -1 if something went wrong
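        #
        # Hypothetical request for illustration (values are assumptions, not project defaults):
        #   ['mycommunity', 'https://repo.example.org/oai2d', 'ListIdentifiers', 'oai_dc', 'myset']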

        # create a request dictionary:
        req = {
            "community": request[0],
            "url": request[1],
            "lverb": request[2],
            "mdprefix": request[3],
            "mdsubset": request[4] if len(request) > 4 else None
        }

        # create dictionary with stats:
        resKeys = ['count', 'tcount', 'ecount', 'time']
        results = dict.fromkeys(resKeys, 0)

        stats = {
            "tottcount": 0,  # total number of provided datasets
            "totcount": 0,  # total number of successful harvested datasets
            "totecount": 0,  # total number of failed datasets
            "totdcount": 0,  # total number of all deleted datasets
            "tcount": 0,  # number of all provided datasets per subset
            "count":
            0,  # number of all successful harvested datasets per subset
            "ecount": 0,  # number of all failed datasets per subset
            "dcount": 0,  # number of all deleted datasets per subset
            "timestart": time.time(),  # start time per subset process
        }

        # the gbif api client
        class GBIF_CLIENT(object):

            # call action api:
            ## GBIF.action('package_list',{})

            def __init__(self, api_url):  ##, api_key):
                self.api_url = api_url
                self.logger = logging.getLogger('root')

            def JSONAPI(self, action, offset, chunklen, key):
                ## JSONAPI (action) - method
                return self.__action_api(action, offset, chunklen, key)

            def __action_api(self, action, offset, chunklen, key):
                # Make the HTTP request for get datasets from GBIF portal
                response = ''
                rvalue = 0
                ## offset = 0
                limit = chunklen  ## None for DataCite-JSON !!!
                api_url = self.api_url
                if key:
                    action_url = "{apiurl}/{action}/{key}".format(
                        apiurl=api_url, action=action, key=str(key))
                elif offset == None:
                    action_url = "{apiurl}/{action}".format(apiurl=api_url,
                                                            action=action)
                else:
                    action_url = "{apiurl}/{action}?offset={offset}&limit={limit}".format(
                        apiurl=api_url,
                        action=action,
                        offset=str(offset),
                        limit=str(limit))

                self.logger.debug('action_url: %s' % action_url)
                try:
                    request = Request(action_url)
                    response = urlopen(request)
                except HTTPError as e:
                    self.logger.error(
                        '%s : The server %s couldn\'t fulfill the action %s.' %
                        (e.code, self.api_url, action))
                    if (e.code == 403):
                        self.logger.critical(
                            'Access forbidden, maybe the API key is not valid?'
                        )
                        exit(e.code)
                    elif (e.code == 409):
                        self.logger.critical(
                            'Maybe you have a parameter error?')
                        return {"success": False}
                    elif (e.code == 500):
                        self.logger.critical('Internal server error')
                        exit(e.code)
                except URLError as e:
                    exit('%s' % e.reason)
                else:
                    out = json.loads(response.read())
                    assert response.code >= 200
                    return out

        requests_log = logging.getLogger("requests")
        requests_log.setLevel(logging.WARNING)

        # if the number of files in a subset dir is greater than <count_break>
        # then create a new one with the name <set> + '_' + <count_set>
        count_break = 5000
        count_set = 1
        start = time.time()

        # set subset:
        mdsubset = req["mdsubset"]
        if (not mdsubset):
            subset = 'SET'
        elif mdsubset.endswith('_'):  # no OAI subsets, but different OAI-URLs for same community
            subset = mdsubset[:-1]
            mdsubset = None
        elif len(mdsubset) > 2 and mdsubset[-1].isdigit() and mdsubset[-2] == '_':
            subset = mdsubset[:-2]
        else:
            subset = mdsubset
            if req["community"] == "b2share" or re.match(
                    r'http(.*?)b2share(.*?)api(.*?)', req["url"]):
                setMapFile = '%s/mapfiles/b2share_mapset.json' % (os.getcwd())
            elif req["community"] == "dara" and req["url"] == "https://www.da-ra.de/oaip/oai":
                setMapFile = '%s/mapfiles/dara_mapset.json' % (os.getcwd())
            else:
                setMapFile = None
            if setMapFile:
                with open(setMapFile) as sm:
                    setMap = json.load(sm)
                    if mdsubset in setMap:
                        mdsubset = setMap[mdsubset]

        if (self.fromdate):
            subset = subset + '_f' + self.fromdate

        self.logger.debug(' |- Subset:    \t%s' % subset)

        # make subset dir:
        subsetdir = '/'.join([
            self.base_outdir, req['community'] + '-' + req['mdprefix'],
            subset + '_' + str(count_set)
        ])

        noffs = 0  # set to number of record, where harvesting should start
        stats['tcount'] = noffs
        fcount = 0
        oldperc = 0
        ntotrecs = 0
        choffset = 0
        chunklen = 1000
        pageno = 1
        records = list()

        ## JSON-API
        jsonapi_verbs = ['dataset', 'works', 'records']
        if req["lverb"] in jsonapi_verbs:
            GBIF = GBIF_CLIENT(req['url'])  # create GBIF object
            harvestreq = getattr(GBIF, 'JSONAPI', None)
            outtypedir = 'hjson'
            outtypeext = 'json'
            if mdsubset and req["lverb"] == 'works':
                haction = 'works?publisher-id=' + mdsubset
                dresultkey = 'data'
            elif req["lverb"] == 'records':
                haction = req["lverb"]
                if mdsubset:
                    haction += '?q=community:' + mdsubset + '&size=' + str(
                        chunklen) + '&page=' + str(pageno)
                dresultkey = 'hits'
            else:
                haction = req["lverb"]
                dresultkey = 'results'
            try:
                chunk = harvestreq(
                    **{
                        'action': haction,
                        'offset': None,
                        'chunklen': chunklen,
                        'key': None
                    })
                self.logger.debug(" Got first %d records : chunk['data'] %s " %
                                  (chunklen, chunk[dresultkey]))
            except (HTTPError, ConnectionError, Exception) as e:
                self.logger.critical(
                    "%s :\n\thaction %s\n\tharvest request %s\n" %
                    (e, haction, req))
                return -1

            if req["lverb"] == 'dataset':
                while ('endOfRecords' in chunk and not chunk['endOfRecords']):
                    if 'results' in chunk:
                        records.extend(chunk['results'])
                    choffset += chunklen
                    chunk = harvestreq(
                        **{
                            'action': haction,
                            'offset': choffset,
                            'chunklen': chunklen,
                            'key': None
                        })
                    self.logger.debug(
                        " Got next records [%d,%d] from chunk %s " %
                        (choffset, choffset + chunklen, chunk))
            elif req["lverb"] == 'records':
                records.extend(chunk['hits']['hits'])
                while ('hits' in chunk and 'next' in chunk['links']):
                    if 'hits' in chunk:
                        records.extend(chunk['hits']['hits'])
                    pageno += 1
                    chunk = harvestreq(
                        **{
                            'action': haction,
                            'page': pageno,
                            'size': chunklen,
                            'key': None
                        })
                    self.logger.debug(
                        " Got next records [%d,%d] from chunk %s " %
                        (choffset, choffset + chunklen, chunk))
            else:
                if 'data' in chunk:
                    records.extend(chunk['data'])

        # OAI-PMH (verb = ListRecords/Identifier )
        elif req["lverb"].startswith('List'):
            sickle = Sickle(req['url'], max_retries=3, timeout=300)
            outtypedir = 'xml'
            outtypeext = 'xml'
            harvestreq = getattr(sickle, req["lverb"], None)
            try:
                records, rc = tee(
                    harvestreq(
                        **{
                            'metadataPrefix': req['mdprefix'],
                            'set': mdsubset,
                            'ignore_deleted': True,
                            'from': self.fromdate
                        }))
            except (HTTPError, ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" %
                                     (err, req['url']))
                return -1
            except (ImportError, etree.XMLSyntaxError, CannotDisseminateFormat,
                    Exception) as err:
                self.logger.critical("%s during harvest request %s\n" %
                                     (err, req))
                return -1

        # CSW2.0
        elif req["lverb"].startswith('csw'):
            outtypedir = 'xml'
            outtypeext = 'xml'
            startposition = 0
            maxrecords = 20
            try:
                src = CatalogueServiceWeb(req['url'])
                NS = Namespaces()
                namespaces = NS.get_namespaces()
                if req['mdprefix'] == 'iso19139' or req['mdprefix'] == 'own':
                    nsp = namespaces['gmd']
                else:
                    nsp = namespaces['csw']

                harvestreq = getattr(src, 'getrecords2')
                chunk = harvestreq(
                    **{
                        'esn': 'full',
                        'startposition': choffset,
                        'maxrecords': maxrecords,
                        'outputschema': nsp
                    })
                chunklist = list(src.records.items())
                while (len(chunklist) > 0):
                    records.extend(chunklist)
                    choffset += maxrecords
                    chunk = harvestreq(
                        **{
                            'esn': 'full',
                            'startposition': choffset,
                            'maxrecords': maxrecords,
                            'outputschema': nsp
                        })
                    chunklist = list(src.records.items())
                    self.logger.debug(
                        " Got next %s records [%d,%d] from chunk " %
                        (nsp, choffset, choffset + chunklen))
            except (HTTPError, ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" %
                                     (err, req['url']))
                return -1
            except (ImportError, CannotDisseminateFormat, Exception) as err:
                self.logger.error("%s : During harvest request %s\n" %
                                  (err, req))
                ##return -1

        # SparQL
        elif req["lverb"].startswith('Sparql'):
            outtypedir = 'hjson'
            outtypeext = 'json'
            startposition = 0
            maxrecords = 1000
            try:
                src = SPARQLWrapper(req['url'])
                harvestreq = getattr(src, 'query', 'format')  ##
                statement = '''
prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>
prefix prov: <http://www.w3.org/ns/prov#>
select (str(?submTime) as ?time) ?dobj ?spec ?dataLevel ?fileName ?submitterName where{
  ?dobj cpmeta:hasObjectSpec [rdfs:label ?spec ; cpmeta:hasDataLevel ?dataLevel].
  ?dobj cpmeta:hasName ?fileName .
  ?dobj cpmeta:wasSubmittedBy ?submission .
  ?submission prov:endedAtTime ?submTime .
  ?submission prov:wasAssociatedWith [cpmeta:hasName ?submitterName].
}
order by desc(?submTime)
limit 1000
'''
                src.setQuery(statement)
                src.setReturnFormat(JSON)
                records = harvestreq().convert()['results']['bindings']
            except (HTTPError, ConnectionError) as err:
                self.logger.critical("%s during connecting to %s\n" %
                                     (err, req['url']))
                return -1
            except (ImportError, CannotDisseminateFormat, Exception) as err:
                self.logger.critical("%s during harvest request %s\n" %
                                     (err, req))
                return -1

        else:
            self.logger.critical(' Not supported harvest type %s' %
                                 req["lverb"])
            sys.exit()

        self.logger.debug(" Harvest method used %s" % req["lverb"])
        try:
            if req["lverb"].startswith('List'):
                ntotrecs = len(list(rc))
            else:
                ntotrecs = len(records)
        except Exception as err:
            self.logger.error('%s : Iteration does not work?' % err)

        print("\t|- Retrieved %d records in %d sec - write %s files to disc" %
              (ntotrecs, time.time() - start, outtypeext.upper()))
        if ntotrecs == 0:
            self.logger.warning("\t|- Can not access any records to harvest")
            return -1

        self.logger.debug(' | %-4s | %-25s | %-25s |' %
                          ('#', 'OAI Identifier', 'DS Identifier'))
        start2 = time.time()

        if (not os.path.isdir(subsetdir + '/' + outtypedir)):
            os.makedirs(subsetdir + '/' + outtypedir)

        delete_ids = list()
        # loop over records
        for record in records:
            ## counter and progress bar
            stats['tcount'] += 1
            fcount += 1
            if fcount <= noffs: continue
            if ntotrecs > 0:
                perc = int(fcount * 100 / ntotrecs)
                bartags = int(perc / 5)
                if perc % 10 == 0 and perc != oldperc:
                    oldperc = perc
                    print("\r\t[%-20s] %5d (%3d%%) in %d sec" %
                          ('=' * bartags, fcount, perc, time.time() - start2))
                    sys.stdout.flush()

            # Set oai_id and generate a uniquely identifier for this dataset:
            delete_flag = False
            if req["lverb"] == 'dataset' or req["lverb"] == 'works' or req[
                    "lverb"] == 'records':  ## Harvest via JSON-API
                if 'key' in record:
                    oai_id = record['key']
                elif 'id' in record:
                    oai_id = record['id']

            elif req["lverb"] == 'csw':  ## Harvest via CSW2.0
                if hasattr(record, 'identifier'):
                    oai_id = record.identifier
                elif (record):
                    oai_id = record[0]
                else:
                    self.logger.critical(
                        'Record %s has no attribute identifier %s' % record)

            elif req["lverb"] == 'ListIdentifiers':  ## OAI-PMH harvesting of XML records
                if (record.deleted):
                    stats['totdcount'] += 1
                    delete_flag = True
                    ##HEW-D continue
                else:
                    oai_id = record.identifier
                    record = sickle.GetRecord(
                        **{
                            'metadataPrefix': req['mdprefix'],
                            'identifier': record.identifier
                        })
            elif req["lverb"] == 'ListRecords':
                if (record.header.deleted):
                    stats['totdcount'] += 1
                    continue
                else:
                    oai_id = record.header.identifier
            elif req["lverb"].startswith('Sparql'):
                oai_id = record['fileName']['value']

            # generate a uniquely identifier and a filename for this dataset:
            uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, oai_id))
            outfile = '%s/%s/%s.%s' % (subsetdir, outtypedir,
                                       os.path.basename(uid), outtypeext)

            if delete_flag:  # record marked as deleted on provider site
                jsonfile = '%s/%s/%s.%s' % (subsetdir, 'json',
                                            os.path.basename(uid), 'json')
                # remove xml and json file:
                os.remove(outfile)
                os.remove(jsonfile)
                delete_ids.append(uid)

            # write record on disc
            try:
                self.logger.debug('    | h | %-4d | %-45s | %-45s |' %
                                  (stats['count'] + 1, oai_id, uid))
                self.logger.debug(
                    'Try to write the harvested JSON record to %s' % outfile)

                if outtypeext == 'xml':  # get and write the XML content:
                    if req["lverb"] == 'csw':
                        metadata = etree.fromstring(record[1].xml)
                    elif hasattr(record, 'raw'):
                        metadata = etree.fromstring(record.raw)
                    elif hasattr(record, 'xml'):
                        metadata = etree.fromstring(record.xml)

                    if (metadata is not None):
                        try:
                            metadata = etree.tostring(
                                metadata, pretty_print=True).decode('utf-8')
                        except (Exception, UnicodeEncodeError) as e:
                            self.logger.critical('%s : Metadata: %s ...' %
                                                 (e, metadata[:20]))
                        ##if PY2 :
                        ##    try:
                        ##        metadata = metadata.encode('utf-8')
                        ##    except (Exception,UnicodeEncodeError) as e :
                        ##        self.logger.debug('%s : Metadata : %s ...' % (e,metadata[20]))

                        try:
                            f = open(outfile, 'w')
                            f.write(metadata)
                            f.close()
                        except (Exception, IOError) as err:
                            self.logger.critical(
                                "%s : Cannot write metadata in xml file %s" %
                                (err, outfile))
                            stats['ecount'] += 1
                            continue
                        else:
                            logging.debug('Harvested XML file written to %s' %
                                          outfile)
                            stats['count'] += 1
                    else:
                        stats['ecount'] += 1
                        self.logger.error('No metadata available for %s' %
                                          record)

                elif outtypeext == 'json':  # get the raw json content:
                    if (record is not None):
                        try:
                            with open(outfile, 'w') as f:
                                json.dump(record, f, sort_keys=True, indent=4)
                        except IOError as err:
                            logging.error(
                                "[ERROR] Cannot write metadata in out file '%s': %s\n"
                                % (outfile, err))
                            stats['ecount'] += 1
                            continue
                        else:
                            stats['count'] += 1
                            logging.debug('Harvested JSON file written to %s' %
                                          outfile)
                    else:
                        stats['ecount'] += 1
                        logging.warning(
                            '    [WARNING] No metadata available for %s' %
                            oai_id)

            except TypeError as e:
                logging.error('    [ERROR] TypeError: %s' % e)
                stats['ecount'] += 1
                continue
            except Exception as e:
                logging.error("    [ERROR] %s and %s" %
                              (e, traceback.format_exc()))
                ## logging.debug(metadata)
                stats['ecount'] += 1
                continue

            # Next or last subset?
            if (stats['count'] == count_break) or (fcount == ntotrecs):
                print('       | %d records written to subset directory %s ' %
                      (stats['count'], subsetdir))

                # clean up current subset and write ids to remove to delete file
                for df in os.listdir(subsetdir + '/' + outtypedir):
                    df = os.path.join(subsetdir + '/' + outtypedir, df)
                    logging.debug('File to delete : %s' % df)
                    id = os.path.splitext(os.path.basename(df))[0]
                    jf = os.path.join(subsetdir + '/json/', id + '.json')
                    if os.stat(df).st_mtime < start - 1 * 86400:
                        os.remove(df)
                        logging.warning('File %s is deleted' % df)
                        if os.path.exists(jf):
                            os.remove(jf)
                            logging.warning('File %s is deleted' % jf)
                        delete_ids.append(id)
                        logging.warning('Append Id %s to list delete_ids' % id)
                        stats['dcount'] += 1

                print('       | %d records deleted from subset directory %s ' %
                      (stats['dcount'], subsetdir))

                if fcount != ntotrecs:  # next subset needed
                    subsetdir = self.save_subset(req, stats, subset, count_set)
                    if (not os.path.isdir(subsetdir + '/' + outtypedir)):
                        os.makedirs(subsetdir + '/' + outtypedir)

                    count_set += 1

                # add all subset stats to total stats and reset the temporal subset stats:
                for key in ['tcount', 'ecount', 'count', 'dcount']:
                    stats['tot' + key] += stats[key]
                    stats[key] = 0

                # start with a new time:
                stats['timestart'] = time.time()

                logging.debug(
                    '    | %d records written to subset directory %s (if not failed).'
                    % (stats['count'], subsetdir))

        # path to the file with all ids to delete:
        delete_file = '/'.join([
            self.base_outdir, 'delete',
            req['community'] + '-' + req['mdprefix'] + '.del'
        ])
        if len(delete_ids) > 0:
            with open(delete_file, 'a') as file:
                for id in delete_ids:
                    file.write(id + '\n')

        # add all subset stats to total stats and reset the temporal subset stats:
        for key in ['tcount', 'ecount', 'count', 'dcount']:
            stats['tot' + key] += stats[key]

        print(
            '   \t|- %-10s |@ %-10s |\n\t| Provided | Harvested | Failed | Deleted |\n\t| %8d | %9d | %6d | %6d |'
            % ('Finished', time.strftime("%H:%M:%S"), stats['tottcount'],
               stats['totcount'], stats['totecount'], stats['totdcount']))
コード例 #13
0
ファイル: OAIRepository.py プロジェクト: axfelix/frdr_harvest
class OAIRepository(HarvestRepository):
    """ OAI Repository """
    def setRepoParams(self, repoParams):
        self.metadataprefix = "oai_dc"
        self.default_language = "en"
        super(OAIRepository, self).setRepoParams(repoParams)
        self.sickle = Sickle(self.url, iterator=FRDRItemIterator)

    def _crawl(self):
        records = []

        try:
            if self.set is None or self.set == "":
                records = self.sickle.ListRecords(
                    metadataPrefix=self.metadataprefix, ignore_deleted=True)
            else:
                records = self.sickle.ListRecords(
                    metadataPrefix=self.metadataprefix,
                    ignore_deleted=True,
                    set=self.set)
        except:
            self.logger.info("No items were found")

        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "oai",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url,
            "repo_oai_name": self.repo_oai_name
        }
        self.repository_id = self.db.update_repo(**kwargs)
        item_count = 0

        while records:
            try:
                record = records.next()
                metadata = record.metadata

                # Search for a hyperlink in the list of identifiers
                if 'identifier' in metadata.keys():
                    if not isinstance(metadata['identifier'], list):
                        metadata['identifier'] = [metadata['identifier']]
                    for idt in metadata['identifier']:
                        # TODO - what about multiple identifiers? We should have some priority here, so we always pick the same one regardless of ordering
                        if idt.lower().startswith("http"):
                            metadata['dc:source'] = idt
                        if idt.lower().startswith("doi:"):
                            metadata[
                                'dc:source'] = "https://doi.org/" + idt[4:]
                        if idt.lower().startswith("hdl:"):
                            metadata[
                                'dc:source'] = "https://hdl.handle.net/" + idt[
                                    4:]

                # EPrints workaround for using header datestamp in lieu of date
                if 'date' not in metadata.keys() and record.header.datestamp:
                    metadata["date"] = record.header.datestamp

                # Use the header id for the database key (needed later for OAI GetRecord calls)
                metadata['identifier'] = record.header.identifier
                oai_record = self.unpack_oai_metadata(metadata)
                self.domain_metadata = self.find_domain_metadata(metadata)
                self.db.write_record(oai_record, self)
                item_count = item_count + 1
                if (item_count % self.update_log_after_numitems == 0):
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info(
                        "Done {} items after {} ({:.1f} items/sec)".format(
                            item_count, self.formatter.humanize(tdelta),
                            (item_count / tdelta)))

            except AttributeError:
                # probably not a valid OAI record
                # Islandora throws this for non-object directories
                self.logger.debug(
                    "AttributeError while working on item {}".format(
                        item_count))
                pass

            except StopIteration:
                break

        self.logger.info("Processed {} items in feed".format(item_count))

    def unpack_oai_metadata(self, record):
        record["pub_date"] = record.get("date")

        if self.metadataprefix.lower() == "ddi":
            # TODO: better DDI implementation that doesn't simply flatten everything, see: https://sickle.readthedocs.io/en/latest/customizing.html
            # Mapping as per http://www.ddialliance.org/resources/ddi-profiles/dc
            record["title"] = record.get("titl")
            record["creator"] = record.get("AuthEnty")
            record["tags"] = record.get("keyword", [])
            if "topcClas" in record.keys() and len(record["topcClas"]) > 0:
                record['tags'].extend(filter(None, record["topcClas"]))
            record["description"] = record.get("abstract")
            record["publisher"] = record.get("producer")
            record["contributor"] = record.get("othId")
            record["pub_date"] = record.get("prodDate")
            record["type"] = record.get("dataKind")
            record["identifier"] = record.get("IDNo")
            record["rights"] = record.get("copyright")

        if self.metadataprefix.lower() == "fgdc" or self.metadataprefix.lower(
        ) == "fgdc-std":
            record["creator"] = []
            for creator in record.get("origin"):
                if creator not in record["creator"]:
                    record["creator"].append(creator)
            record["tags"] = record.get("themekey")
            record["description"] = record.get("abstract")
            record["publisher"] = record.get("cntorg")
            # Put these dates in preferred order
            record["pub_date"] = [
                record.get("pubdate"),
                record.get("begdate"),
                record.get("enddate")
            ]
            record["type"] = record.get("geoform")
            record["dc:source"] = record.get("onlink")
            record["rights"] = record.get("distliab")
            record["access"] = record.get("accconst")

            if "placekt" in record.keys():
                record["coverage"] = record["placekt"]

            if "bounding" in record.keys():
                record["geobboxes"] = [{
                    "westLon": record["westbc"][0],
                    "eastLon": record["eastbc"][0],
                    "northLat": record["northbc"][0],
                    "southLat": record["southbc"][0]
                }]

        # Parse FRDR records
        if self.metadataprefix.lower() == "frdr":
            if "http://datacite.org/schema/kernel-4#geolocationPlace" in record.keys(
            ):
                record["coverage"] = record.get(
                    "http://datacite.org/schema/kernel-4#geolocationPlace")

            if "http://datacite.org/schema/kernel-4#geolocationPoint" in record.keys(
            ):
                record["geopoints"] = []
                for geopoint in record[
                        "http://datacite.org/schema/kernel-4#geolocationPoint"]:
                    point_split = re.compile(",? ").split(geopoint)
                    if len(point_split) == 2:
                        record["geopoints"].append({
                            "lat": point_split[0],
                            "lon": point_split[1]
                        })

            if "http://datacite.org/schema/kernel-4#geolocationBox" in record.keys(
            ):
                record["geobboxes"] = []
                for geobbox in record[
                        "http://datacite.org/schema/kernel-4#geolocationBox"]:
                    boxcoordinates = geobbox.split()
                    if len(boxcoordinates) == 4:
                        record["geobboxes"].append({
                            "southLat":
                            boxcoordinates[0],
                            "westLon":
                            boxcoordinates[1],
                            "northLat":
                            boxcoordinates[2],
                            "eastLon":
                            boxcoordinates[3]
                        })

            # Look for datacite.creatorAffiliation
            if "http://datacite.org/schema/kernel-4#creatorAffiliation" in record:
                record["affiliation"] = record.get(
                    "http://datacite.org/schema/kernel-4#creatorAffiliation")

        if 'identifier' not in record.keys():
            return None
        if record["pub_date"] is None:
            return None

        # If there are multiple identifiers, and one of them contains a link, then prefer it
        # Otherwise just take the first one
        if isinstance(record["identifier"], list):
            valid_id = record["identifier"][0]
            for idstring in record["identifier"]:
                if "http" in idstring.lower():
                    valid_id = idstring
            record["identifier"] = valid_id
        if 'creator' not in record.keys() and 'contributor' not in record.keys(
        ) and 'publisher' not in record.keys():
            self.logger.debug(
                "Item {} is missing creator - will not be added".format(
                    record["identifier"]))
            return None
        elif 'creator' not in record.keys() and 'contributor' in record.keys():
            record["creator"] = record["contributor"]
        elif 'creator' not in record.keys() and 'publisher' in record.keys():
            record["creator"] = record["publisher"]
        # Workaround for WOUDC, which doesn't attribute individual datasets
        elif self.metadataprefix.lower() == "fgdc-std":
            record["creator"] = self.name

        # If date is undefined add an empty key
        if 'pub_date' not in record.keys():
            record["pub_date"] = ""

        # If there are multiple dates choose the longest one (likely the most specific)
        # If there are a few dates with the same length the first one will be used, which assumes we grabbed them in a preferred order
        # Exception test added for some strange PDC dates of [null, null]
        if isinstance(record["pub_date"], list):
            valid_date = record["pub_date"][0] or ""
            for datestring in record["pub_date"]:
                if datestring is not None:
                    if len(datestring) > len(valid_date):
                        valid_date = datestring
            record["pub_date"] = valid_date

        # If date is still a one-value list, make it a string
        if isinstance(record["pub_date"], list):
            record["pub_date"] = record["pub_date"][0]
        # If a date has question marks, chuck it
        if "?" in record["pub_date"]:
            return None

        try:
            date_object = dateparser.parse(record["pub_date"])
            if date_object is None:
                date_object = dateparser.parse(record["pub_date"],
                                               date_formats=['%Y%m%d'])
            record["pub_date"] = date_object.strftime("%Y-%m-%d")
        except Exception:
            self.logger.debug(
                "Something went wrong parsing the date, {} from {}".format(
                    record["pub_date"],
                    record["dc:source"]
                    if record["identifier"] is None else record["identifier"]))
            return None

        if "title" not in record.keys():
            return None

        language = self.default_language
        if "language" in record.keys():
            if isinstance(record["language"], list):
                record["language"] = record["language"][0].strip()
                record["language"] = record["language"].lower()
            if record["language"] in ["fr", "fre", "fra", "french"]:
                language = "fr"

        if language == "fr":
            if isinstance(record["title"], list):
                record["title_fr"] = record["title"][0].strip()
            else:
                record["title_fr"] = record["title"].strip()
            # Remove "title" from record since this is the English field
            record["title"] = ""

            if "tags_fr" not in record.keys():
                record["tags_fr"] = record.get("subject")
                record.pop("subject", None)
        else:
            if isinstance(record["title"], list):
                record["title"] = record["title"][0].strip()
            else:
                record["title"] = record["title"].strip()
            record["title_fr"] = ""

            if "tags" not in record.keys():
                record["tags"] = record.get("subject")
                record.pop("subject", None)

        if "publisher" in record.keys():
            if isinstance(record["publisher"], list):
                record["publisher"] = record["publisher"][0]

        if "series" not in record.keys():
            record["series"] = ""

        if "coverage" in record.keys():
            record["geoplaces"] = []
            if self.name == "SFU Radar":
                record["coverage"] = [
                    x.strip() for x in record["coverage"][0].split(";")
                ]
            if not isinstance(record["coverage"], list):
                record["coverage"] = [record["coverage"]]
            for place_name in record["coverage"]:
                if place_name != "" and place_name.lower().islower(
                ):  # to filter out dates, confirm at least one letter
                    record["geoplaces"].append({"place_name": place_name})

        # DSpace workaround to exclude theses and non-data content
        if self.prune_non_dataset_items:
            if record["type"] and "Dataset" not in record["type"]:
                return None

        # EPrints workaround to fix duplicates and Nones in Rights
        if "rights" in record.keys() and isinstance(record["rights"], list):
            record["rights"] = list(set(filter(None.__ne__, record["rights"])))

        # EPrints workaround for liberal use of dc:identifier
        # Rather not hardcode a single source URL for this
        if self.url == "http://spectrum.library.concordia.ca/cgi/oai2":
            for relation in record["relation"]:
                if "http://spectrum.library.concordia.ca" in relation:
                    record["dc:source"] = relation

        return record

    def find_domain_metadata(self, record):
        excludedElements = [
            'http://datacite.org/schema/kernel-4#resourcetype',
            'http://datacite.org/schema/kernel-4#creatorAffiliation',
            'http://datacite.org/schema/kernel-4#publicationyear',
            'https://www.frdr-dfdr.ca/schema/1.0/#globusEndpointName',
            'https://www.frdr-dfdr.ca/schema/1.0/#globusEndpointPath'
        ]
        newRecord = {}
        for elementName in list(record.keys()):
            if '#' in elementName:
                if not [
                        ele for ele in excludedElements if (ele in elementName)
                ]:
                    newRecord[elementName] = record.pop(elementName, None)
        return newRecord

    @rate_limited(5)
    def _update_record(self, record):
        #self.logger.debug("Updating OAI record {}".format(record['local_identifier']))

        try:
            single_record = self.sickle.GetRecord(
                identifier=record["local_identifier"],
                metadataPrefix=self.metadataprefix)

            try:
                metadata = single_record.metadata
                if 'identifier' in metadata.keys() and isinstance(
                        metadata['identifier'], list):
                    if "http" in metadata['identifier'][0].lower():
                        metadata['dc:source'] = metadata['identifier'][0]
            except AttributeError:
                metadata = {}

            # EPrints workaround for using header datestamp in lieu of date
            if 'date' not in metadata.keys(
            ) and single_record.header.datestamp:
                metadata["date"] = single_record.header.datestamp

            metadata['identifier'] = single_record.header.identifier
            oai_record = self.unpack_oai_metadata(metadata)
            self.domain_metadata = self.find_domain_metadata(metadata)
            if oai_record is None:
                self.db.delete_record(record)
                return False
            self.db.write_record(oai_record, self)
            return True

        except IdDoesNotExist:
            # Item no longer in this repo
            self.db.delete_record(record)
            return True

        except Exception as e:
            self.logger.error(
                "Updating item failed (repo_id:{}, oai_id:{}): {}".format(
                    self.repository_id, record['local_identifier'], e))
            if self.dump_on_failure == True:
                try:
                    print(single_record.metadata)
                except:
                    pass
            # Touch the record so we do not keep requesting it on every run
            self.db.touch_record(record)
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True

        return False
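
The rate_limited(5) decorator applied to _update_record above is imported from elsewhere in the frdr_harvest project and is not shown in this snippet. A minimal sketch of such a throttling decorator, assuming only the behaviour visible here (at most max_per_second calls per second), could look like this:

import threading
import time
from functools import wraps


def rate_limited(max_per_second):
    """Throttle the wrapped function to at most max_per_second calls per second."""
    lock = threading.Lock()
    min_interval = 1.0 / float(max_per_second)
    last_called = [0.0]

    def decorate(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            with lock:
                # Sleep just long enough to keep the required spacing between calls
                wait = min_interval - (time.perf_counter() - last_called[0])
                if wait > 0:
                    time.sleep(wait)
                last_called[0] = time.perf_counter()
            return func(*args, **kwargs)
        return wrapper
    return decorate
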
コード例 #14
0
from oaiharvests.utils import *
from oaiharvests.models import *

from sickle import Sickle

community = Community.objects.all()[0]
batch_harvest_issues(community)

collection = Collection.objects.all()[7]
record = collection.record_set.all()[5]
record.hdr_identifier
try:
    sickle = Sickle(collection.community.repository.base_url)
    sickle.class_mapping['GetRecord'] = LltRecordBitstream
    record = sickle.GetRecord(metadataPrefix='ore',
                              identifier=record.identifier)
    print(type(record))
    # print record.metadata['bitstream'][0].replace('+', '%20')

except Exception as e:
    print(e, 'Unable to construct bitstream url.')

#
コード例 #15
0
class OAIClient:
    def __init__(self, url, source_name, max_retries=3):
        self.sickle = Sickle(url, max_retries=max_retries, verify=False)
        self.sickle.class_mapping['ListRecords'] = SciELORecord
        self.sickle.class_mapping['GetRecord'] = SciELORecord
        self.source_name = source_name

    def get_record(self, metadata_prefix='oai_dc_scielo', identifier=None):
        if identifier:
            return [
                self.sickle.GetRecord(**{
                    'metadataPrefix': metadata_prefix,
                    'identifier': identifier
                })
            ]

    def get_records(self,
                    metadata_prefix='oai_dc_scielo',
                    from_date='',
                    until_date=''):
        try:
            from_date = datetime.strptime(from_date, '%Y-%m-%d')
            until_date = datetime.strptime(until_date, '%Y-%m-%d')
        except ValueError:
            raise exceptions.InvalidDateFormatError(
                'Formato de datas inválido')

        if from_date >= until_date:
            raise exceptions.InvalidDateRangeError(
                'Data de início é maior ou igual a data de fim')

        logging.info(
            f'Collecting data from {from_date.strftime("%Y-%m-%d")} to {until_date.strftime("%Y-%m-%d")}'
        )

        try:
            records = self.sickle.ListRecords(
                **{
                    'metadataPrefix': metadata_prefix,
                    'from': from_date.strftime('%Y-%m-%d'),
                    'until': until_date.strftime('%Y-%m-%d')
                })
        except NoRecordsMatch:
            logging.info('No records found')
            return []
        except (
                ConnectionError,
                ConnectionResetError,
                ConnectionAbortedError,
                ConnectionRefusedError,
                MaxRetryError,
                HTTPError,
                TimeoutError,
        ) as e:
            logging.error(e)
            return []

        return records

    def record_to_dict(self, record: SciELORecord):
        object = {}

        object['gathering_date'] = datetime.utcnow()
        object['gathering_source'] = self.source_name
        object['identifier'] = record.header.identifier
        object['date'] = record.header.date
        object['is_part_of'] = record.header.is_part_of
        object['metadata'] = record.get_metadata().get('metadata', {})

        return object
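
A minimal usage sketch for the OAIClient above; the endpoint URL is illustrative only, and the surrounding module is assumed to provide the SciELORecord mapping and exceptions used by the class.

client = OAIClient('http://www.scielo.br/oai/scielo-oai.php',  # illustrative endpoint
                   source_name='scielo-brazil')

for record in client.get_records(metadata_prefix='oai_dc_scielo',
                                 from_date='2021-01-01',
                                 until_date='2021-01-31'):
    doc = client.record_to_dict(record)
    print(doc['identifier'], doc['gathering_date'])
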
コード例 #16
0
# oai = OAIUtils()
# oai.list_oai_collections(com)

base_url = 'http://scholarspace.manoa.hawaii.edu/dspace-oai/request'
llt_id = 'com_10125_27123'

s = Sickle(base_url)

record_headers = list(s.ListIdentifiers(metadataPrefix='oai_dc', set=llt_id))

community_collections = {}
for i in record_headers:
    # Iterate over associated sets looking for collections
    for j in i.setSpecs:
        if j[:3] == 'col':
            community_collections[j] = None  # register id in map

for i in s.ListSets():
    try:
        print(community_collections[i.setSpec])
        community_collections[i.setSpec] = i.setName
        print(i.setSpec, '==>', community_collections[i.setSpec])
        print(i)
    except KeyError as e:
        pass
        # print e, 'not a collection in llt ...'

sample = 'oai:scholarspace.manoa.hawaii.edu:10125/54329'
s.GetRecord(identifier=sample, metadataPrefix='oai_dc')
コード例 #17
0
class OAISynchronizer(OAIDBBase):
    """

    """
    def __init__(self,
                 provider: OAIProvider,
                 parser_name: str = None,
                 unhandled_paths: set = None,
                 validation: Callable = None,
                 create_record: Callable = None,
                 delete_record: Callable = None,
                 update_record: Callable = None,
                 pid_type: str = None,
                 oai_identifiers: List[str] = None):
        super().__init__(provider)
        self.pid_type = pid_type
        self.provider = provider
        self.oai_sync = None
        self.sickle = Sickle(self.provider.oai_endpoint)
        registry.load()
        self.parsers = provider.get_parsers()
        self.rules = provider.get_rules(parser_name) or {}
        self.parser = self.parsers.get(parser_name) or {}
        self.transformer = OAITransformer(self.rules,
                                          unhandled_paths=unhandled_paths)
        self.validation_handler = validation
        self.create_record_handler = create_record
        self.update_record_handler = update_record
        self.delete_record_handler = delete_record
        self.oai_identifiers = oai_identifiers

    def run(self,
            start_oai: str = None,
            start_id: int = None,
            break_on_error: bool = True):
        """

        :return:
        :rtype:
        """
        self.ensure_migration()
        super().run(start_oai=start_oai,
                    start_id=start_id,
                    break_on_error=break_on_error)

    def synchronize(self,
                    identifiers=None,
                    start_oai: str = None,
                    start_id: int = None,
                    break_on_error: bool = True):
        """

        :return:
        :rtype:
        """
        oai_logger.info(
            f"OAI harvester on endpoint: {self.provider.oai_endpoint} has started!"
        )

        if identifiers is None:
            if self.oai_identifiers is None:
                identifiers = self._get_oai_identifiers()
            else:
                identifiers = self._get_oai_identifiers(
                    identifiers_list=self.oai_identifiers)

        identifiers = islice(identifiers, start_id, None)
        collect = False
        for idx, identifier in enumerate(identifiers, start=start_id):
            oai_logger.info(f"{idx}. Record, OAI ID: '{identifier}'")
            datestamp = identifier.datestamp
            oai_identifier = identifier.identifier
            if not start_oai or oai_identifier == start_oai:
                collect = True
            if not collect:
                continue
            deleted = identifier.deleted
            try:
                if deleted:
                    self._delete(identifier, oai_identifier)
                else:
                    try:
                        self.update(oai_identifier, datestamp)
                    except IdDoesNotExist:
                        self._delete(identifier, oai_identifier)
                if idx % 100:
                    db.session.commit()
            except Exception:
                exc = traceback.format_exc()
                print(exc, "\n\n\n")
                oai_exc = OAIRecordExc.query.filter_by(
                    oai_identifier=oai_identifier,
                    oai_sync_id=self.oai_sync.id).one_or_none()
                if not oai_exc:
                    oai_exc = OAIRecordExc(oai_identifier=oai_identifier,
                                           traceback=exc,
                                           oai_sync_id=self.oai_sync.id)
                    db.session.add(oai_exc)
                else:
                    oai_exc.traceback = exc
                db.session.commit()
                if break_on_error:
                    raise
                continue

    def _delete(self, identifier, oai_identifier):
        self.delete(oai_identifier)
        self.deleted += 1
        oai_logger.info(
            f"Identifier '{identifier}' has been marked as deleted")

    def _get_oai_identifiers(self,
                             sickle=None,
                             metadata_prefix=None,
                             set_=None,
                             identifiers_list: List[str] = None):
        if identifiers_list:
            return [
                self.sickle.GetRecord(
                    identifier=identifier,
                    metadataPrefix=self.provider.metadata_prefix).header
                for identifier in identifiers_list
            ]
        if not sickle:
            sickle = self.sickle
        if not metadata_prefix:
            metadata_prefix = self.provider.metadata_prefix
        if not set_:
            set_ = self.provider.set_
        return sickle.ListIdentifiers(metadataPrefix=metadata_prefix, set=set_)

    def update(self, oai_identifier, datestamp):
        """

        :param oai_identifier:
        :type oai_identifier:
        :param datestamp:
        :type datestamp:
        :return:
        :rtype:
        """
        xml = self.get_xml(oai_identifier)
        parsed = self.parse(xml)
        transformed = self.transform(parsed)
        transformed.update(self.provider.constant_fields)
        if self.validation_handler:
            self.validation_handler(transformed)

        oai_rec = OAIRecord.query.filter_by(
            oai_identifier=oai_identifier).one_or_none()
        if oai_rec is None:
            transformed = self.attach_id(transformed)
            record = self.create_record(transformed)
            oai_rec = OAIRecord(id=record.id,
                                oai_identifier=oai_identifier,
                                creation_sync_id=self.oai_sync.id,
                                nusl_id=transformed["id"])
            self.created += 1
            db.session.add(oai_rec)
            oai_logger.info(
                f"Identifier '{oai_identifier}' has been created and '{record.id}' has been "
                f"assigned as a UUID")
        else:
            transformed = self.attach_id(transformed, nusl_id=oai_rec.nusl_id)
            record = self.update_record(transformed)
            self.modified += 1
            oai_rec.modification_sync_id = self.oai_sync.id
            oai_logger.info(
                f"Identifier '{oai_identifier}' has been updated (UUID: {record.id})"
            )
        oai_rec.last_sync_id = self.oai_sync.id
        oai_rec.timestamp = datestamp
        nusl_theses.index_draft_record(record)

    def transform(self, parsed, handler=None):
        if not handler:
            handler = self.transformer.transform
        return handler(parsed)

    def get_xml(self, oai_identifier):
        original_record = self.sickle.GetRecord(
            identifier=oai_identifier,
            metadataPrefix=self.provider.metadata_prefix)
        return original_record.xml

    def parse(self, xml_etree, parser=None):
        if not parser or not callable(parser):
            if self.parser:
                parser = self.parser
            if parser is None:
                raise ParserNotFoundError(
                    "No parser specified, please check entry points and parser designation by "
                    "decorator @Decorators.parser or specify parser as function parameter."
                )
        return parser(xml_etree)

    def create_record(self, metadata):
        """

        :return:
        :rtype:
        """
        if self.create_record_handler:
            record = self.create_record_handler(metadata,
                                                pid_type=self.pid_type)
            return record
        else:
            raise HandlerNotFoundError(
                'Please specify create handler during initialization. Must specify '
                '"create_record" named parameter')

    def update_record(self, metadata):
        """


        :return:
        :rtype:
        """
        if self.update_record_handler:
            existing_record = nusl_theses.get_record_by_id(
                self.pid_type, metadata["id"])
            return self.update_record_handler(existing_record, metadata)
        else:
            raise HandlerNotFoundError(
                'Please specify update handler during initialization. Must specify '
                '"update_record" named parameter')

    def delete(self, oai_identifier):
        """

        :param oai_identifier:
        :type oai_identifier:
        :return:
        :rtype:
        """
        if self.delete_record_handler:
            oai_record = OAIRecord.query.filter_by(
                oai_identifier=oai_identifier).one_or_none()
            if not oai_record:
                return
            record = nusl_theses.get_record_by_id(pid_type=self.pid_type,
                                                  pid_value=oai_record.nusl_id)
            self.delete_record_handler(record)
        else:
            raise HandlerNotFoundError(
                'Please specify delete handler during initialization. Must specify '
                '"delete_record" named parameter')

    @staticmethod
    def ensure_migration():
        # TODO: Improve the check that the migration completed successfully
        oai_record_count = OAIRecord.query.count()
        records_count = RecordMetadata.query.count()
        if records_count > 0 and oai_record_count == 0:
            raise NoMigrationError(
                "There are records presents in database, but no OAIRecord found. Please ensure "
                "that you run migration script")

    @staticmethod
    def attach_id(transformed, nusl_id=None):
        if not nusl_id:
            nusl_id = str(nusl_theses.get_new_pid())
        transformed["id"] = nusl_id
        transformed["identifier"].append({"type": "nusl", "value": nusl_id})
        return transformed
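
The synchronizer above delegates all persistence to handler callables supplied at construction time. Below is a minimal sketch of the expected handler signatures, inferred from how create_record, update_record and delete call them; the bodies are placeholders, and the commented constructor call uses illustrative parser_name and pid_type values.

def create_record(metadata, pid_type=None):
    """Persist a new record; must return an object with an id attribute,
    since OAISynchronizer.update() reads record.id afterwards."""
    ...


def update_record(existing_record, metadata):
    """Apply metadata to the already fetched existing_record and return it."""
    ...


def delete_record(record):
    """Remove (or mark as deleted) the given record."""
    ...


# synchronizer = OAISynchronizer(provider,
#                                parser_name='xoai',
#                                create_record=create_record,
#                                update_record=update_record,
#                                delete_record=delete_record,
#                                pid_type='nusl')
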
コード例 #18
0
ファイル: OAIRepository.py プロジェクト: dnarc/globus_oai
class OAIRepository(HarvestRepository):
    """ OAI Repository """
    def setRepoParams(self, repoParams):
        self.metadataprefix = "oai_dc"
        super(OAIRepository, self).setRepoParams(repoParams)
        self.sickle = Sickle(self.url, iterator=FRDRItemIterator)

    def _crawl(self):
        records = []

        try:
            if self.set is None or self.set == "":
                records = self.sickle.ListRecords(
                    metadataPrefix=self.metadataprefix, ignore_deleted=True)
            else:
                records = self.sickle.ListRecords(
                    metadataPrefix=self.metadataprefix,
                    ignore_deleted=True,
                    set=self.set)
        except:
            self.logger.info("No items were found")

        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "oai",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)
        item_count = 0

        while records:
            try:
                record = records.next()
                metadata = record.metadata

                # Search for a hyperlink in the list of identifiers
                if 'identifier' in metadata.keys():
                    if not isinstance(metadata['identifier'], list):
                        metadata['identifier'] = [metadata['identifier']]
                    for idt in metadata['identifier']:
                        # TODO - what about multiple identifiers? We should have some priority here, so we always pick the same one regardless of ordering
                        if idt.lower().startswith("http"):
                            metadata['dc:source'] = idt
                        if idt.lower().startswith("doi:"):
                            metadata[
                                'dc:source'] = "https://dx.doi.org/" + idt[4:]
                        if idt.lower().startswith("hdl:"):
                            metadata[
                                'dc:source'] = "https://hdl.handle.net/" + idt[
                                    4:]

                # EPrints workaround for using header datestamp in lieu of date
                if 'date' not in metadata.keys() and record.header.datestamp:
                    metadata["date"] = record.header.datestamp

                # Use the header id for the database key (needed later for OAI GetRecord calls)
                metadata['identifier'] = record.header.identifier
                oai_record = self.unpack_oai_metadata(metadata)
                domain_metadata = self.find_domain_metadata(metadata)

                self.db.write_record(oai_record, self.repository_id,
                                     self.metadataprefix.lower(),
                                     domain_metadata)
                item_count = item_count + 1
                if (item_count % self.update_log_after_numitems == 0):
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info(
                        "Done {} items after {} ({:.1f} items/sec)".format(
                            item_count, self.formatter.humanize(tdelta),
                            (item_count / tdelta)))

            except AttributeError:
                # probably not a valid OAI record
                # Islandora throws this for non-object directories
                self.logger.debug(
                    "AttributeError while working on item {}".format(
                        item_count))
                pass

            except StopIteration:
                break

        self.logger.info("Processed {} items in feed".format(item_count))

    def unpack_oai_metadata(self, record):
        record["pub_date"] = record.get("date")

        if self.metadataprefix.lower() == "ddi":
            # TODO: better DDI implementation that doesn't simply flatten everything, see: https://sickle.readthedocs.io/en/latest/customizing.html
            # Mapping as per http://www.ddialliance.org/resources/ddi-profiles/dc
            record["title"] = record.get("titl")
            record["creator"] = record.get("AuthEnty")
            record["subject"] = record.get("keyword", [])
            if "topcClas" in record.keys() and len(record["topcClas"]) > 0:
                record['subject'].extend(filter(None, record["topcClas"]))
            record["description"] = record.get("abstract")
            record["publisher"] = record.get("producer")
            record["contributor"] = record.get("othId")
            record["pub_date"] = record.get("prodDate")
            record["type"] = record.get("dataKind")
            record["identifier"] = record.get("IDNo")
            record["rights"] = record.get("copyright")

            if "northBL" in record.keys():
                # This record has geoSpatial bounding lines
                # Convert into an array of closed bounding box points (clockwise polygon)
                record["geospatial"] = {
                    "type":
                    "Polygon",
                    "coordinates":
                    [[[record["northBL"][0], record["westBL"][0]],
                      [record["northBL"][0], record["eastBL"][0]],
                      [record["southBL"][0], record["westBL"][0]],
                      [record["southBL"][0], record["eastBL"][0]]]]
                }

        if self.metadataprefix.lower() == "fgdc" or self.metadataprefix.lower(
        ) == "fgdc-std":
            record["creator"] = record.get("origin")
            record["subject"] = record.get("themekey")
            record["description"] = record.get("abstract")
            record["publisher"] = record.get("cntorg")
            # Put these dates in preferred order
            record["pub_date"] = [
                record.get("pubdate"),
                record.get("begdate"),
                record.get("enddate")
            ]
            record["type"] = record.get("geoform")
            record["dc:source"] = record.get("onlink")
            record["rights"] = record.get("distliab")
            record["access"] = record.get("accconst")

            if "bounding" in record.keys():
                # Sometimes point data is hacked in as a bounding box
                if record["westbc"] == record["eastbc"] and record[
                        "northbc"] == record["southbc"]:
                    record["geospatial"] = {
                        "type":
                        "Point",
                        "coordinates":
                        [[[record["northbc"][0], record["westbc"][0]]]]
                    }
                else:
                    record["geospatial"] = {
                        "type":
                        "Polygon",
                        "coordinates":
                        [[[record["northbc"][0], record["westbc"][0]],
                          [record["northbc"][0], record["eastbc"][0]],
                          [record["southbc"][0], record["westbc"][0]],
                          [record["southbc"][0], record["eastbc"][0]]]]
                    }

        # Parse FRDR records
        if self.metadataprefix.lower() == "frdr":
            record["coverage"] = record.get("geolocationPlace")

            if "geolocationPoint" in record.keys():
                point_split = re.compile(",? ").split(
                    record["geolocationPoint"][0])
                record["geospatial"] = {
                    "type": "Point",
                    "coordinates": [[point_split]]
                }

            if "geolocationBox" in record.keys():
                boxcoordinates = record["geolocationBox"][0].split()
                record["geospatial"] = {
                    "type":
                    "Polygon",
                    "coordinates": [[
                        boxcoordinates[x:x + 2]
                        for x in range(0, len(boxcoordinates), 2)
                    ]]
                }
            # Look for datacite.creatorAffiliation
            if "creatorAffiliation" in record:
                record["affiliation"] = record.get("creatorAffiliation")

        if 'identifier' not in record.keys():
            return None

        if record["pub_date"] is None:
            return None

        # If there are multiple identifiers, and one of them contains a link, then prefer it
        # Otherwise just take the first one
        if isinstance(record["identifier"], list):
            valid_id = record["identifier"][0]
            for idstring in record["identifier"]:
                if "http" in idstring.lower():
                    valid_id = idstring
            record["identifier"] = valid_id

        if 'creator' not in record.keys() and 'contributor' not in record.keys(
        ) and 'publisher' not in record.keys():
            self.logger.debug(
                "Item {} is missing creator - will not be added".format(
                    record["identifier"]))
            return None
        elif 'creator' not in record.keys() and 'contributor' in record.keys():
            record["creator"] = record["contributor"]
        elif 'creator' not in record.keys() and 'publisher' in record.keys():
            record["creator"] = record["publisher"]
        # Workaround for WOUDC, which doesn't attribute individual datasets
        elif self.metadataprefix.lower() == "fgdc-std":
            record["creator"] = self.name

        # If date is undefined add an empty key
        if 'pub_date' not in record.keys():
            record["pub_date"] = ""

        # If there are multiple dates choose the longest one (likely the most specific)
        # If there are a few dates with the same length the first one will be used, which assumes we grabbed them in a preferred order
        # Exception test added for some strange PDC dates of [null, null]
        if isinstance(record["pub_date"], list):
            valid_date = record["pub_date"][0] or ""
            for datestring in record["pub_date"]:
                if datestring is not None:
                    if len(datestring) > len(valid_date):
                        valid_date = datestring
            record["pub_date"] = valid_date

        # If date is still a one-value list, make it a string
        if isinstance(record["pub_date"], list):
            record["pub_date"] = record["pub_date"][0]

        # Convert long dates into YYYY-MM-DD
        datestring = re.search(r"(\d{4}[-/]?\d{2}[-/]?\d{2})",
                               record["pub_date"])
        if datestring:
            record["pub_date"] = datestring.group(0).replace("/", "-")

        # If dates are entirely numeric, add separators
        if not re.search("\D", record["pub_date"]):
            if (len(record["pub_date"]) == 6):
                record["pub_date"] = record["pub_date"][0] + record["pub_date"][1] + record["pub_date"][2] + \
                                     record["pub_date"][3] + "-" + record["pub_date"][4] + record["pub_date"][5]
            if (len(record["pub_date"]) == 8):
                record["pub_date"] = record["pub_date"][0] + record["pub_date"][1] + record["pub_date"][2] + \
                                     record["pub_date"][3] + "-" + record["pub_date"][4] + record["pub_date"][5] + "-" + \
                                     record["pub_date"][6] + record["pub_date"][7]

        # If a date has question marks, chuck it
        if "?" in record["pub_date"]:
            return None

        # Make sure dates are valid
        if not re.search(
                "^(1|2)\d{3}(-?(0[1-9]|1[0-2])(-?(0[1-9]|1[0-9]|2[0-9]|3[0-1]))?)?$",
                record["pub_date"]):
            self.logger.debug("Invalid date for record {}".format(
                record["dc:source"]))
            return None

            # record["pub_date"] = dateparser.parse(record["pub_date"]).strftime("%Y-%m-%d")

        if "title" not in record.keys():
            return None
        if isinstance(record["title"], list):
            record["title"] = record["title"][0]

        if "contact" not in record.keys():
            record["contact"] = ""
            if "publisher" in record.keys():
                if isinstance(record["publisher"], list):
                    record["publisher"] = record["publisher"][0]
                if record["publisher"] is not None:
                    contact_address = re.search(
                        r"[\w\.-]+@([\w-]+\.)+[\w-]{2,4}", record["publisher"])
                    try:
                        record["contact"] = contact_address.group(0)
                    except:
                        pass
        if isinstance(record["contact"], list):
            record["contact"] = record["contact"][0]

        if "series" not in record.keys():
            record["series"] = ""

        # DSpace workaround to exclude theses and non-data content
        if self.prune_non_dataset_items:
            if record["type"] and "Dataset" not in record["type"]:
                return None

        # EPrints workaround to fix duplicates and Nones in Rights
        if "rights" in record.keys() and isinstance(record["rights"], list):
            record["rights"] = list(set(filter(None.__ne__, record["rights"])))

        # EPrints workaround for liberal use of dc:identifier
        # Rather not hardcode a single source URL for this
        if self.url == "http://spectrum.library.concordia.ca/cgi/oai2":
            for relation in record["relation"]:
                if "http://spectrum.library.concordia.ca" in relation:
                    record["dc:source"] = relation

        return record

    def find_domain_metadata(self, record):
        newRecord = {}
        for elementName in list(record.keys()):
            if '#' in elementName:
                newRecord[elementName] = record.pop(elementName, None)
        return newRecord

    def _rate_limited(max_per_second):
        """ Decorator that make functions not be called faster than a set rate """
        threading = __import__('threading')
        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)

        def decorate(func):
            last_time_called = [0.0]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                lock.acquire()
                elapsed = time.perf_counter() - last_time_called[0]  # time.clock() was removed in Python 3.8
                left_to_wait = min_interval - elapsed

                if left_to_wait > 0:
                    time.sleep(left_to_wait)

                lock.release()

                ret = func(*args, **kwargs)
                last_time_called[0] = time.perf_counter()
                return ret

            return rate_limited_function

        return decorate

    @_rate_limited(5)
    def _update_record(self, record):
        self.logger.debug("Updating OAI record {}".format(
            record['local_identifier']))

        try:
            single_record = self.sickle.GetRecord(
                identifier=record["local_identifier"],
                metadataPrefix=self.metadataprefix)

            try:
                metadata = single_record.metadata
                if 'identifier' in metadata.keys() and isinstance(
                        metadata['identifier'], list):
                    if "http" in metadata['identifier'][0].lower():
                        metadata['dc:source'] = metadata['identifier'][0]
            except AttributeError:
                metadata = {}

            # EPrints workaround for using header datestamp in lieu of date
            if 'date' not in metadata.keys(
            ) and single_record.header.datestamp:
                metadata["date"] = single_record.header.datestamp

            metadata['identifier'] = single_record.header.identifier
            oai_record = self.unpack_oai_metadata(metadata)
            domain_metadata = self.find_domain_metadata(metadata)
            if oai_record is None:
                self.db.delete_record(record)
                return False
            self.db.write_record(oai_record, self.repository_id,
                                 self.metadataprefix.lower(), domain_metadata)
            return True

        except IdDoesNotExist:
            # Item no longer in this repo
            self.db.delete_record(record)
            return True

        except Exception as e:
            self.logger.error(
                "Updating item failed (repo_id:{}, oai_id:{}): {}".format(
                    self.repository_id, record['local_identifier'], e))
            # Touch the record so we do not keep requesting it on every run
            self.db.touch_record(record)
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True

        return False
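
A small, standalone illustration (with made-up input strings) of the date normalisation performed in unpack_oai_metadata above: long date strings are truncated to YYYY-MM-DD and purely numeric dates get separators inserted.

import re


def normalize_pub_date(pub_date):
    # Truncate long date strings to YYYY-MM-DD (mirrors the regex above)
    match = re.search(r"(\d{4}[-/]?\d{2}[-/]?\d{2})", pub_date)
    if match:
        pub_date = match.group(0).replace("/", "-")
    # Insert separators into purely numeric YYYYMM / YYYYMMDD dates
    if not re.search(r"\D", pub_date):
        if len(pub_date) == 6:
            pub_date = pub_date[:4] + "-" + pub_date[4:6]
        elif len(pub_date) == 8:
            pub_date = pub_date[:4] + "-" + pub_date[4:6] + "-" + pub_date[6:8]
    return pub_date


print(normalize_pub_date("2019/06/03T00:00:00Z"))  # 2019-06-03
print(normalize_pub_date("20190603"))              # 2019-06-03
print(normalize_pub_date("201906"))                # 2019-06
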
コード例 #19
0
ファイル: oai.py プロジェクト: UB-UNIBAS/rdb-harvest
class OAIHarvester(object):
    """Downloads files from a OAI-PMH 2.0 API and stores them as xml."""

    def __init__(self, base_url: str, metadata_prefix: str, path: str,
                 base_file_name='harvest-result', user='', password='',
                 logger=logging.getLogger('oai'), encoding='iso-8859-1'):
        """
        Configure a basic connection to the OAI-Server. Sets up the sickle instance with appropriate settings
        and checks if the metadata prefix is valid. Creates a directory at path if no such path exists.

        :param base_url:        Base url for the oai request without http://
        :param metadata_prefix: Metadata prefix of the records to be harvested.
        :param path:            Directory path where the files should be stored.
        :param base_file_name:  Downloads are saved in this file. If several downloads are made the resumption token
                                or a random number is added.
        :param user:            User name for basic http authentication (unescaped)
        :param password:        Password for basic http authentication (unescaped)
        :param logger:          Logger used to log all actions and errors of this class.
        :param encoding:        The encoding used to store elements

        :raises InvalidPrefixError if the given prefix is not valid.
        """
        self.encoding = encoding
        self.logger = logger
        self.use_authentication = False
        if user != '':
            assert password != ''
            self.user = urllib.parse.quote(user)
            self.encoded_password = urllib.parse.quote(password)
            self.use_authentication = True
            self.logger.info('Uses authentication with credentials: user: %s, password: %s.',
                             self.user, self.encoded_password)
        else:
            self.logger.info('No authentication given.')

        self.url = base_url
        self.path = path
        self.base_file_name = base_file_name
        self.metadataPrefix = metadata_prefix
        self.api_response = None
        self.data = list()

        if self.use_authentication:
            self.sickle = Sickle('https://' + self.user + ':' + self.encoded_password + '@' + self.url,
                                 iterator=OAIResponseIterator)
        else:
            self.sickle = Sickle('https://' + self.url, iterator=OAIResponseIterator)

        self._verify_metadata_prefix()

        if not os.path.exists(self.path):
            self.logger.info('Create directory at %s.', self.path)
            os.makedirs(self.path)

    def _verify_metadata_prefix(self):
        """
        Verifies that the used metadata prefix is valid for this OAI repository.

        :raises InvalidPrefixError  if the given prefix is not valid.
        """
        # changes the sickle iterator to item to easily access metadata prefix.
        self.sickle.iterator = OAIItemIterator
        valid_prefix_list = list()
        metadata = self.sickle.ListMetadataFormats()
        is_valid_prefix = False
        while True:
            try:
                prefix = metadata.next().metadataPrefix
            except StopIteration:
                break
            valid_prefix_list.append(prefix)
            if prefix == self.metadataPrefix:
                is_valid_prefix = True

        if not is_valid_prefix:
            self.logger.critical('Given metadata prefix (%s) was not valid. Select one of these: %s',
                                 self.metadataPrefix, str(valid_prefix_list))
            raise InvalidPrefixError('Invalid metadataPrefix: ' + self.metadataPrefix + '.\n' +
                                     ' A list of the available prefixes: ' + str(valid_prefix_list))
        else:
            self.logger.info('The prefix given is valid.')

    def store_records(self, set_id=None, date=None, ignore_deleted=False):
        """
        Downloads all records found on the OAI-API or all records from a given set.

        :param set_id:          setSpec of the set to download; when None, all records are harvested (default None)
        :type set_id:           str
        :param date:            Only records added/changed on or after this date are downloaded (default None)
        :type date:             str 'YYYY-MM-DD'
        :param ignore_deleted:  When True, deleted records are skipped. Not every OAI archive
                                exposes the deletion status of its records.
        :type ignore_deleted:   bool
        """
        self.sickle.iterator = OAIResponseIterator
        params = {'metadataPrefix': self.metadataPrefix, 'from': date, 'set': set_id, 'ignore_deleted': ignore_deleted}
        self.api_response = self.sickle.ListRecords(**params)
        self._write_all_records()

    def store_record(self, identifier: str):
        """
        Downloads a single record with the given identifier and stores it as an XML file in the configured path.

        :param identifier: OAI identifier of the record to retrieve.
        """
        self.sickle.iterator = OAIResponseIterator
        record = self.sickle.GetRecord(identifier=identifier, metadataPrefix=self.metadataPrefix)
        temp_xml = record.raw
        with open(os.path.join(self.path, self.base_file_name + str(identifier) + '.xml'), 'w',
                  encoding=self.encoding) as file:
            file.write(temp_xml)

    def iterate_sets(self):
        """Iterate through all sets available at the OAI repository.

        :return: iterator over all sets as tuples (setSpec, setName)
        :rtype: iterator of tuple (str, str)
        """
        self.sickle.iterator = OAIItemIterator
        try:
            sets = self.sickle.ListSets()
            for s in sets:
                yield (s.setSpec, s.setName)
        except NoSetHierarchy as error:
            self.logger.warning(str(error))
            raise NoSetHierarchy(error)

    def _write_all_records(self):
        """Writes all downloaded API responses into XML files."""
        if self.api_response is None:
            self.logger.critical('No response loaded.')
            raise Exception('No response loaded.')
        record = next(self.api_response)
        last_count = 0
        while record:
            temp_xml = record.raw
            if isinstance(temp_xml, str):
                root = ElementTree.fromstring(temp_xml)
                self.data.append(root)

                # root[2] is the ListRecords element; its last child is expected to be the resumptionToken.
                download_count = len(root[2]) - 1
                last_count += download_count
                token = root[2][-1]
                file = None
                try:
                    file = open(os.path.join(self.path, self.base_file_name + '-' + token.text + '.xml'), 'w',
                                encoding=self.encoding)
                    total = int(token.get('completeListSize'))
                    self.logger.info('Downloaded %s records from repository. Still %s to go.',
                                     download_count, total - last_count)
                    file.write(temp_xml)
                    record = next(self.api_response)
                except TypeError:  # no resumption token found: this was the last (or only) response.
                    file = open(os.path.join(self.path, self.base_file_name + '-' + str(random.randrange(100000)) +
                                             '.xml'), 'w', encoding=self.encoding)
                    self.logger.info('No resumption token found. Stopping download. '
                                     'Downloaded %s records from this repository.', last_count)
                    file.write(temp_xml)
                    record = None
                except (BadArgument, BadResumptionToken) as error:
                    self.logger.critical('Stopped download: "%s"', str(error))
                    record = None
                finally:
                    if file is not None:
                        file.close()
            else:
                # Guard against a non-string payload; without this the loop would never advance.
                self.logger.warning('Skipping a response that did not contain string data.')
                record = next(self.api_response, None)
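

# A minimal usage sketch, not part of the original class: the endpoint URL, prefix, path and
# date below are placeholder assumptions, and the module-level imports of the original file
# (Sickle, the iterators, logging, os, ...) are taken as given.
if __name__ == '__main__':
    harvester = OAIHarvester(base_url='example.org/oai',   # the class prepends the https:// scheme
                             metadata_prefix='oai_dc',
                             path='./harvest/')
    # List the sets offered by the repository, then harvest everything changed since a given date.
    for set_spec, set_name in harvester.iterate_sets():
        print(set_spec, set_name)
    harvester.store_records(date='2020-01-01')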
コード例 #20
from sickle import Sickle
from sickle.oaiexceptions import IdDoesNotExist

from constants import oai_url, oai_id
from downloader import download

sickle = Sickle(oai_url)
counter = 0
skip = 0
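# Probe sequential identifiers (the oai_id prefix plus a running counter) and pass every record
# that exists to download(); give up once more than 500 consecutive identifiers yield nothing.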
while True:
    counter += 1
    oaid = oai_id + str(counter)
    try:
        record = sickle.GetRecord(identifier=oaid, metadataPrefix='oai_dc')
        if download(str(counter), str(record)):
            skip = 0
        else:
            skip += 1
    except IdDoesNotExist:
        skip += 1
    if skip > 500:
        break
print("Finished")
コード例 #21
def do_import(max_num):

    x = 0
    bit = 100 / max_num

    print(' [*] connecting to database...')
    db = database.get_db()

    print(' [*] connecting to the OAI-PMH server...')
    sickle = Sickle('https://web.e.toscana.it/SebinaOpac/OAIHandler')

    print(' [*] fetching records with prefix `oai_dc` (Dublin Core)...')
    records = sickle.ListRecords(metadataPrefix='oai_dc')

    count = 0
    array = []
    places = []

    db.execute('delete from biblios')
    db.execute('delete from records')
    db.execute('delete from places')
    db.execute("delete from sqlite_sequence where name='records'")

    query = "INSERT INTO records(title, subject, creator, contributor, date, description, language, publisher, type, format, relation, published_in, link, biblio)" \
            "VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    query2 = "INSERT INTO places(id, name, coords) VALUES(?, ?, ?)"

    print(' [*] parsing first {} records...'.format(max_num))
    for record in records:
        if count < max_num:

            # Extract the place of publication from the same record, but in Unimarc format
            unimarc = sickle.GetRecord(identifier=record.header.identifier,
                                       metadataPrefix='oai_unimarc')
            luogo = luogo_pubblicazione(unimarc)

            place_id = None

            if luogo is not None:
                coords = get_coordinates(luogo)

                # Hardcoded coordinates
                if luogo == 'Accademia dei Georgofili':
                    coords = '43.7685119,11.255005'
                    place_id = '6569185'
                elif luogo == 'Massa':
                    coords = '44.033333,10.133333'
                    place_id = '875754'
                elif luogo == 'Castel S. Niccolò':
                    coords = '43.7192741,11.5975257'
                    place_id = '31541'
                elif luogo == 'Porcari':
                    coords = '43.8419546,10.6008321'
                    place_id = '32388'
                elif luogo == 'Cascina':
                    coords = '43.6877668,10.4729074'
                    place_id = '34342'
                elif luogo == 'San Vincenzo':
                    coords = '43.100134,10.540344'
                    place_id = '32495'
                elif luogo == 'Monte Oriolo, Impruneta':
                    coords = '43.70869,11.25515'
                    place_id = '18487140'

                if coords != '':
                    if place_id is None:
                        place_id = get_page_id(luogo)

                    if place_id is not None:
                        if not any(x[0] == place_id for x in places):
                            places.append((str(place_id), luogo, coords))

                        # Increment counters
                        count += 1
                        x += bit

                        # Add the record to the insert batch
                        d = record2dict(record, place_id)
                        array.append(
                            (d['title'].strip(), d['subject'], d['creator'],
                             d['contributor'], d['date'], d['description'],
                             d['language'], d['publisher'], d['type'],
                             d['format'], d['relation'], d['published_in'],
                             d['link'], d['biblio']))
                    else:
                        print('could not find page id for ' + luogo +
                              ', skipping...')
                else:
                    print('could not find coordinates for ' + luogo +
                          ', skipping...')

            desc = "Importing records... ({}/{})".format(count, max_num)
            yield "data: {}%%{}\n\n".format(str(x), desc)
        else:
            print(' [*] closing source...')
            yield "data: {}%%{}\n\n".format('100', 'done')
            break

    print(' [*] inserting saved records to the table...')

    # Insert the collected rows into the db
    db.executemany(query, array)
    db.executemany(query2, places)
    db.commit()

    print(' [*] done!')
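

# Hedged consumption sketch, not part of the original function: the "data: ...\n\n" strings
# yielded above look like Server-Sent Events payloads, so one plausible way to use do_import()
# is to stream it from a Flask route. The app object and route name here are assumptions.
from flask import Flask, Response

app = Flask(__name__)


@app.route('/import')
def import_stream():
    # Stream progress events to the client while the first 100 records are imported.
    return Response(do_import(100), mimetype='text/event-stream')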