Ejemplo n.º 1
0
    def __init__(self, endpoint, day_granularity=False, *args, **kwargs):
        """
        Build the OAI-PMH client used by this paper source.

        :param endpoint: the address of the OAI-PMH endpoint
            to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        Any remaining positional and keyword arguments are forwarded
        to the parent constructor.

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)

        # Metadata prefixes this source can parse, in registration order.
        known_readers = (
            ('oai_dc', oai_dc_reader),
            ('base_dc', base_dc_reader),
            ('citeproc', citeproc_reader),
        )
        self.registry = MetadataRegistry()
        for prefix, reader in known_readers:
            self.registry.registerReader(prefix, reader)

        oai_client = Client(endpoint, self.registry)
        oai_client._day_granularity = day_granularity
        # Attach the API key to the client only when one is configured.
        api_key = settings.PROAIXY_API_KEY
        if api_key:
            oai_client.extra_parameters = {'key': api_key}
        self.client = oai_client

        self.translators = {}
Ejemplo n.º 2
0
def list_records(target, date_from, date_until, setspec):
    """
    Yield converted records harvested from an OAI-PMH target.

    :param target: mapping with the keys 'url', 'metadata_prefix' and
        'title'; when None nothing is yielded.
    :param date_from: lower date bound, or None for no lower bound.
    :param date_until: upper date bound, or None for no upper bound.
    :param setspec: set to restrict the harvest to, or None for all sets.

    Only the filters that are not None are passed to
    ``Client.listRecords``; each record is passed through
    ``convert_record`` before being yielded.
    """
    if target is None:
        return

    client = Client(target['url'], registry)

    # Forward only the filters that were actually supplied, so that
    # omitted ones are left out of the request entirely.
    candidates = (('from_', date_from), ('until', date_until), ('set', setspec))
    filters = dict((name, value) for name, value in candidates
                   if value is not None)
    records = client.listRecords(metadataPrefix=target['metadata_prefix'],
                                 **filters)

    if records is not None:
        for record in records:
            yield convert_record(record, target['metadata_prefix'], target['title'])
Ejemplo n.º 3
0
 def __init__(self, url):
     """Create the OAI-PMH client for ``url`` and load institutes and types."""
     # Start from empty containers; load_institutes_and_types() runs last,
     # once the client exists (presumably it fills these in -- verify).
     self.institutes = {}
     self.resource_types = []

     reader_registry = MetadataRegistry()
     reader_registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
     self.client = Client(url, reader_registry)

     self.load_institutes_and_types()
Ejemplo n.º 4
0
def init(user):
    """
    Return an OAI-PMH client for the given community.

    The endpoint address is the module-level ``URL`` with ``user``
    appended; the client reads the 'oai_dc' metadata format.
    """
    dc_registry = MetadataRegistry()
    dc_registry.registerReader('oai_dc', oai_dc_reader)
    community_client = Client(URL + user, dc_registry)
    # Only the client is created here; no request is sent by this function.
    logging.info('The community %s harvested', user)
    return community_client
Ejemplo n.º 5
0
def get_record(target, identifier):
    """
    Fetch one record from an OAI-PMH target and convert it.

    :param target: mapping with the keys 'url', 'metadata_prefix' and
        'title'; when None the function returns None.
    :param identifier: identifier of the record to fetch.
    :return: the result of ``convert_record`` for the fetched record.
    """
    if target is None:
        return None

    oai_client = Client(target['url'], registry)
    prefix = target['metadata_prefix']
    fetched = oai_client.getRecord(identifier=identifier, metadataPrefix=prefix)
    return convert_record(fetched, prefix, target['title'])
Ejemplo n.º 6
0
    def __init__(self, oaisource, day_granularity=False, *args, **kwargs):
        """
        Build the OAI-PMH client and translators for an OAI source.

        :param oaisource: the OAISource to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)
        :raises ValueError: if the source has no endpoint configured.

        Any remaining positional and keyword arguments are forwarded
        to the parent constructor.

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)

        endpoint = oaisource.endpoint
        if not endpoint:
            raise ValueError(
                'No OAI endpoint was configured for this OAI source.')

        # Metadata prefixes this source can parse, in registration order.
        reader_registry = MetadataRegistry()
        for prefix, reader in (('oai_dc', oai_dc_reader),
                               ('base_dc', base_dc_reader)):
            reader_registry.registerReader(prefix, reader)
        self.registry = reader_registry

        oai_client = Client(endpoint, reader_registry)
        oai_client._day_granularity = day_granularity
        self.client = oai_client

        # One translator per supported metadata prefix.
        self.translators = {
            'oai_dc': OAIDCTranslator(oaisource),
            'base_dc': BASEDCTranslator(oaisource),
        }
Ejemplo n.º 7
0
def harvest(url):
    """
    List every identifier exposed by an OAI-PMH endpoint and save them.

    :param url: address of the OAI-PMH endpoint to query.

    Each identifier is printed as it is found. Only the part after the
    last '/' of each identifier is written, one per line, to
    'philarchive-2.txt' next to this module. Headers are not filtered,
    so identifiers of deleted records are included as well.
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)

    client = Client(url, registry)
    client.ignoreBadCharacters(true_or_false=True)

    identifiers = []
    for header in client.listIdentifiers(metadataPrefix='oai_dc'):
        identifier = header.identifier()
        print(f"Found identifier {identifier}")
        identifiers.append(identifier)

    print(f"Total number of identifiers: {len(identifiers)}")

    # Only keep the identifier string at the end of the url
    short_identifiers = [identifier.split('/')[-1] for identifier in identifiers]

    filename = os.path.join(os.path.dirname(__file__), 'philarchive-2.txt')

    with open(filename, 'w') as f:
        print(f"Writing to {filename}")
        # write(), not writelines(): the joined text is one string, and
        # writelines() would emit it one character at a time.
        f.write('\n'.join(short_identifiers))
Ejemplo n.º 8
0
    def __init__(self,
                 url,
                 prefix=nsdl.LR_NSDL_PREFIX,
                 reader=None,
                 fields=None,
                 namespaces=None,
                 fieldMap=None):
        """
        Set up an OAI-PMH client with a single registered metadata reader.

        :param url: address of the OAI-PMH endpoint.
        :param prefix: metadata prefix the reader is registered under.
        :param reader: metadata reader to register; when None a
            ``MetadataReader`` is built from the fields and namespaces.
        :param fields: field definitions; defaults to
            ``nsdl.LR_NSDL_DC_FIELDS``.
        :param namespaces: namespace mapping; defaults to
            ``nsdl.LR_NSDL_DC_NAMESPACES``.
        :param fieldMap: field map; defaults to ``nsdl.NSDL_TO_LR_MAP``.
        """
        # Identity checks: "== None" would invoke a custom __eq__ on the
        # argument, and an empty-but-valid value must not trigger defaults.
        self._fields = nsdl.LR_NSDL_DC_FIELDS if fields is None else fields
        self._fieldMap = nsdl.NSDL_TO_LR_MAP if fieldMap is None else fieldMap
        self._namespaces = (nsdl.LR_NSDL_DC_NAMESPACES
                            if namespaces is None else namespaces)

        if reader is None:
            reader = MetadataReader(fields=self._fields,
                                    namespaces=self._namespaces)

        self._url = url
        self._registry = MetadataRegistry()
        self._prefix = prefix
        self._registry.registerReader(prefix, reader)
        self._client = Client(url, self._registry)
Ejemplo n.º 9
0
def list_sets(target):
    """
    Return the converted sets advertised by an OAI-PMH target.

    :param target: mapping with the key 'url'; when None the function
        returns None.
    :return: list with one ``convert_setspec`` result per set (empty
        when the client returns None).
    """
    if target is None:
        return None

    oai_client = Client(target['url'], registry)
    advertised = oai_client.listSets()
    if advertised is None:
        return []
    return [convert_setspec(entry) for entry in advertised]
Ejemplo n.º 10
0
def list_metadata_formats(target, identifier):
    """
    Return the converted metadata formats available from a target.

    :param target: mapping with the key 'url'; when None the function
        returns None.
    :param identifier: record identifier passed on to
        ``Client.listMetadataFormats``.
    :return: list with one ``convert_metadata_formats`` result per
        format (empty when the client returns None).
    """
    if target is None:
        return None

    oai_client = Client(target['url'], registry)
    available = oai_client.listMetadataFormats(identifier=identifier)
    if available is None:
        return []
    return [convert_metadata_formats(entry) for entry in available]
Ejemplo n.º 11
0
def list_identifiers(target, date_from, date_until, setspec):
    """
    Return the converted record headers matching the given filters.

    :param target: mapping with the keys 'url' and 'metadata_prefix';
        when None the function returns None.
    :param date_from: passed to the client as ``from_`` (None included).
    :param date_until: passed to the client as ``until`` (None included).
    :param setspec: passed to the client as ``set`` (None included).
    :return: list with one ``convert_header`` result per header (empty
        when the client returns None).
    """
    if target is None:
        return None

    oai_client = Client(target['url'], registry)
    found = oai_client.listIdentifiers(
        metadataPrefix=target['metadata_prefix'],
        from_=date_from,
        until=date_until,
        set=setspec)
    if found is None:
        return []
    return [convert_header(entry) for entry in found]
Ejemplo n.º 12
0
    def __init__(self, configuration_file):
        """
        Read the configuration file and create the OAI-PMH client.

        :param configuration_file: path handed to the config parser.

        The active configuration is fixed to 'ToulouseBis'; the endpoint
        address comes from ``self._get_config_value('url')``.
        """
        parser = ConfigParser.SafeConfigParser()
        parser.read(configuration_file)
        self.oai_config = parser
        self.current_config = 'ToulouseBis'

        dc_registry = MetadataRegistry()
        dc_registry.registerReader('oai_dc', oai_dc_reader)
        endpoint = self._get_config_value('url')
        self.client = Client(endpoint, dc_registry)
Ejemplo n.º 13
0
def test(request):
    """
    Identify a fixed OAI-PMH repository and respond with its name.

    :param request: the incoming request (not used by the body).
    :return: an ``HttpResponse`` containing the repository name from
        the Identify response.
    """
    URL = 'http://www.kulturarv.dk/ffrepox/OAIHandler'
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry)
    identifyResponse = client.identify()

    # Debug aid: show what the Identify response offers. The single-argument
    # call form prints the same under Python 2 and Python 3.
    print(dir(identifyResponse))
    return HttpResponse(identifyResponse.repositoryName())
Ejemplo n.º 14
0
Archivo: forms.py Proyecto: llcit/llt
    def clean(self):
        """
        Validate the repository base URL by sending an Identify request.

        On success the repository name reported by the server is stored
        under 'name' in the cleaned data.

        :return: the cleaned data.
        :raises ValidationError: if the Identify request fails for any
            reason.
        """
        cleaned_data = super(CreateRepositoryForm, self).clean()
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(cleaned_data.get('base_url'), registry)
            server = client.identify()
            # set the repository name apply to model instance when saved.
            cleaned_data['name'] = server.repositoryName()
        except Exception:
            # "except Exception" rather than a bare except, so that
            # KeyboardInterrupt and SystemExit are not reported as a bad URL.
            raise ValidationError('Repository base url is invalid.')

        return cleaned_data
Ejemplo n.º 15
0
def index_documents(main_url, database_name, url, reader, prefix, format,
                    batch_size=10000):
    """
    Harvest all records from an OAI-PMH endpoint and sync them in batches.

    :param main_url: passed through to ``sync_files``.
    :param database_name: passed through to ``sync_files``.
    :param url: address of the OAI-PMH endpoint.
    :param reader: metadata reader registered under ``prefix``.
    :param prefix: metadata prefix to harvest.
    :param format: callable ``format(metadata, identifier)``; a None
        result means the record is skipped.
    :param batch_size: number of formatted values collected before an
        intermediate ``sync_files`` call (default: 10000).

    ``sync_files`` is always called once more at the end with whatever
    is left in the batch, even if that is an empty list.
    """
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        # record[0] is the header, record[1] the parsed metadata.
        value = format(record[1], record[0].identifier())
        if value is not None:
            return_stuff.append(value)
        if len(return_stuff) >= batch_size:
            sync_files(main_url, database_name, return_stuff)
            return_stuff = []
    sync_files(main_url, database_name, return_stuff)
Ejemplo n.º 16
0
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.

    :param start: lower date bound, passed to the client as ``from_``.
    :param end: upper date bound, passed to the client as ``until``.
    :param set: OAI set to harvest.
    :param type: not used by the body.

    Every record is printed; records with metadata and an 'identifier'
    field are appended to the module-level ``RECORDS`` list as dicts
    holding the handle plus every field known to ``qdc_reader``.
    Progress is written to stderr ('o' for a record without metadata,
    '.' for a collected one). When ``options.store`` is set, ``RECORDS``
    is pickled to that path.
    """
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")

    # Set up metadata readers. The 'rdf', 'ore' and 'mets' formats have
    # no reader yet.
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)

    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc',
                                 from_=start,
                                 until=end,
                                 set=set)
    for (h, m, a) in records:
        # Single-argument call form: same output on Python 2 and Python 3.
        print("%s %s %s" % (h, m, a))
        if not m:
            sys.stderr.write("o")
            continue
        total = total + 1

        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue

        r = {'handle': handle[0]}
        for key in qdc_reader._fields:
            r[key] = m.getField(key)
        RECORDS.append(r)

        sys.stderr.write('.')
        sys.stderr.flush()
        num = num + 1
    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n')

    if options.store:
        # Context manager so the file is flushed and closed deterministically.
        with open(options.store, "wb") as store_file:
            pickle.dump(RECORDS, store_file)
Ejemplo n.º 17
0
    def update(self, from_date=None):
        """
        Harvest records from the OAI server and yield the ids of added ones.

        :param from_date: passed to ``listRecords`` as ``from_``.

        This is a generator: the parent ``update()`` is only called once
        the harvest loop has finished.
        """
        self._log.info('Harvesting oai server: %s' % self._url)

        # Identity reader: the raw metadata element is handed to
        # _process_record() unchanged.
        passthrough_registry = MetadataRegistry()
        passthrough_registry.registerReader(self._prefix, lambda el: el)
        oai_client = Client(self._url, passthrough_registry)

        try:
            harvested = oai_client.listRecords(
                metadataPrefix=self._prefix, from_=from_date)
            for header, element, _about in harvested:
                if self._process_record(header, element):
                    yield self._get_id(header)
        except NoRecordsMatchError:
            # The server has nothing matching the request; not an error.
            pass

        super(OAIBasedContentProvider, self).update()
Ejemplo n.º 18
0
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     """
     Yield the records served by ``baseUrl`` in the given metadata format.

     Extra keyword arguments are forwarded to ``Client.listRecords``.
     Raises NotOAIPMHBaseURLException when the Identify request fails
     with an IndexError.
     """
     request_args = dict(kwargs, metadataPrefix=metadataPrefix)
     oai_client = Client(baseUrl, self._mdRegistry)

     # Probe with Identify to check that baseUrl is an OAI-PMH target.
     try:
         oai_client.identify()
     except IndexError:
         message = (
             "{0} does not appear to be an OAI-PMH compatible base URL"
         ).format(baseUrl)
         raise NotOAIPMHBaseURLException(message)

     # Check server timestamp granularity support
     oai_client.updateGranularity()
     for item in oai_client.listRecords(**request_args):
         yield item
Ejemplo n.º 19
0
def get_client(url, transforms):
    """
    Build an OAI-PMH client whose reader is derived from ``transforms``.

    :param url: address of the OAI-PMH endpoint.
    :param transforms: passed through ``fix_transforms``; each resulting
        item must provide 'field' and 'path' entries.
    :return: tuple ``(client, namespace)`` where ``namespace`` is the
        metadata prefix the reader was registered under.
    """
    transforms = fix_transforms(transforms)
    registry = MetadataRegistry()
    c = Client(url, registry)
    metadata = c.listMetadataFormats()
    # NOTE(review): the first format reported by the server is replaced
    # with a hard-coded 'fbb' entry -- confirm this is intended.
    metadata[0] = [
        'fbb', 'http://www.kulturarv.dk/fbb/fbb.xsd',
        'http://www.kulturarv.dk/fbb'
    ]
    # Each entry is used as (prefix, schema, namespace URI).
    namespaces = dict((x[0], x[2]) for x in metadata)
    fields = dict((transform['field'], ('textList', transform['path']))
                  for transform in transforms)
    namespace = metadata[0][0]
    # Single-argument call form: same output on Python 2 and Python 3.
    print("%s %s" % (namespaces, fields))
    registry.registerReader(
        namespace, MetadataReader(fields=fields, namespaces=namespaces))
    return c, namespace
Ejemplo n.º 20
0
    def __init__(self, url=None, **kwargs):
        """
        Configure the repository's OAI-PMH client and optional SWORD service.

        :param url: deprecated full OAI address; used to derive
            ``base_url`` / ``oai_path`` when they are not both given.
        :param base_url: (keyword) base address of the repository.
        :param oai_path: (keyword) path of the OAI interface below
            ``base_url``.
        :param oai_enabled: (keyword) create the OAI client
            (default: True).
        :param sword_enabled: (keyword) create the SWORD service
            (default: False). Keyword arguments starting with 'sword_'
            are then passed to ``SwordService`` without that prefix.
        :param metadata_registry: (keyword) registry for the OAI client;
            by default one with a 'mets' reader is created.
        :raises ValueError: if no base URL is given or can be derived.

        All remaining keyword arguments are forwarded to ``Client``.
        """
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            warn(
                'The url paramater will not be supported in version 3, '
                'use base_url and oai_path instead', DeprecationWarning)

            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                # The OAI path is whatever follows base_url in the url.
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    # Strip the OAI path and the separator in front of it.
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if 'metadata_registry' not in kwargs:
            default_registry = MetadataRegistry()
            default_registry.registerReader('mets', dspace_mets_reader)
            kwargs['metadata_registry'] = default_registry

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}

            # Iterate over a snapshot of the keys: popping from the dict
            # while iterating it directly raises RuntimeError on Python 3.
            for key in list(kwargs):
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)

            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client('/'.join((
                self.base_url,
                self.oai_path,
            )), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)
Ejemplo n.º 21
0
def processItems():
    oai_oi_reader = MetadataReader(
        fields={
            'title': ('textList', 'oai_oi:oi/oi:title/text()'),
            'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
            'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
            'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
            'description': ('textList', 'oai_oi:oi/oi:description/text()'),
            'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
            'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
            'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
            'date': ('textList', 'oai_oi:oi/oi:date/text()'),
            'type': ('textList', 'oai_oi:oi/oi:type/text()'),
            'extent': ('textList', 'oai_oi:oi/oi:extend/text()'),
            'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
            'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
            'source': ('textList', 'oai_oi:oi/oi:source/text()'),
            'language': ('textList', 'oai_oi:oi/oi:language/text()'),
            'references': ('textList', 'oai_oi:oi/oi:references/text()'),
            'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
            'attributionName':
            ('textList', 'oai_oi:oi/oi:attributionName/text()'),
            'attributionURL':
            ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
            'license': ('textList', 'oai_oi:oi/oi:license/text()'),
            #Zitten er niet in
            #'rights':      ('textList', 'oai_oi:oi/oi:rights/text()'),
            #'relation':    ('textList', 'oai_oi:oi/oi:relation/text()'),
            #'coverage':    ('textList', 'oai_oi:oi/oi:coverage/text()'),
            #'format':      ('textList', 'oai_oi:oi/oi:format/text()'),
        },
        namespaces={
            'oi': 'http://www.openbeelden.nl/oai/',
            'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms',
        })
    url = u'http://www.openbeelden.nl/feeds/oai/'

    registry = MetadataRegistry()
    registry.registerReader('oai_oi', oai_oi_reader)
    client = Client(url, registry)

    for record in client.listRecords(metadataPrefix='oai_oi'):
        processItem(record)
Ejemplo n.º 22
0
    def checkProvider(self, url):
        """
        Check whether an OAI-PMH provider is online.

        A successful Identify response means the provider is online
        (returns True); any exception means it is offline (returns
        False, after logging an error).
        """
        try:
            identity = Client(url).identify()
            self.log.debug("Service at: " + url + " is responding")
            self.log.debug("RepositoryName is: " + identity.repositoryName())
            self.log.debug("BaseURL is: " + identity.baseURL())
        except Exception:
            # The exception details are deliberately not logged here.
            self.log.error("Problem with server at: " + url + "\n")
            return False
        return True
Ejemplo n.º 23
0
    def iter_items(self, partition):
        """
        Yield one HTML item per usable record of an OAI-PMH endpoint.

        ``partition`` is the endpoint address. Deleted records and
        records without an 'identifier' value are skipped. Each yielded
        tuple holds the first identifier, an empty dict, "html", 2 and a
        small HTML page built from the first title and description.
        """
        dc_registry = MetadataRegistry()
        dc_registry.registerReader('oai_dc', oai_dc_reader)
        oai_client = Client(partition, dc_registry)

        for header, metadata, _ in oai_client.listRecords(metadataPrefix='oai_dc'):
            if header.isDeleted():
                continue

            meta = metadata.getMap()

            # TODO: there are much validation and heuristics to be done here!
            # (for example filtering on the record's "format" value)

            # First value of each field, with a fallback when it is missing.
            link = (meta.get("identifier") or [None])[0]
            if not link:
                continue

            title = (meta.get("title") or [""])[0].encode("utf-8")
            summary = (meta.get("description") or [""])[0].encode("utf-8")

            # TODO: validate that the link is not on another domain?!
            yield link, {}, "html", 2, """
                <html><head><title>%s</title></head><body>%s</body></html>
            """ % (title, summary)
Ejemplo n.º 24
0
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     """
     Yield ``(header, metadata, about)`` tuples served by ``baseUrl``.

     Extra keyword arguments are forwarded to ``Client.listRecords``.
     Raises NotOAIPMHBaseURLException when the Identify request fails
     with an IndexError.
     """
     request_args = dict(kwargs, metadataPrefix=metadataPrefix)
     oai_client = Client(baseUrl, self._mdRegistry)

     # Probe with Identify to check that baseUrl is an OAI-PMH target.
     try:
         oai_client.identify()
     except IndexError:
         message = (
             "{0} does not appear to be an OAI-PMH compatible base URL"
         ).format(baseUrl)
         raise NotOAIPMHBaseURLException(message)

     # Check server timestamp granularity support
     oai_client.updateGranularity()
     for header, metadata, about in oai_client.listRecords(**request_args):
         # Unit test hotfix: pyoai may return the text of a bytes literal
         # ("b'...'") on Python 3; evaluate it back to bytes and decode it.
         is_bytes_repr = isinstance(metadata, str) and metadata.startswith("b'")
         if is_bytes_repr:
             metadata = ast.literal_eval(metadata).decode("utf-8")
         yield (header, metadata, about)
Ejemplo n.º 25
0
 def _initialise_client(self, url):
     """Return an OAI-PMH client for ``url`` that reads 'oai_dc' and 'ore'."""
     reader_registry = MetadataRegistry()
     for prefix, reader in (('oai_dc', oai_dc_reader), ('ore', oai_ore_reader)):
         reader_registry.registerReader(prefix, reader)
     logging.info('Initialising OAI client with URL [%s]', url)
     return Client(url, reader_registry)
Ejemplo n.º 26
0
def indexCollection(URL, url_base, metadata_prefix, collection, action):
    """
    Harvest one collection over OAI-PMH and bulk-index it in Elasticsearch.

    :param URL: address of the OAI-PMH endpoint.
    :param url_base: prefix used to build each record's 'image_url_base'.
    :param metadata_prefix: metadata prefix to harvest.
    :param collection: OAI set to harvest.
    :param action: when equal to 'reindex', the
        'digital_collection_recs' index is deleted and recreated with an
        explicit mapping before the documents are indexed.
    :return: the string "success".
    """
    # pull data from OAI endpoint
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry, force_http_get=True)

    harvested_data = []
    for record in client.listRecords(metadataPrefix=metadata_prefix,
                                     set=collection):
        header, metadata = record[0], record[1]
        if header.isDeleted():
            continue

        fields = metadata.getMap()
        # .get(): a record without a 'subject' entry must not raise KeyError.
        if fields.get('subject'):
            # Only the first subject value is split on ';'.
            fields['subjects'] = fields.pop('subject')[0].split(';')
        fields['set'] = header.setSpec()
        # The third ':'-separated part of the OAI identifier is used.
        identifier = header.identifier().split(':')[2]
        fields['image_url_base'] = (
            url_base + '/digital/iiif/' + identifier + '/')
        harvested_data.append(fields)

    # Compare by value: "is" tests object identity, which is not
    # guaranteed to hold for two equal strings.
    if action == 'reindex':
        es.indices.delete(index='digital_collection_recs', ignore=[400, 404])

        field_names = (
            "title", "creator", "subjects", "description", "publisher",
            "contributor", "date", "type", "format", "identifier", "source",
            "language", "relation", "coverage", "rights", "set",
            "image_url_base",
        )
        # Fields that additionally get "fielddata" switched on.
        fielddata_fields = ("type", "format", "language", "set")

        properties = {}
        for field_name in field_names:
            properties[field_name] = {"type": "text"}
            if field_name in fielddata_fields:
                properties[field_name]["fielddata"] = "true"

        mapping = {"mappings": {"_doc": {"properties": properties}}}
        es.indices.create(index='digital_collection_recs', body=mapping)

    helpers.bulk(es,
                 harvested_data,
                 index='digital_collection_recs',
                 doc_type='_doc')

    return "success"
Ejemplo n.º 27
0
    """Returns the PyMARC record from the OAI structure for MARC XML"""
    def __call__(self, element):
        """
        Parse the first child of ``element`` as MARC XML.

        :param element: XML element whose first child holds the record.
        :return: the first record collected by the MARC XML handler.
        """
        # Debug aid; the single-argument call form prints the same under
        # Python 2 and Python 3.
        print(element[0][1].text)
        handler = marcxml.XmlHandler()
        marcxml.parse_xml(StringIO(tostring(element[0])), handler)
        return handler.records[0]


marcxml_reader = MARCXMLReader()

# Register the metadata readers in the registry.

from oaipmh import metadata

registry = metadata.MetadataRegistry()
registry.registerReader('marc21', marcxml_reader)

# OAI-PMH client processing: list the 'marc21' records of set 'MZK03'
# and print a few MARC fields of each one.

oai = Client('http://snape.mzk.cz/OAI-script', registry)

recs = oai.listRecords(metadataPrefix='marc21', set='MZK03')

# The single-argument print(...) form behaves the same on Python 2 and 3.
for rec in recs:
    print(rec[0].identifier())
    r = rec[1]  # Get pyMARC representation
    print(r['856'])
    print(r['034'])
    print(r['008'])
    print("")
Ejemplo n.º 28
0
def harvest(metadata_set, dest_folder, log_file, content_type,
            from_date, until_date):
    """
    Harvest the EFG records of one set and save the matching ones as XML.

    :param metadata_set: name of the OAI set to harvest.
    :param dest_folder: folder the XML files are written to (created
        when missing).
    :param log_file: path of the JSON report written at the end.
    :param content_type: when not None, only records whose item type
        equals it (case-insensitively) are saved.
    :param from_date: lower date bound in a form ``parse_date`` accepts,
        or None.
    :param until_date: upper date bound in a form ``parse_date``
        accepts, or None.

    Only records carrying the keyword term "IMediaCities" are kept. Each
    saved file is named ``<set>_<sourceID>_<timestamp>.xml``. The report
    lists the counters, the saved file names and the titles of records
    skipped for a missing source id or a wrong content type.

    NOTE(review): ``log.exit`` is assumed to terminate the process --
    confirm, since the code after each call relies on that.
    """
    #############################
    # ### FILESYSTEM CHECKS ### #
    #############################
    try:
        if not os.path.isdir(dest_folder):
            os.makedirs(dest_folder)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create destination folder: %s" % dest_folder)

    # Verify write permission inside the folder by creating and removing
    # a throw-away sub-directory.
    try:
        test_path = os.path.join(dest_folder, '__test_permissions__')
        os.makedirs(test_path)
        os.rmdir(test_path)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to use destination folder: %s" % dest_folder)

    # Verify that the report file can be opened for writing.
    try:
        log_handle = open(log_file, 'a+')
        log_handle.close()
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create log_file: %s" % log_file)

    #################################
    # ### OAI-PMH CONFIGURATION ### #
    #################################
    URL = 'https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do'
    metadata_prefix = 'efg'

    ###################################
    # ### OPEN OAI-PMH CONNECTION ### #
    ###################################
    registry = MetadataRegistry()
    registry.registerReader(metadata_prefix, oai_dc_reader)

    client = Client(URL, registry)

    ####################################
    # ### CHECK IF THIS SET EXISTS ### #
    ####################################
    set_found = False
    for s in client.listSets():
        if metadata_set == s[0]:
            set_found = True

    if not set_found:
        log.exit("Unable to find this set: %s" % metadata_set)

    #############################
    # ### RETRIEVE METADATA ### #
    #############################

    if from_date is not None:
        from_date = parse_date(from_date)
        if from_date is None:
            log.exit("Unable to convert from date")

    if until_date is not None:
        until_date = parse_date(until_date)
        if until_date is None:
            log.exit("Unable to convert until date")

    # The requested type does not change per record: normalise it once.
    if content_type is not None:
        content_type = content_type.lower()

    filter_keyword = "IMediaCities"

    report_data = {
        'downloaded': 0,
        'filtered': 0,
        'saved': 0,
        'saved_files': [],
        'missing_sourceid': [],
        'wrong_content_type': []
    }
    timestamp = int(1000 * time.time())
    log.info("Retrieving records for %s..." % metadata_set)
    try:
        records = client.listRecords(
            metadataPrefix=metadata_prefix,
            set=metadata_set,
            from_=from_date,
            until=until_date)
    except NoRecordsMatchError as e:
        log.exit(e)

    log.info("Records retrieved, extracting...")
    try:

        for record in records:
            # The element tree is namespaced XML, hence the tag() helper
            # around every name passed to find() / findall().
            element = record[1].element()

            report_data['downloaded'] += 1

            # Progress output: a dot every 100 records, a summary line
            # every 5000.
            if report_data['downloaded'] % 100 == 0:
                print('.', end='', flush=True)

                if report_data['downloaded'] % 5000 == 0:
                    print(
                        ' %s downloaded - %s saved' % (
                            report_data['downloaded'],
                            report_data['saved']
                        ), flush=True)

            efgEntity = element.find(tag("efgEntity"))
            if efgEntity is None:
                continue
            avcreation = efgEntity.find(tag("avcreation"))
            nonavcreation = efgEntity.find(tag("nonavcreation"))

            if avcreation is not None:
                manifestation = avcreation.find(tag("avManifestation"))
                recordSource = avcreation.find(tag("recordSource"))
                keywords = avcreation.findall(tag("keywords"))
                title_el = avcreation.find(tag("identifyingTitle"))
                title = (title_el.text
                         if title_el is not None
                         else "Unknown title")
            elif nonavcreation is not None:
                manifestation = nonavcreation.find(tag("nonAVManifestation"))
                recordSource = nonavcreation.find(tag("recordSource"))
                keywords = nonavcreation.findall(tag("keywords"))
                title_el = nonavcreation.find(tag("title"))
                # A <title> without a nested <text> must not crash the run.
                text_el = (title_el.find(tag("text"))
                           if title_el is not None
                           else None)
                title = (text_el.text
                         if text_el is not None
                         else "Unknown title")
            else:
                # Neither avcreation nor nonavcreation: skip the record.
                continue

            # Keep only records carrying the filter keyword; a keyword
            # group without a <term> child simply does not match.
            is_good = False
            for keyword in keywords:
                term = keyword.find(tag("term"))
                if term is not None and term.text == filter_keyword:
                    is_good = True
                    break

            if not is_good:
                continue

            report_data['filtered'] += 1

            if manifestation is None:
                report_data['missing_sourceid'].append(title)
                continue

            if content_type is not None:
                item = manifestation.find(tag("item"))
                if item is None:
                    # missing <item> => type cannot be found
                    report_data['wrong_content_type'].append(title)
                    continue

                item_type = item.find(tag("type"))
                if item_type is None:
                    # missing <type>
                    report_data['wrong_content_type'].append(title)
                    continue

                # An empty <type> has text None; treat it as a wrong type.
                if (item_type.text or "").lower() != content_type:
                    # wrong type
                    report_data['wrong_content_type'].append(title)
                    continue

            # WARNING: the sourceID must be taken from the recordSource
            # under avcreation/nonavcreation and NOT from the one under
            # avManifestation/nonAVManifestation.
            if recordSource is None:
                report_data['missing_sourceid'].append(title)
                continue

            sourceID = recordSource.find(tag("sourceID"))
            # An empty <sourceID> (text None) counts as missing as well.
            if sourceID is None or sourceID.text is None:
                report_data['missing_sourceid'].append(title)
                continue

            content = etree.tostring(efgEntity, pretty_print=False)

            # replace non alpha-numeric characters with a dash
            id_text = re.sub(r'[\W_]+', '-', sourceID.text.strip())

            filename = "%s_%s_%s.xml" % (
                metadata_set,
                id_text,
                timestamp
            )
            filepath = os.path.join(dest_folder, filename)
            with codecs.open(filepath, 'wb', "utf-8") as f:
                f.write(content.decode('utf-8'))

            report_data['saved'] += 1
            report_data['saved_files'].append(filename)

    except NoRecordsMatchError as e:
        log.warning("No more records after filtering?")
        log.warning(e)

    # ###################
    # Write report file
    # ###################
    # The report holds the results of the harvest, including the titles
    # of the records that do not contain a record ID.
    with open(log_file, 'w+') as f:
        json.dump(report_data, f)

    # Just to close previous dot line
    print("")

    log.info("""

%s records from set [%s] downloaded
open log file [%s] for details
""" % (report_data['saved'], metadata_set, log_file)
    )
def get_names(dataname):
    """Harvest creator/contributor names for one OAI-PMH set.

    Lists the EDM records of the set ``dataname`` from the
    ``https://data.jhn.ngo/oai`` endpoint and collects, for every record,
    the first ``dc:creator`` and the first ``dc:contributor`` value together
    with the record's ``rdf:about`` identifier. Records without metadata
    (e.g. deleted records) are skipped.

    The collected names are printed to stdout as a side effect.

    NOTE(review): the harvest only runs when this module is executed as a
    script (``__name__ == "__main__"``); when the module is imported the
    function prints and returns empty lists. The guard is preserved so that
    import-time behaviour does not change, but it looks unintentional --
    confirm with the author.

    :param dataname: setSpec of the dataset to harvest, passed as ``set``
        to ``listRecords``.
    :returns: list of ``[name, object_id]`` pairs in harvest order; for each
        record the creator pair (if any) precedes the contributor pair.
    """
    record_prefix = "rdf:RDF/edm:ProvidedCHO"
    # Modify/add XPath mappings to get other fields and other objects
    # (agent, place etc.)

    edm_reader = MetadataReader(
        fields={
            'title': ('textList', record_prefix + '/dc:title/text()'),
            'creator': ('textList', record_prefix + '/dc:creator/text()'),
            'subject': ('textList', record_prefix + '/dc:subject/text()'),
            'description':
            ('textList', record_prefix + '/dc:description/text()'),
            'publisher': ('textList', record_prefix + '/dc:publisher/text()'),
            'contributor':
            ('textList', record_prefix + '/dc:contributor/text()'),
            'date': ('textList', record_prefix + '/dc:date/text()'),
            'type': ('textList', record_prefix + '/dc:type/text()'),
            'format': ('textList', record_prefix + '/dc:format/text()'),
            'identifier':
            ('textList', record_prefix + '/dc:identifier/text()'),
            'source': ('textList', record_prefix + '/dc:source/text()'),
            'language': ('textList', record_prefix + '/dc:language/text()'),
            'relation': ('textList', record_prefix + '/dc:relation/text()'),
            'coverage': ('textList', record_prefix + '/dc:coverage/text()'),
            'rights': ('textList', record_prefix + '/dc:rights/text()'),
            # NOTE(review): ``spatial`` is normally a dcterms term, not a
            # dc element -- verify this XPath against the endpoint's output.
            'spatial': ('textList', record_prefix + '/dc:spatial/text()'),
            'objectId': ('textList', record_prefix + '/@rdf:about'),
        },
        namespaces={
            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms/',
            'dct': 'http://purl.org/dc/terms/',
            'edm': 'http://www.europeana.eu/schemas/edm/',
            'foaf': 'http://xmlns.com/foaf/0.1/',
            'owl': 'http://www.w3.org/2002/07/owl#',
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
            'skos': 'http://www.w3.org/2004/02/skos/core#',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'ore': 'http://www.openarchives.org/ore/terms/'
        })

    names = []
    identifier = []

    if __name__ == "__main__":

        URL = 'https://data.jhn.ngo/oai'

        registry = MetadataRegistry()
        registry.registerReader('edm', edm_reader)
        client = Client(URL, registry)
        # To harvest specific dataset, use "set" parameter: set='AIUJE1_MARC21'

        for record in client.listRecords(metadataPrefix='edm', set=dataname):
            metadata = record[1]
            if metadata is None:
                # Deleted records come back with a header but no metadata;
                # calling getMap() on them would raise AttributeError.
                continue
            output = metadata.getMap()

            # Same treatment for both roles: keep only the first value and
            # pair it with the record identifier.
            for role in ('creator', 'contributor'):
                values = output[role]
                if values:
                    names.append([values[0]])
                    identifier.append([values[0], output['objectId'][0]])

    print(names)

    return identifier
Ejemplo n.º 30
0
#from oaipmh.metadata import MetadataRegistry, oai_dc_reader
from oai2es.oaipmh_harvester import MetadataRegistry, mods_reader, didl_reader, oai_dc_reader

# Command line: <endpoint URL> <metadataPrefix> [<setSpec>]
URL = sys.argv[1]
METADATA_PREFIX = sys.argv[2]
# The set is only taken into account when exactly three arguments are given.
SETSPEC = sys.argv[3] if len(sys.argv) == 4 else None

# Only the MODS reader is registered; the DIDL and oai_dc readers imported
# above are currently left unregistered.
registry = MetadataRegistry()
registry.registerReader('mods', mods_reader)

client = Client(URL, registry)

# Running totals updated by the harvesting loop below.
record_count = deleted_count = 0

# Restrict the harvest to SETSPEC when one was supplied, otherwise list
# every record available for the requested metadata prefix.
records = (
    client.listRecords(metadataPrefix=METADATA_PREFIX, set=SETSPEC)
    if SETSPEC
    else client.listRecords(metadataPrefix=METADATA_PREFIX)
)

for num, record in enumerate(records):
    record_count += 1
    delinfo = ''
    if record[0].isDeleted():
        deleted_count += 1
        delinfo = '(deleted)'