Example #1
0
def list_records(target, date_from, date_until, setspec):
    """Yield converted OAI-PMH records harvested from *target*.

    target: dict with 'url', 'metadata_prefix' and 'title' keys, or None
            (in which case nothing is yielded).
    date_from / date_until / setspec: optional selective-harvesting
            arguments; only those that are not None are forwarded.
    """
    if target is not None:
        client = Client(target['url'], registry)
        # Build the optional arguments in one place instead of enumerating
        # every from/until/set combination in its own branch.
        kwargs = {'metadataPrefix': target['metadata_prefix']}
        if date_from is not None:
            kwargs['from_'] = date_from
        if date_until is not None:
            kwargs['until'] = date_until
        if setspec is not None:
            kwargs['set'] = setspec
        records = client.listRecords(**kwargs)

        if records is not None:
            for record in records:
                yield convert_record(record, target['metadata_prefix'], target['title'])
Example #2
0
def list_records(target, date_from, date_until, setspec):
    """Yield converted OAI-PMH records harvested from *target*.

    Identical to the silent variant but logs entry for debugging.
    Only the selective-harvesting arguments that are not None are
    forwarded to the client.
    """
    logging.debug("list_records")
    if target is not None:
        client = Client(target['url'], registry)
        # Collect optional arguments once; avoids the 8-way if/elif
        # cascade over every from/until/set combination.
        kwargs = {'metadataPrefix': target['metadata_prefix']}
        if date_from is not None:
            kwargs['from_'] = date_from
        if date_until is not None:
            kwargs['until'] = date_until
        if setspec is not None:
            kwargs['set'] = setspec
        records = client.listRecords(**kwargs)

        if records is not None:
            for record in records:
                yield convert_record(record, target['metadata_prefix'], target['title'])
def list_records(target, date_from, date_until, setspec):
    """Return a list of converted records harvested from *target*.

    Returns None when *target* is None (preserving the original implicit
    behaviour); otherwise a possibly-empty list.
    """
    if target is not None:
        client = Client(target['url'], registry)
        # Forward only the selective-harvesting arguments that were given,
        # replacing the branch-per-combination cascade.  Also drops the
        # redundant double initialisation of the results list.
        kwargs = {'metadataPrefix': target['metadata_prefix']}
        if date_from is not None:
            kwargs['from_'] = date_from
        if date_until is not None:
            kwargs['until'] = date_until
        if setspec is not None:
            kwargs['set'] = setspec
        records = client.listRecords(**kwargs)

        results = []
        if records is not None:
            results = [convert_record(record, target['metadata_prefix'], target['title'])
                       for record in records]
        return results
Example #4
0
    def scrape(self):
        """Harvest oai_dc records from the configured PMH endpoint and
        pickle them to the configured data file.

        NOTE(review): the raise below makes the rest of the method dead
        code -- presumably left in place until the implementation is
        trusted; remove it to enable scraping.
        """
        raise Exception("not finished")
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        url = self.setting('pmh-endpoint')
        client = Client(url, registry)

        # Report which sets the repository offers (Python 2 print syntax).
        print "  OAI Repository", url
        print "  Available sets:"
        for s in client.listSets():
            print "   ", s

        # Optional selective-harvesting settings.
        oai_set = self.setting('set')
        oai_from = self.setting('from')
        oai_until = self.setting('until')

        kwargs = {}

        if oai_set:
            kwargs['set'] = oai_set

        # 'from'/'until' are dash-separated date strings, e.g. "2014-01-31";
        # convert them to datetime objects for pyoai.
        if oai_from is not None:
            date_args = [int(arg) for arg in oai_from.split("-")]
            kwargs['from_'] = datetime.datetime(*date_args)

        if oai_until is not None:
            date_args = [int(arg) for arg in oai_until.split("-")]
            kwargs['until'] = datetime.datetime(*date_args)

        # Materialise the full record list so it can be pickled in one go.
        records = [r for r in client.listRecords(metadataPrefix='oai_dc', **kwargs)]

        data_filepath = os.path.join(self.work_dir(), self.setting('data-file'))
        with open(data_filepath, 'wb') as f:
            print "  picking", len(records), "records"
            pickle.dump(records, f)
Example #5
0
def insertAll(time, time2):
    """Harvest every arXivRaw record created between *time* and *time2*
    and store each one via insert(), printing progress as it goes.

    NOTE(review): the parameter name 'time' shadows the stdlib module
    name, and 'list' below shadows the builtin -- kept as-is.
    """
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    client.updateGranularity()
    list = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for a in list:
        #a = list.next()
        try:
            # a[1] is the parsed arXivRaw metadata map; fields are lists
            # of strings, hence the joins and [0] indexing below.
            title = '\n'.join(a[1]['title'])
            sr2 = str(' '.join(a[1]['categories']).replace('-','_')).split(' ')
            abstract = '\n'.join(a[1]['abstract'])
            url = 'http://arxiv.org/abs/' + a[1]['id'][0]
            date = datetime.strptime(a[1]['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = a[1]['authors'][0]# '; '.join(a[1]['keynames'])
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print title
            print sr2
            print abstract
            print url
            print date
            print authors
            insert(title + ' (' + authors + ')', str("fullarxiv"), url, abstract, date=date, cross_srs=sr2)
        except:
            # Bare except keeps the harvest going past malformed records;
            # errors are only counted, not re-raised.
            print 'ERROR'
            print a
            errors = errors+1
    print 'Completed with %s errors' % errors
def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    """Harvest oai_dc metadata for one arXiv subject set.

    Collects every record between *start* and *end*, pausing
    *sleep_time* seconds after each record, and returns a list of
    plain dicts with title/abstract/date/subject/url/authors keys.
    """
    base_url = "http://export.arxiv.org/oai2"
    output = list()

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    records = client.listRecords(metadataPrefix='oai_dc',
                                 set="{}".format(subject),
                                 from_=start, until=end)

    # Output key -> oai_dc source field it is copied from.
    field_map = (("title", "title"),
                 ("abstract", "description"),
                 ("date", "date"),
                 ("subject", "subject"),
                 ("url", "identifier"),
                 ("authors", "creator"))

    for _, md, _ in records:
        # Some records (e.g. from 2010) carry no metadata payload at all.
        if md is not None:
            output.append({key: md[src] for key, src in field_map})
        time.sleep(sleep_time)

    return output
Example #7
0
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     """Yield every record that *baseUrl* serves for *metadataPrefix*.

     Extra keyword arguments (set, from_, until, ...) are forwarded to
     pyoai's Client.listRecords, which handles resumption tokens.
     """
     kwargs['metadataPrefix'] = metadataPrefix
     harvester = Client(baseUrl, metadata_registry)
     # Let pyoai discover which timestamp granularity the server supports.
     harvester.updateGranularity()
     for item in harvester.listRecords(**kwargs):
         yield item
def pull_data(source):
    """Harvest public experiment metadata from *source* via OAI-PMH.

    Each experiment's Dublin Core record is merged with its owner's user
    profile and serialised to JSON.

    Returns a list of JSON strings, or None when the source has no
    public records.

    Raises:
        OAIPMHError: when a record cannot be read from the source.
    """
    list_of_records = []
    # Get list of public experiments at source.  One registry/reader
    # pair suffices (the original constructed it twice).
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(source
                    + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc", registry)
    try:
        exps_date = []
        exps_metadata = []
        for (header, meta, extra) in client.listRecords(metadataPrefix='oai_dc'):
            exps_date.append(str(header._datestamp))
            exps_metadata.append(meta)
            logger.debug('Date=%s' % header._datestamp)

    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        # Not an error: the source simply has nothing public.
        msg = "no public records found on source %s" % e
        logger.warn(msg)
        return

    exp_counter = 0
    for exp_metadata in exps_metadata:
        # Merge the owner's profile into the document to be indexed.
        user_id = exp_metadata.getField('creator')[0]
        user_profile = json.loads(_get_user(source, user_id))
        data_tobe_indexed = dict(user_profile)
        data_tobe_indexed['user_id'] = user_id

        exp_id = exp_metadata.getField('identifier')[0]
        description = exp_metadata.getField('description')[0]
        title = exp_metadata.getField('title')[0]
        # EXPERIMENT_PATH must be relative or os.path.join would discard
        # the source prefix.
        if settings.EXPERIMENT_PATH[0] == '/':
            settings.EXPERIMENT_PATH = settings.EXPERIMENT_PATH[1:]
        experiment_url = os.path.join(source,
                                      settings.EXPERIMENT_PATH % exp_id)

        data_tobe_indexed['experiment_id'] = exp_id
        data_tobe_indexed['experiment_title'] = title
        data_tobe_indexed['experiment_description'] = description
        data_tobe_indexed['experiment_url'] = experiment_url
        data_tobe_indexed['id'] = experiment_url
        data_tobe_indexed['experiment_date'] = exps_date[exp_counter]
        exp_counter += 1
        for k, v in data_tobe_indexed.items():
            logger.debug('%s = %s' % (k, v))
        logger.debug('')
        list_of_records.append(json.dumps(data_tobe_indexed))

    return list_of_records
Example #9
0
    def harvest_oai_collection_records(self, collection):
        """Harvest the oai_dc records of one collection set.

        Returns the (lazy) record iterator from pyoai, or None when the
        repository cannot be reached or the collection data is malformed.
        """
        records = []
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(collection.community.repository.base_url, registry)
            records = client.listRecords(
                metadataPrefix='oai_dc', set=collection.identifier)
        except Exception:
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # still propagate; any harvesting failure yields None.
            return

        return records
Example #10
0
def index_documents(main_url, database_name, url, reader, prefix, format):
    """Harvest records from *url* and push formatted documents to *main_url*.

    Each record is converted by the *format* callable (note: the name
    shadows the builtin but is part of the existing call interface);
    conversions returning None are skipped.
    """
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        r = record[1]
        value = format(r, record[0].identifier())
        # Fix: identity comparison against None, not '!='.
        if value is not None:
            return_stuff.append(value)
        # Flush in batches of 10000 to bound memory use.
        if len(return_stuff) >= 10000:
            sync_files(main_url, database_name, return_stuff)
            return_stuff = []
    # Push whatever remains after the final partial batch.
    sync_files(main_url, database_name, return_stuff)
Example #11
0
 def read_base_records(self):
     """Yield filter_url() results for every BASE record whose open-access
     status BASE itself was unsure about (oa flag contains '2')."""
     registry = MetadataRegistry()
     registry.registerReader('base_dc', base_dc_reader)
     client = Client('http://doai.io/oai', registry)
     for header, record, _ in client.listRecords(metadataPrefix='base_dc'):
         # Skip records where BASE was confident either way.
         if '2' not in record['oa']:
             continue
         oa_flags = ''.join(record['oa'])
         # Every identifier of the record is a candidate splash URL.
         for link in record['identifier']:
             metadata = {'base_oa': oa_flags,
                         'splash_url': link,
                         'from_identifier': header.identifier()}
             yield self.filter_url(link, metadata, looking_for='any')
Example #12
0
def index_documents(main_url, database_name, url, reader, prefix, format):
    """Harvest and format records from *url*, syncing them to *main_url*
    in batches of 10000.

    *format* (shadows the builtin; kept for interface compatibility) maps
    (metadata, identifier) to a document or None; None results are dropped.
    """
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        r = record[1]
        value = format(r, record[0].identifier())
        # Fix: 'is not None' instead of '!= None'; also drops the stray
        # trailing whitespace the original carried.
        if value is not None:
            return_stuff.append(value)
        if len(return_stuff) >= 10000:
            sync_files(main_url, database_name, return_stuff)
            return_stuff = []
    # Final partial batch.
    sync_files(main_url, database_name, return_stuff)
Example #13
0
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.

    """
    # NOTE(review): 'set' and 'type' shadow builtins; kept because they
    # are part of the existing call interface.
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")

    #
    # Set up metadata readers
    #
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)
    # registry.registerReader('rdf', rdf_reader)   # no reader yet
    # registry.registerReader('ore', ore_reader)   # no reader yet
    # registry.registerReader('mets', mets_reader) # no reader yet

    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc',
                                 from_=start,
                                 until=end,
                                 set=set)
    for (h, m, a) in records:
        print h, m, a
        # "o" on stderr marks a record with no metadata payload.
        if not m:
            sys.stderr.write("o")
            continue
        total = total + 1

        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue

        # Flatten the handle plus every qdc field into one dict per record.
        r = dict({'handle': handle[0]})
        for key in qdc_reader._fields.keys():
            r[key] = m.getField(key)
        RECORDS.append(r)

        # "." on stderr marks each successfully collected record.
        sys.stderr.write('.')
        sys.stderr.flush()
        num = num + 1
    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n')

    if options.store:
        pickle.dump(RECORDS, open(options.store, "wb"))
Example #14
0
    def update(self, from_date=None):
        """Incrementally harvest self._url and yield the id of each record
        that _process_record() reports as newly added."""
        self._log.info('Harvesting oai server: %s' % self._url)
        registry = MetadataRegistry()
        # Identity reader: hand the raw XML element through untouched.
        registry.registerReader(self._prefix, lambda el: el)

        client = Client(self._url, registry)
        try:
            for hdr, elem, _about in client.listRecords(
                    metadataPrefix=self._prefix, from_=from_date):
                added = self._process_record(hdr, elem)
                if added:
                    yield self._get_id(hdr)
        except NoRecordsMatchError:
            # An empty result set is a normal outcome, not a failure.
            pass

        super(OAIBasedContentProvider, self).update()
Example #15
0
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     """Yield records from *baseUrl*, validating the endpoint first.

     Raises NotOAIPMHBaseURLException when *baseUrl* does not answer an
     Identify request like an OAI-PMH server would.
     """
     kwargs['metadataPrefix'] = metadataPrefix
     harvester = Client(baseUrl, self._mdRegistry)
     try:
         # A non-OAI-PMH endpoint makes pyoai's Identify parsing fail
         # with an IndexError.
         harvester.identify()
     except IndexError:
         raise NotOAIPMHBaseURLException(
             "{0} does not appear to be an OAI-PMH compatible base URL"
             "".format(baseUrl))
     # Discover the server's timestamp granularity before harvesting.
     harvester.updateGranularity()
     for item in harvester.listRecords(**kwargs):
         yield item
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.

    """    
    # NOTE(review): 'set' and 'type' shadow builtins; kept because they
    # are part of the existing call interface.
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")

    #
    # Set up metadata readers
    #
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)
    # registry.registerReader('rdf', rdf_reader)   # no reader yet
    # registry.registerReader('ore', ore_reader)   # no reader yet
    # registry.registerReader('mets', mets_reader) # no reader yet

    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc',
                                 from_=start, until=end, set=set)
    for (h, m, a) in records:
        print h, m, a
        # "o" on stderr marks a record with no metadata payload.
        if not m:
            sys.stderr.write("o")
            continue
        total = total + 1
        
        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue

        # Flatten the handle plus every qdc field into one dict per record.
        r = dict({ 'handle' : handle[0] })
        for key in qdc_reader._fields.keys():
           r[key] = m.getField(key)
        RECORDS.append(r)

        # "." on stderr marks each successfully collected record.
        sys.stderr.write('.')
        sys.stderr.flush()
        num = num + 1
    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n');

    if options.store:
        pickle.dump(RECORDS, open(options.store, "wb"))
Example #17
0
    def update(self, from_date=None):
        """Harvest the remote OAI server and yield ids of records added."""
        self._log.info('Harvesting oai server: %s' % self._url)
        registry = MetadataRegistry()
        # Register a pass-through reader for the configured prefix.
        registry.registerReader(self._prefix, lambda el: el)

        client = Client(self._url, registry)
        try:
            records = client.listRecords(metadataPrefix=self._prefix,
                                         from_=from_date)
            for header, element, about in records:
                if self._process_record(header, element):
                    yield self._get_id(header)
        except NoRecordsMatchError:
            # No matching records is normal for an up-to-date harvest.
            pass

        super(OAIBasedContentProvider, self).update()
Example #18
0
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     """Generator over all records offered by *baseUrl*.

     The endpoint is sanity-checked with an Identify request before any
     records are requested; a failure raises NotOAIPMHBaseURLException.
     """
     kwargs['metadataPrefix'] = metadataPrefix
     client = Client(baseUrl, self._mdRegistry)
     try:
         client.identify()
     except IndexError:
         # pyoai raises IndexError while parsing a non-OAI-PMH response.
         msg = ("{0} does not appear to be an OAI-PMH compatible base URL"
                "".format(baseUrl))
         raise NotOAIPMHBaseURLException(msg)
     # Check server timestamp granularity support.
     client.updateGranularity()
     for triple in client.listRecords(**kwargs):
         yield triple
Example #19
0
def processItems():
    """Harvest every oai_oi record from the Open Beelden feed and hand
    each one to processItem()."""
    # All plain fields share the same xpath shape; note that 'extent'
    # reads the feed's (misspelled) oi:extend element.
    plain_fields = [
        ('title', 'title'), ('alternative', 'alternative'),
        ('creator', 'creator'), ('subject', 'subject'),
        ('description', 'description'), ('abstract', 'abstract'),
        ('publisher', 'publisher'), ('contributor', 'contributor'),
        ('date', 'date'), ('type', 'type'), ('extent', 'extend'),
        ('medium', 'medium'), ('identifier', 'identifier'),
        ('source', 'source'), ('language', 'language'),
        ('references', 'references'), ('spatial', 'spatial'),
        ('attributionName', 'attributionName'),
        ('attributionURL', 'attributionURL'), ('license', 'license'),
        # Not present in the feed: rights, relation, coverage, format.
    ]
    oai_oi_reader = MetadataReader(
        fields={name: ('textList', 'oai_oi:oi/oi:' + tag + '/text()')
                for name, tag in plain_fields},
        namespaces={
            'oi': 'http://www.openbeelden.nl/oai/',
            'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms',
        })
    url = u'http://www.openbeelden.nl/feeds/oai/'

    registry = MetadataRegistry()
    registry.registerReader('oai_oi', oai_oi_reader)
    client = Client(url, registry)

    for record in client.listRecords(metadataPrefix='oai_oi'):
        processItem(record)
def processItems():
    """Walk the Open Beelden OAI feed and process each oai_oi record."""
    reader = MetadataReader(
        fields={
            'title': ('textList', 'oai_oi:oi/oi:title/text()'),
            'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
            'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
            'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
            'description': ('textList', 'oai_oi:oi/oi:description/text()'),
            'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
            'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
            'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
            'date': ('textList', 'oai_oi:oi/oi:date/text()'),
            'type': ('textList', 'oai_oi:oi/oi:type/text()'),
            # The feed really spells this element 'extend'.
            'extent': ('textList', 'oai_oi:oi/oi:extend/text()'),
            'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
            'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
            'source': ('textList', 'oai_oi:oi/oi:source/text()'),
            'language': ('textList', 'oai_oi:oi/oi:language/text()'),
            'references': ('textList', 'oai_oi:oi/oi:references/text()'),
            'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
            'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'),
            'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
            'license': ('textList', 'oai_oi:oi/oi:license/text()'),
            # Absent from the feed: rights, relation, coverage, format.
        },
        namespaces={
            'oi': 'http://www.openbeelden.nl/oai/',
            'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms',
        },
    )
    feed_url = u'http://www.openbeelden.nl/feeds/oai/'

    registry = MetadataRegistry()
    registry.registerReader('oai_oi', reader)
    client = Client(feed_url, registry)

    for record in client.listRecords(metadataPrefix='oai_oi'):
        processItem(record)
Example #21
0
    def iter_items(self, partition):
        """Yield crawlable items from *partition*, an OAI-PMH endpoint.

        Each live record becomes (url, {}, "html", 2, html_page) where the
        page wraps the record's title and description.
        """
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(partition, registry)

        for hdr, md, _ in client.listRecords(metadataPrefix='oai_dc'):
            if hdr.isDeleted():
                continue

            fields = md.getMap()

            # TODO: there are much validation and heuristics to be done here!
            # The first identifier is taken as the record's URL.
            url0 = (fields.get("identifier") or [None])[0]
            if not url0:
                continue

            title0 = (fields.get("title") or [""])[0].encode("utf-8")
            desc0 = (fields.get("description") or [""])[0].encode("utf-8")

            # TODO: validate that the url0 is not on another domain?!
            yield url0, {}, "html", 2, """
                <html><head><title>%s</title></head><body>%s</body></html>
            """ % (title0, desc0)
Example #22
0
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     """Yield (header, metadata, about) triples harvested from *baseUrl*.

     Validates the endpoint with Identify first, then works around pyoai
     handing back a "b'...'" repr string for the metadata under Python 3.
     """
     kwargs['metadataPrefix'] = metadataPrefix
     client = Client(baseUrl, self._mdRegistry)
     try:
         client.identify()
     except IndexError:
         # pyoai fails with IndexError when the response is not OAI-PMH.
         raise NotOAIPMHBaseURLException(
             "{0} does not appear to be an OAI-PMH compatible base URL"
             "".format(baseUrl))
     # Learn the server's timestamp granularity before harvesting.
     client.updateGranularity()
     for header, metadata, about in client.listRecords(**kwargs):
         payload = metadata
         # Unit test hotfix: undo the "b'...'" stringified-bytes payload.
         if isinstance(payload, str) and payload.startswith("b'"):
             payload = ast.literal_eval(payload).decode("utf-8")
         yield (header, payload, about)
Example #23
0
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     """Generator over all records at *baseUrl* for *metadataPrefix*.

     The endpoint is checked with Identify; records whose metadata comes
     back as a "b'...'" repr string (a py3k pyoai quirk) are repaired
     before being yielded.
     """
     kwargs['metadataPrefix'] = metadataPrefix
     client = Client(baseUrl, self._mdRegistry)
     try:
         client.identify()
     except IndexError:
         # Identify parsing blows up with IndexError on non-OAI-PMH URLs.
         msg = ("{0} does not appear to be an OAI-PMH compatible base URL"
                "".format(baseUrl))
         raise NotOAIPMHBaseURLException(msg)
     # Check server timestamp granularity support.
     client.updateGranularity()
     for record in client.listRecords(**kwargs):
         header, metadata, about = record
         if isinstance(metadata, str) and metadata.startswith("b'"):
             # Unit test hotfix for pyoai's stringified-bytes payload.
             metadata = ast.literal_eval(metadata).decode("utf-8")
         yield (header, metadata, about)
def acquire_and_publish_documents(oai_url, publish_url, reader, prefix, pwd):
    """Harvest records from *oai_url* and publish them to *publish_url*.

    For each record the raw GetRecord XML is fetched, dc.subject terms are
    extracted as keys, and the result is wrapped with convert_to_envelope.
    Documents are published in batches.

    Fixes: the original mixed tabs and spaces inconsistently (a syntax
    error); indentation is now uniform.  '!= None' became 'is not None'.
    """
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(oai_url, registry)
    documents = []
    count = 0
    for record in client.listRecords(metadataPrefix=prefix):
        header = record[0]
        metadata = record[1]
        rawMetadata = urllib2.urlopen(
            "{0}?verb=GetRecord&metadataPrefix={1}&identifier={2}".format(
                oai_url, prefix, header.identifier())).read()

        # re-format Jorum id
        identifier = header.identifier()
        identifier = identifier.replace("oai:dspace.jorum.ac.uk:", "")
        uri = "http://dspace.jorum.ac.uk/xmlui/handle/" + identifier
        print(uri)

        # create keys from dc.subject terms
        fo = StringIO.StringIO(rawMetadata)
        tree = parse(fo)  # can only parse files or file objects
        keys = []
        for elem in tree.getiterator():
            if elem.tag == "{http://purl.org/dc/elements/1.1/}subject":
                keys.append(elem.text)
        fo.close()
        print(keys)
        print("\n")

        value = convert_to_envelope(metadata, rawMetadata, uri, keys)
        if value is not None:
            documents.append(value)
            count += 1
            # Publish every 10th document batch (and once early at 3).
            if (count % 10 == 0) or (count == 3):
                publish_documents(publish_url, documents, pwd)
                documents = []
    # Publish any remaining partial batch.
    publish_documents(publish_url, documents, pwd)
Example #25
0
#try this and see if it works; if it does resumption tokens right, this should work fine.


chunk = timedelta(days=1)
oneday = timedelta(days=1)

#TODO: clearly they don't do this whole "ordered" thing. Grab records by month or year or something instead of all at once.
#TODO: luckily, once we've done a full slurp, we only need to remember when the last full slurp was and start since then. But if interrupted, we need to start back from where the last *full* slurp was, due to the ordering problem.

#TODO: structure this better, with the try effectively moved much further above. Really, move a lot more into functions
# Walk the repository one day-sized chunk at a time, persisting progress
# to the store after every chunk so an interrupted slurp can resume.
# (Python 2 print-chevron syntax throughout.)
try:
    current = start #TODO: make a nice little generator so I can use a for loop
    while current <= datetime.now():
        print >>sys.stderr, "fetching records @", now(), "starting with", current.strftime('%Y-%m-%d')
        try:
            records = client.listRecords(metadataPrefix='oai_dc', from_=current, until=(current + chunk))
        except NoRecordsMatchError:
            # An empty day is normal: advance the window and persist progress.
            print >>sys.stderr, "no records for this chunk, continuing to next"
            current += chunk
            store.write_last(current)
            continue
        print >>sys.stderr, "record fetch finished @", now()
        for index, (header, metadata, _) in enumerate(records, start=1):
            store.write_record(header, metadata)
            # Log the first record and then every 1000th for progress.
            if index == 1 or index % 1000 == 0:
                print >>sys.stderr, "  wrote record", index, "of", header.datestamp().strftime('%Y-%m-%d'), "with id", header.identifier()
        current += chunk
        store.write_last(current)
finally:
    # Always close the store, even on interrupt or error.
    print >>sys.stderr, "closing store"
    store.close()
class NSDLDCImport(object):
    '''
    Class exports the required fields from the UCAR OAI-PMH data repository
    using NSDL_DC.

    Harvests records through a pyoai Client and maps each record's metadata
    into a flat dict according to a configurable field map.
    '''


    def __init__(self, url, prefix=nsdl.LR_NSDL_PREFIX, reader=None, fields=None, namespaces=None, fieldMap=None):
        '''
        Constructor.

        url        -- base URL of the OAI-PMH endpoint.
        prefix     -- metadata prefix to harvest.
        reader     -- optional MetadataReader; built from fields/namespaces
                      when omitted.
        fields     -- field extraction spec (default nsdl.LR_NSDL_DC_FIELDS).
        namespaces -- XML namespace map (default nsdl.LR_NSDL_DC_NAMESPACES).
        fieldMap   -- output field mapping (default nsdl.NSDL_TO_LR_MAP).
        '''

        # `is None` rather than `== None`: identity is the correct idiom and
        # avoids invoking custom __eq__ implementations.
        if fields is None:
            self._fields = nsdl.LR_NSDL_DC_FIELDS
        else:
            self._fields = fields

        if fieldMap is None:
            self._fieldMap = nsdl.NSDL_TO_LR_MAP
        else:
            self._fieldMap = fieldMap

        if namespaces is None:
            self._namespaces = nsdl.LR_NSDL_DC_NAMESPACES
        else:
            self._namespaces = namespaces

        if reader is None:
            reader = MetadataReader(fields = self._fields, namespaces = self._namespaces)

        self._url = url
        self._registry = MetadataRegistry()
        self._prefix = prefix
        self._registry.registerReader(prefix, reader)
        self._client = Client(url, self._registry)

    def _format(self, doc):
        '''Map one metadata document to an output dict per self._fieldMap.'''
        value = {}
        # merge all the fields
        for (fieldname, fieldconfig) in self._fieldMap.items():
            if fieldconfig["type"] == "const" and "const" in fieldconfig:
                value[fieldname] = fieldconfig["const"]
            elif fieldconfig["type"] == "[string]" and len(fieldconfig["fields"]) > 0:
                value[fieldname] = []
                for field in fieldconfig["fields"]:
                    value[fieldname].extend(doc.getField(field))
            elif fieldconfig["type"] == "string" and len(fieldconfig["fields"]) > 0:
                value[fieldname] = ""
                for field in fieldconfig["fields"]:
                    value[fieldname] += ", ".join(doc.getField(field))
            elif fieldconfig["type"] == "boolean" and len(fieldconfig["fields"]) > 0:
                value[fieldname] = True
                for field in fieldconfig["fields"]:
                    # NOTE(review): `&=` assumes getField returns a boolean-
                    # compatible value here, while other branches treat it as a
                    # list — confirm the intended semantics of boolean fields.
                    value[fieldname] &= doc.getField(field)
        return value

    def fetch_documents(self, range=10000):
        '''
        Generator yielding batches (lists) of formatted documents.

        range -- batch size; parameter name kept for backward compatibility
                 even though it shadows the builtin.
        '''
        return_stuff = []
        for record in self._client.listRecords(metadataPrefix=self._prefix):
            r = record[1]
            value = self._format(r)
            if value is not None:
                return_stuff.append(value)
            if len(return_stuff) >= range:
                yield return_stuff
                return_stuff = []
        # Bug fix: the final partial batch used to be dropped silently when the
        # record stream ended between batch boundaries; yield it as well.
        if return_stuff:
            yield return_stuff
Example #27
0
if os.path.exists(LANG_CACHE_FILE):
  lcf = codecs.open(LANG_CACHE_FILE, 'r', 'utf-8')
  for line in lcf:
    lang, text = line.rstrip("\r\n").split("\t")
    if lang == '': lang = None
    lang_cache[text] = lang
  lcf.close()  

label_to_uri = {}

# pass 1: convert MARC data to basic RDF

oai = Client('https://fennica.linneanet.fi/cgi-bin/oai-pmh-fennica-asteri-aut.cgi', registry)

#recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames', from_=datetime(2019,05,15))
recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames')
for oaipmhrec in recs:
  convert_record(oaipmhrec)

recs = oai.listRecords(metadataPrefix='marc21', set='meetingNames')
for oaipmhrec in recs:
  convert_record(oaipmhrec)

# pass 2: convert literal values to resources

for prop in (relatedCorporateBody, predecessor, successor, hierarchicalSuperior):
  for s,o in g.subject_objects(prop):
    if isinstance(o, Literal):
      g.remove((s,prop,o)) # remove original
      res = label_to_uri.get(u"%s" % o, None)
      if res is None:
Example #28
0
    SETSPEC = sys.argv[3]
else:
    SETSPEC = None

registry = MetadataRegistry()
registry.registerReader('mods', mods_reader)
#registry.registerReader('didl', didl_reader)
#registry.registerReader('oac_dc', oai_dc_reader)

client = Client(URL, registry)

record_count = 0
deleted_count = 0

if SETSPEC:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX, set=SETSPEC)
else:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX)

for num, record in enumerate(records):
    record_count += 1
    delinfo = ''
    if record[0].isDeleted():
        deleted_count += 1
        delinfo = '(deleted)'
    print '%0.6d %s %s' % (num, record[0].identifier(), delinfo)
    if record[1] is not None:
        # metadata = client.getMetadata(metadataPrefix='mods', identifier=record[0].identifier())
        # print type(metadata), metadata.tag
        print "MAP: ", record[1].getMap()
    # print '       %s' % ';'.join(record[0].setSpec())
Example #29
0
class Repository(object):
    """ Repository handles interaction with the various interfaces provided by 
    the dspace repository. """

    def __init__(self, url=None, **kwargs):
        """
        :param url: deprecated full OAI URL; use base_url + oai_path instead.
        :param kwargs: base_url, oai_path, oai_enabled, sword_enabled,
            metadata_registry, plus options forwarded to Client; keys prefixed
            with ``sword_`` are stripped and forwarded to SwordService.
        :raises ValueError: when no base_url can be determined.
        """
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            # Typo fix in the warning text ("paramater" -> "parameter").
            warn(
                'The url parameter will not be supported in version 3, '
                'use base_url and oai_path instead', DeprecationWarning)

            # Derive whichever of base_url / oai_path is missing from the
            # legacy url argument.
            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if 'metadata_registry' not in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets',
                                                       dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}

            # Materialize the key list first: we pop from kwargs while
            # iterating, which raises RuntimeError on a live view in
            # Python 3; list() is correct under both Python 2 and 3.
            for key in list(kwargs.keys()):
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)

            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client('/'.join((
                self.base_url,
                self.oai_path,
            )), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """ From a given URL, extract the OAI identifier base (hostname) """
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """ Determine the OAI set from a collection handle """
        # basestring: this module targets Python 2.
        if not isinstance(handle, basestring):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """ Get the configured name of the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """ Get a list of the collections in the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        # List comprehension instead of map(lambda ...): same list result on
        # Python 2, and still a list (not a lazy map object) on Python 3.
        return [c[0:2] for c in self.oai.listSets()]

    def getItemHandles(self, collection=None, **kw):
        """ Get item handles from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        for item in self.getItemIdentifiers(collection=collection, **kw):
            # Identifier looks like oai:<host>:<handle>; keep the handle part.
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """ Get item identifiers from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """ Get full items from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """ Get a single item from the OAI-PMH interface either by handle or 
        identifier """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')

        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')

        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')

            identifier = 'oai:%s:%s' % (
                self.identifier_base,
                handle,
            )

        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        """ Build the full OAI identifier for a handle. """
        return 'oai:%s:%s' % (self._extractIdentifierBase(
            self.base_url), handle)

    def getSwordCollections(self):
        # Not implemented yet.
        pass

    def getSwordCollection(self, args):
        # Not implemented yet.
        pass
Example #30
0
#!/usr/bin/env python

# Harvest the Waikato Research Commons DSpace repository over OAI-PMH
# (pyoai: http://www.infrae.com/download/OAI/pyoai) and print the OAI
# identifier of every record.
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

URL = 'http://researchcommons.waikato.ac.nz/dspace-oai/request'

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)

# One identifier per line; listRecords transparently follows resumption tokens.
records = client.listRecords(metadataPrefix='oai_dc')
for record in records:
    print(record[0].identifier())
Example #31
0
class ZoraAPI:
    """Thin wrapper around the ZORA OAI-PMH endpoint.

    Harvests records with pyoai and parses them into plain dictionaries with
    a structure similar to the Paper database object.
    """

    METADATA_PREFIX = 'oai_dc'

    # In the constructor, we register to the ZORA API and initialize the necessary class variables
    def __init__(self, url):
        registry = MetadataRegistry()
        registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
        self.client = Client(url, registry)
        self.institutes = {}
        self.resource_types = []
        self.load_institutes_and_types()

    # Returns the hierarchical dictionary of institutes
    def get_institutes(self):
        return self.institutes

    # Returns the list of resource types
    def get_resource_types(self):
        return self.resource_types

    # Loads all institutes and resource types. The institutes also get parsed into a hierarchical dictionary.
    def load_institutes_and_types(self):
        institutes_list = []
        resource_type_list = []
        for item in self.client.listSets():
            # Each set description looks like '<kind> = <value>'.
            split = item[1].split(' = ')
            if len(split) != 2:
                continue
            set_type, set_value = split
            if set_type == 'Subjects':
                institutes_list.append(set_value)
            elif set_type == 'Type':
                resource_type_list.append(set_value)
        institutes_dict = self.parse_institutes(institutes_list)
        self.institutes = institutes_dict
        self.resource_types = resource_type_list

    # Parses a list of institutes (strings like 'Faculty: Dept: Group') into a
    # hierarchical dictionary keyed by each level.
    @staticmethod
    def parse_institutes(institute_list_raw):
        institutes_dict = {}
        for institute_raw in institute_list_raw:
            institutes = institute_raw.split(': ')
            parent = institutes_dict
            for institute in institutes:
                if parent.get(institute) is None:
                    parent[institute] = {}
                parent = parent[institute]
        return institutes_dict

    # Get all metadata dictionaries from ZORA
    def get_metadata_dicts(self, from_):
        record_list = self.get_records(from_)
        metadata_dict_list = self.parse_records(record_list)
        return metadata_dict_list

    # Gets one specific paper from the ZORA repository and returns the record of it
    def get_record(self, uid):
        record = self.client.getRecord(identifier=uid,
                                       metadataPrefix=ZoraAPI.METADATA_PREFIX)
        return record

    # Gets the papers from the ZORA repository and returns their records in form of a list
    def get_records(self, from_):
        args = {'metadataPrefix': ZoraAPI.METADATA_PREFIX}

        # Add the from_ argument if it is defined (this is used to get only the most recent papers/changes)
        if from_:
            args['from_'] = from_

        # Get the relevant papers from ZORA and parse them
        record_list = []
        try:
            print('Loading records from ZORA API...')
            record_iterator = self.client.listRecords(**args)
            count = 0
            for record in record_iterator:
                record_list.append(record)
                count += 1
                if is_debug() and count % 1000 == 0:
                    print(str(count))
            print(count)
            print('Done')
        except NoRecordsMatchError:
            print('No records were found')
        except RemoteDisconnected as error:
            print(error)
        except Exception as error:
            # Best-effort harvest: log and fall through to return what we have.
            print(error)
        # Bug fix: `return` used to live in a `finally` block, which also
        # swallowed BaseExceptions such as KeyboardInterrupt. All expected
        # errors are handled above, so a plain return preserves behavior.
        return record_list

    # This method parses a list of records from ZORA in a easier to use metadata dictionary.
    def parse_records(self, record_list):
        metadata_dict_list = []
        print('Parsing records...')
        for record in record_list:
            metadata_dict = self.parse_record(record)
            if metadata_dict:
                metadata_dict_list.append(metadata_dict)
        print('Done')
        return metadata_dict_list

    # Returns the first element of the list stored under `key`, or None when
    # the key is missing or the list is empty.
    @staticmethod
    def _first_or_none(metadata_dict, key):
        values = metadata_dict.get(key)
        return values[0] if values else None

    # This function parses a record into a dictionary with a similar structure of the Paper database object.
    # To do so, it turns some unnecessary lists into single values and parses the 'subject' field into 'ddcs' (dewey
    # decimal classifications), 'keywords' and 'institutes'.
    #
    # NOTE: It is not possible to parse the 'subject' field properly since we lack the ability to distinguish between
    # keywords and institutes (some institutes contain commas --> they will get recognized as lists of keywords).
    @staticmethod
    def parse_record(record):
        metadata_dict = {}
        metadata_dict['uid'] = record[0].identifier()

        # If there is no metadata, we assume that the paper has been deleted and store that information in the dict
        if not record[1]:
            metadata_dict['deleted'] = True
            return metadata_dict

        # If there is metadata available, we parse it into a convenient form
        metadata_dict = {**metadata_dict, **dict(record[1].getMap())}

        metadata_dict['title'] = ZoraAPI._first_or_none(metadata_dict, 'title')
        metadata_dict['creators'] = metadata_dict.pop(
            'creator') if 'creator' in metadata_dict else []

        # If the field 'subject' starts with three digits, it is a ddc (dewey decimal classification). If it contains a
        # comma-separated list, it is a list of keywords. Otherwise it is an institute.
        #
        # NOTE: There are some dewey decimal classifications that contain commas, therefore we check for the three
        # digits before we look for comma separated lists. Some institutes contain commas as well. This
        # leads to some institutes getting recognized as a list of keywords. With the information available this problem
        # unfortunately cannot be solved properly.
        institute_list = []
        ddc_list = []
        keyword_list = []
        if 'subject' in metadata_dict:
            # Compiled once outside the loop (was recompiled per item); raw
            # string avoids the invalid-escape-sequence deprecation.
            ddc_regex = re.compile(r'^\d\d\d\s+\w')
            for item in metadata_dict['subject']:

                # If subject starts with three digits and a space, we assume its a dewey decimal classification
                if ddc_regex.match(item):
                    ddc_list.append(item)

                # If the subject has the same name as an institute, we assume it is an institute
                elif db.session.query(Institute).filter(
                        Institute.name == item).first():
                    institute_list.append(item)

                # If it is none of the above, we assume that it is a comma-separated list of keywords
                else:
                    for keyword in item.split(','):
                        keyword_list.append(keyword)

        metadata_dict['institutes'] = institute_list
        metadata_dict['ddcs'] = ddc_list
        metadata_dict['keywords'] = keyword_list
        metadata_dict['description'] = ZoraAPI._first_or_none(metadata_dict, 'description')
        metadata_dict['publisher'] = ZoraAPI._first_or_none(metadata_dict, 'publisher')
        metadata_dict['date'] = ZoraAPI._first_or_none(metadata_dict, 'date')

        # We filter the 'type' field and only store the paper type
        type_list = metadata_dict.pop(
            'type') if 'type' in metadata_dict else []
        resource_type_list = []
        for resource_type in type_list:
            if db.session.query(ResourceType).filter(
                    ResourceType.name == resource_type).first():
                resource_type_list.append(resource_type)
        metadata_dict['resource_types'] = resource_type_list
        metadata_dict['language'] = ZoraAPI._first_or_none(metadata_dict, 'language')
        metadata_dict['relation'] = ZoraAPI._first_or_none(metadata_dict, 'relation')

        return metadata_dict
Example #32
0
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from urllib2 import Request, urlopen
from StringIO import StringIO

# Earlier experiments, kept for reference:
# url = 'http://export.arxiv.org/api/query?search_query=all:cs&start=0&max_results=10'
# data = urllib.urlopen(url).read()
# urlpdf = 'http://arxiv.org/pdf/1510.02262v1.pdf'
# remoteFile = urlopen(Request(urlpdf)).read()
# memoryFile = StringIO(remoteFile)
# pdfFile = pyPdf.PdfFileReader(memoryFile)
# data = pyPdf.PdfFileReader(file('http://arxiv.org/pdf/1510.02262v1.pdf','r'))

# singles = [stemmer.stem(plural) for plural in plurals]

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

# Harvest arXiv over OAI-PMH and print the title of the first few records.
URL = "http://export.arxiv.org/oai2"
registry = MetadataRegistry()
registry.registerReader("oai_dc", oai_dc_reader)

clt = Client(URL, registry)
ic = 0
for record in clt.listRecords(metadataPrefix="oai_dc"):
    if ic > 10:
        # Stop after the first eleven records have been printed.
        break
    print(record[1]["title"][0])
    record[1]["identifier"][0]  # arxiv_id link
    ic += 1
Example #33
0
class OaiPaperSource(PaperSource):  # TODO: this should not inherit from PaperSource
    """
    A paper source that fetches records from the OAI-PMH proxy
    (typically: proaixy).

    It uses the ListRecord verb to fetch records from the OAI-PMH
    source. Each record is then converted to a :class:`BarePaper`
    by an :class:`OaiTranslator` that handles the format
    the metadata is served in.
    """

    def __init__(self, endpoint, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param endpoint: the address of the OAI-PMH endpoint
            to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.registry.registerReader('citeproc', citeproc_reader)
        self.client = Client(endpoint, self.registry)
        self.client._day_granularity = day_granularity
        if settings.PROAIXY_API_KEY:
            self.client.extra_parameters = {
                'key': settings.PROAIXY_API_KEY}
        # Maps OAI metadata format name -> OaiTranslator instance.
        self.translators = {}

    # Translator management

    def add_translator(self, translator):
        """
        Adds the given translator to the paper source,
        so that we know how to translate papers in the given format.

        The paper source cannot hold more than one translator
        per OAI format (it decides what translator to use
        solely based on the format) so if there is already a translator
        for that format, it will be overriden.
        """
        self.translators[translator.format()] = translator

    # Record ingestion

    def ingest(self, from_date=None, metadataPrefix='any',
               resumptionToken=None):
        """
        Main method to fill Dissemin with papers!

        :param from_date: only fetch papers modified after that date in
                          the proxy (useful for incremental fetching)
        :param metadataPrefix: restrict the ingest for this metadata
                          format
        :param resumptionToken: resume a previous harvest from this token
        """
        # Bug fix: this body previously mixed tab and space indentation,
        # which is a syntax error under Python 3 (and `python -tt`).
        args = {'metadataPrefix': metadataPrefix}
        if from_date:
            args['from_'] = from_date
        if resumptionToken:
            args['resumptionToken'] = resumptionToken
        records = self.client.listRecords(**args)
        self.process_records(records)

    def create_paper_by_identifier(self, identifier, metadataPrefix):
        """
        Queries the OAI-PMH proxy for a single paper.

        :param identifier: the OAI identifier to fetch
        :param metadataPrefix: the format to use (a translator
                    has to be registered for that format, otherwise
                    we return None with a warning message)
        :returns: a Paper or None
        """
        record = self.client.getRecord(
                    metadataPrefix=metadataPrefix,
                    identifier=identifier)
        return self.process_record(record[0], record[1]._map)

    # Record search utilities

    def listRecords_or_empty(self, source, *args, **kwargs):
        """
        pyoai raises :class:`NoRecordsMatchError` when no records match,
        we would rather like to get an empty list in that case.
        """
        try:
            return source.listRecords(*args, **kwargs)
        except NoRecordsMatchError:
            return []

    def process_record(self, header, metadata):
        """
        Saves the record given by the header and metadata (as returned by
        pyoai) into a Paper, or None if anything failed.
        """
        translator = self.translators.get(header.format())
        if translator is None:
            print("Warning: unknown metadata format %s, skipping" %
                  header.format())
            return

        paper = translator.translate(header, metadata)
        if paper is not None:
            try:
                with transaction.atomic():
                    saved = Paper.from_bare(paper)
                return saved
            except ValueError as e:
                # Invalid papers are reported and skipped, not fatal.
                # (prints normalized to call form, valid on Python 2 and 3)
                print("Ignoring invalid paper:")
                print(header.identifier())
                print(e)

    def process_records(self, listRecords):
        """
        Save as :class:`Paper` all the records contained in this list
        """
        # check that we have at least one translator, otherwise
        # it's not really worth trying…
        if not self.translators:
            raise ValueError("No OAI translators have been set up: " +
                             "We cannot save any record.")

        last_report = datetime.now()
        processed_since_report = 0

        for record in listRecords:
            header = record[0]
            metadata = record[1]._map

            self.process_record(header, metadata)

            # rate reporting
            processed_since_report += 1
            if processed_since_report >= 1000:
                td = datetime.now() - last_report
                rate = 'infty'
                if td.seconds:
                    rate = unicode(processed_since_report / td.seconds)
                print("current rate: %s records/s" % rate)
                processed_since_report = 0
                last_report = datetime.now()
Example #34
0
#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:
#
# Author:      dd
#
# Created:     06/05/2014
# Copyright:   (c) dd 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import print_function
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

# NOTE(review): the URL string starts with a stray leading space — confirm the
# endpoint resolves as-is before "fixing" it.
URL = ' http://www.pubmedcentral.nih.gov/oai/oai.cgi'

bla = "set=pmc-open"

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)

# Dump every record of the open-access set to stdout.
for record in client.listRecords(metadataPrefix='oai_dc', set='pmc-open'):
    print(record)

# Second harvest: MARC21 records from the SKC set, fetched one day at a time
# between start_date and stop_date (valid_date/saverecords/MarcXML/mrc_to_mrk/
# get_bool are defined elsewhere in the original module).
registry = MetadataRegistry()
registry.registerReader('marc21', MarcXML)
client = Client(URL, registry)

start = valid_date(start_date)
stop = valid_date(stop_date)

# main

while start < stop:
    from_date = start
    start = start + timedelta(days=1)  # increase days one by one
    until_date = start
    try:
        records = client.listRecords(metadataPrefix='marc21',
                                     set='SKC',
                                     from_=from_date,
                                     until=until_date)
        saverecords(records)
    # Bug fix: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; `Exception` keeps the best-effort intent.
    except Exception:
        pass  # skipping deleted entries

print('Done.')

#%% processing mrc to df

mrc_to_mrk('C:/Users/User/Desktop/nkp_nkc_2021-04-07.marc',
           'C:/Users/User/Desktop/nkp_nkc_2021-04-07.mrk')

fiction_types = ['1', 'd', 'f', 'h', 'j', 'p', 'u', '|', '\\']

filter_fiction_type = get_bool('Filter with a fiction type? ')
Example #36
0
else:
    SETSPEC = None
                       


registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
registry.registerReader(METADATA_PREFIX, oai_dc_reader)

client = Client(URL, registry)

record_count = 0
deleted_count = 0

if SETSPEC:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX, set=SETSPEC)
else:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX)

for num, record in enumerate(records):
    record_count += 1
    delinfo = ''
    if record[0].isDeleted():
        deleted_count += 1
        delinfo = '(deleted)'
    print '%0.6d %s %s' % (num, record[0].identifier(), delinfo)
    print '       %s' % ';'.join(record[0].setSpec())

print 'Harvested %s records, of which %s were deleted' % (record_count,
                                                          deleted_count)
    
Example #37
0
    for line in lcf:
        lang, text = line.rstrip("\r\n").split("\t")
        if lang == '': lang = None
        lang_cache[text] = lang
    lcf.close()

label_to_uri = {}

# pass 1: convert MARC data to basic RDF

oai = Client(
    'https://fennica.linneanet.fi/cgi-bin/oai-pmh-fennica-asteri-aut.cgi',
    registry)

#recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames', from_=datetime(2019,05,15))
recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames')
for oaipmhrec in recs:
    convert_record(oaipmhrec)

recs = oai.listRecords(metadataPrefix='marc21', set='meetingNames')
for oaipmhrec in recs:
    convert_record(oaipmhrec)

# pass 2: convert literal values to resources

for prop in (relatedCorporateBody, predecessor, successor,
             hierarchicalSuperior):
    for s, o in g.subject_objects(prop):
        if isinstance(o, Literal):
            g.remove((s, prop, o))  # remove original
            res = label_to_uri.get(u"%s" % o, None)
Example #38
0
#aka oaijson
import sys

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

import simplejson as json

import couchdb

server = couchdb.Server()
db = server['dcat']

URL = 'http://cardinalscholar.bsu.edu/cgi/oai2'

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)

records = client.listRecords(metadataPrefix='oai_dc')
i = 0
for hdr, metadata, _ in records:
    i = i + 1
    print hdr.identifier()
    print hdr.datestamp()
    map = metadata.getMap()
    map.update({'cdmcollection': 'cardinalscholar'})
    db.save(map)
    print 'saved ' + str(i)
def get_names (dataname):
    """Harvest EDM records for an OAI set and group object ids by place name.

    :param dataname: OAI-PMH setSpec to harvest from https://data.jhn.ngo/oai
    :returns: dict mapping each dcterms:spatial value (longer than 3 chars)
              to the list of rdf:about object ids that carry it.
    """

    record_prefix = "rdf:RDF/edm:ProvidedCHO"

    # Reader that extracts just the object id and its spatial labels from
    # each EDM record.
    edm_reader = MetadataReader(
        fields={
            'objectId': ('textList', record_prefix + '/@rdf:about'),
            'spatial': ('textList', record_prefix + '/dcterms:spatial/text()'),
        },
        namespaces={
            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms/',
            'dct': 'http://purl.org/dc/terms/',
            'edm': 'http://www.europeana.eu/schemas/edm/',
            'foaf': 'http://xmlns.com/foaf/0.1/',
            'owl': 'http://www.w3.org/2002/07/owl#',
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
            'skos': 'http://www.w3.org/2004/02/skos/core#',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'ore': 'http://www.openarchives.org/ore/terms/'
        }
    )

    dictnames = {}
    # [place, objectId] pairs already seen; kept for parity with the original
    # even though it is not returned.
    identifier = []

    URL = 'https://data.jhn.ngo/oai'

    registry = MetadataRegistry()
    registry.registerReader('edm', edm_reader)
    client = Client(URL, registry)

    k = 0

    # Bug fix: the harvest loop used to sit under `if __name__ == "__main__":`
    # INSIDE this function, so calling get_names() from an importing module
    # silently returned an empty dict; the guard has been removed.
    for record in client.listRecords(metadataPrefix='edm', set=dataname):

        output = record[1].getMap()

        k = k + 1
        print(k)  # progress counter

        # The original had duplicated single-element / multi-element branches
        # that performed the same per-item work; unified into one loop
        # (iterating an empty 'spatial' list is a no-op, like the old guard).
        for place in output['spatial']:

            # Skip very short labels (3 characters or fewer).
            if len(place) > 3:

                object_id = output['objectId'][0]

                if [place, object_id] not in identifier:
                    identifier.append([place, object_id])

                # Same effect as the original "if key missing: setdefault then
                # append, else append" dance.
                dictnames.setdefault(place, []).append(object_id)

    #print (identifier)

    return dictnames
Example #40
0
def main():
    """Harvest the Hedatuz OAI-PMH repository into an RDF dump file.

    Iterates every oai_dc record from HEDATUZ_URL, resolves each distinct
    creator name against the VIAF AutoSuggest web service, maps the
    record's dc:type onto bibo classes, and serializes the resulting
    graph to 'hedatuz.rdf'.

    Python 2 code (urllib/urllib2/simplejson, print statements).  Relies
    on module-level names: Graph, MetadataRegistry, oai_dc_reader, Client,
    Metadata, Creator, Literal, HEDATUZ_URL, RDF_DOMAIN, VIAF_URL.
    """
    #RDF graph initialization
    g = Graph()
    g.bind("dc", "http://purl.org/dc/elements/1.1/")
    g.bind("bibo", "http://purl.org/ontology/bibo/")
    g.bind("foaf", "http://xmlns.com/foaf/0.1/")
    g.bind("owl", "http://www.w3.org/2002/07/owl#")

    #OAI2 access initialization
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(HEDATUZ_URL, registry)

    # Cache of original creator name -> Creator instance, so each distinct
    # creator hits VIAF only once across all records.
    creator_dict = {}
    creator_id_count = 1

    #print dir(client.listRecords)

    #Iterate over each record in headatuz database
    for record in client.listRecords(metadataPrefix='oai_dc'):
        for item in record:
            if type(item) == Metadata:
                item_dict = dict(item.getMap())
                ##print item_dict
                record_creator_list = []
                creator_list = item_dict['creator']
                #Get record identifier
                record_id_url = urlparse(item_dict['identifier'][0])
                record_id = record_id_url.path.replace('/', '')
                #Iterate over each creator of the current record
                for creator in creator_list:
                    creator_orig = creator                    
                    if creator_orig not in creator_dict.keys():
                        # Ask the VIAF AutoSuggest service for an authority
                        # id matching this creator name.
                        creator = creator.replace(' ', '%20')
                        creator_params = urllib.urlencode({'query': creator.encode('utf-8')})
                        req = urllib2.Request('http://viaf.org/viaf/AutoSuggest?' + creator_params)
                        f = urllib2.urlopen(req)
                        try:
                            json_item = simplejson.load(f, strict=False)
                        except Exception as e:
                            # NOTE(review): 'break' abandons the remaining
                            # creators of this record on a parse error —
                            # confirm 'continue' was not intended.
                            print e
                            break
                        #Generate creator id
                        #id_len = len(str(creator_id_count))
                        #digits = CREATOR_ID_DIGITS - id_len
                        #id_formatter = '%0' + str(digits) + 'd'
                        creator_id = creator_id_count
                        creator_id_count = creator_id_count + 1

                        #Get results from VIAF (if any)
                        if json_item['result']:
                            viaf_id = json_item['result'][0]['viafid']

                            #Create new Creator instance
                            creator = Creator(creator_orig, creator_id, viaf_id)
                        else:
                            #Create new Creator instance
                            creator = Creator(creator_orig, creator_id)
                        creator_dict[creator_orig] = creator
                        record_creator_list.append(creator)
                    else:
                        record_creator_list.append(creator_dict[creator_orig])

                item_dict['creator'] = record_creator_list
                item_type_list = item_dict['type']
                # Map dc:type values onto bibo classes/statuses.
                # NOTE(review): triples are added as plain unicode strings,
                # not URIRef/Literal nodes — confirm the rdflib version in
                # use accepts this.
                if type(item_type_list) == list:
                    for item_type in item_type_list:
                        if item_type.encode('utf-8') == 'Artículo':
                            #print 'Articulo'
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Article'))
                        elif item_type.encode('utf-8') == 'Sección de Libro':
                            #print 'Seccion'
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/BookSection'))
                        elif item_type == u'Libro':
                            #print 'Libro'
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Book'))
                        elif item_type == u'PeerReviewed':
                            #print 'Peer'
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://purl.org/ontology/bibo/DocumentStatus', u'http://purl.org/ontology/bibo/status/peerReviewed'))
                        elif item_type.encode('utf-8') == 'Monografía':
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Document'))


                else:
                    # Single (non-list) dc:type value: same mapping as above.
                    item_type = item_dict['type']
                    if item_type.encode('utf-8') == 'Artículo':
                        #print 'Articulo'
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Article'))
                    elif item_type.encode('utf-8') == 'Sección de Libro':
                        #print 'Seccion'
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/BookSection'))
                    elif item_type == u'Libro':
                        #print 'Libro'
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Book'))
                    elif item_type == u'PeerReviewed':
                        #print 'Peer'
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://purl.org/ontology/bibo/DocumentStatus', u'http://purl.org/ontology/bibo/status/peerReviewed'))
                    elif item_type.encode('utf-8') == 'Monografía':
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Document'))

                # Emit one dc:<key> triple per metadata value; creators link
                # to the author resource minted above instead of a Literal.
                for key in item_dict:
                    obj = item_dict[key]
                    if type(obj) == list:
                        for creator_item in obj:
                            if key == 'creator':
                                g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://purl.org/dc/elements/1.1/creator', RDF_DOMAIN + u'resource/author/' + str(creator_item.id)))
                            else:
                                g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://purl.org/dc/elements/1.1/' + key, Literal(creator_item)))

    # Describe every distinct creator as a foaf:Person, with owl:sameAs to
    # VIAF when an authority id was found.
    for key in creator_dict.keys():
        creator = creator_dict[key]
        g.add((RDF_DOMAIN + u'resource/author/' + str(creator.id), u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://xmlns.com/foaf/0.1/Person'))
        g.add((RDF_DOMAIN + u'resource/author/' + str(creator.id), u'http://xmlns.com/foaf/0.1/name', Literal(creator.name)))
        if creator.viaf_id != None:
            g.add((RDF_DOMAIN + u'resource/author/' + str(creator.id), u'http://www.w3.org/2002/07/owl#sameAs', VIAF_URL + creator.viaf_id))

    print len(g)

    #for s, p, o in g:
        ##print s, p, o

    f = open('hedatuz.rdf', 'w')
    f.write(g.serialize(format='pretty-xml'))
    # NOTE(review): g.close() presumably closes the rdflib store backing
    # the graph — confirm it is required for the default in-memory store.
    g.close()
    f.close()
Example #41
0
    """Put record in ElasticSearch"""
    es.index(index="hhs",
             doc_type="oai",
             id=record['id'],
             body={
                 "title": record['title'],
                 "url": getUrl(record['url']),
                 "genre": record['genre'],
                 "name": _getNames(record['name']),
                 "language": record['language'],
                 "topics": record['topics'],
                 "abstract": record['abstract'],
                 "date": datestamp,
             })


# Top-level harvesting loop (Python 2): walk every MODS record from the
# module-level OAI client and fetch a thumbnail for each record that still
# carries metadata.  Each 'record' is a (header, metadata, about) tuple.
for record in client.listRecords(metadataPrefix='mods'):
    #print record
    if record[1] is not None:
        # record[0] is the OAI header, record[1] the metadata payload;
        # 'record' is rebound to the plain field map below.
        datestamp = record[0].datestamp()
        record = record[1].getMap()

        print datestamp, record
        #print {record['title']}, {record['url'][1]}, record['genre'], ', '.join(record['name']), record['language'], ', '.join(record['topics']), record['abstract']

        doc_url = getUrl(record['url'])
        if doc_url is not None:
            getThumb(doc_url)
        #esIndex(record, datestamp)
        #raw_input("Press Enter to continue...")
Example #42
0
            #Conecta com o provedor OAI-PMH
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(url_provider, registry)

            print("Conexão estabelecida")
            sets = client.listSets()  #lista os conjuntos
            print("Conjuntos encontrados")

            for setSpec, setName, setDescription in sets:  #percorre cada conjunto do provedor

                try:

                    records = client.listRecords(
                        metadataPrefix='oai_dc',
                        set=setSpec)  #lista os registros

                    print("Coletando dados do conjunto {}, do provedor {} \n".
                          format(setName, provider_name))

                    count = 1

                    for record in records:  #percorre os registros
                        header, metadata, about = record

                        if metadata:
                            #getMap return dictonary with all metadata fields
                            doc = metadata.getMap()

                            doc['_id'] = re.sub('[:/.]', '-',
Example #43
0
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
import json

# NARCIS OAI-PMH endpoint.
URL = 'http://oai.narcis.nl/oai'  #?verb=GetRecord&metadataPrefix=oai_dc&identifier='
registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)

# Dump every oai_dc record as one JSON object per line; records whose
# metadata payload is empty (e.g. deleted records) are skipped.
for header, record, other in client.listRecords(metadataPrefix='oai_dc'):
    if record:
        payload = record.getMap()
        payload['id'] = header.identifier()
        payload['datestamp'] = str(header.datestamp())
        print(json.dumps(payload))
    def retrieval(self, repository):
        """Harvest one OAI-PMH repository and persist the results to disk.

        ``repository`` is indexable: repository[1] is the base URL and
        repository[2] the set to harvest (None harvests the whole
        repository).  Writes files/<name>/metadata.xml plus one XML file
        containing every non-deleted record.  On NoRecordsMatchError the
        repository is queued again in self.unvisited_repository.

        Python 2 code (old-style ``except X, name`` syntax).
        """
        self.logger.info(u'Trying to retrieve url {0}'.format(repository[1]).encode(ENCODE))

        registry = MetadataRegistry()
        registry.registerReader(METADATA, oai_dc_reader)

        try:
            client = Client(repository[1], registry)

            self.logger.info(SEPARATOR)
            self.logger.info(u'Connection established successfully...')

            # identify info
            identify = client.identify()
            repository_name = identify.repositoryName()
            # Normalized name doubles as the on-disk folder/file name.
            repository_name_normalized = re.sub(re.compile(FILE_ESCAPE_CHARS), '', repository_name).strip() \
                .replace(' ', '_').lower()
            base_url = identify.baseURL().encode(ENCODE)
            protocol_version = identify.protocolVersion().encode(ENCODE)
            granularity = identify.granularity().encode(ENCODE)
            compression = identify.compression()
            deleted_record = identify.deletedRecord().encode(ENCODE)

            metadata = {'repository_name': repository_name,
                        'base_url': base_url,
                        'latest_url': repository[1],
                        'protocol_version': protocol_version,
                        'granularity': granularity,
                        'compression': str(compression).strip('[]'),
                        'deleted_record': deleted_record}

            self.logger.info(u'Repository name: {0}'.format(repository_name))
            self.logger.info(u'URL connected: {0}'.format(repository[1]))
            self.logger.info(u'Base URL: {0}'.format(base_url))
            self.logger.info(u'Protocol version: {0}'.format(protocol_version))
            self.logger.info(u'Granularity: {0}'.format(granularity))
            self.logger.info(u'Compression: {0}'.format(compression))
            self.logger.info(u'Deleted record: {0}'.format(deleted_record))

            records_count = 0
            deleted_count = 0
            records_list = list()
            parsed_records_list = list()

            # we're not interested in all sets, so we must iterate over the ones we have and want to crawl
            if repository[2] is not None:
                self.logger.info(u'Fetching set {0}...'.format(repository[2]))
                records_list = client.listRecords(metadataPrefix=METADATA, set=repository[2])
            else:
                records_list = client.listRecords(metadataPrefix=METADATA)
            if records_list is not None:
                # Count every record, tally the deleted ones, and keep the
                # serialized XML of each record that still has metadata.
                for record in records_list:
                    records_count += 1
                    if record[0].isDeleted():
                        deleted_count += 1
                    if record[1] is not None:
                        parsed_records_list.append(tostring(record[1].element()))
                self.logger.info(
                    u'Retrieved {0} records from set {1} where {2} were deleted'.format(records_count, repository[2],
                                                                                        deleted_count))
            if not exists(''.join(['files/', repository_name_normalized, '/'])):
                self.logger.info('Creating storage folder for {0}...'.format(repository_name))
                makedirs(''.join(['files/', repository_name_normalized, '/']))

            self.logger.info(u'Creating storage files...')
            meta_file = open(''.join(['files/', repository_name_normalized, '/metadata.xml']), 'w')
            # NOTE(review): when repository[2] is None these concatenations
            # raise TypeError (None + str), as does the file name join
            # below — confirm a set name is always supplied.
            metadata[repository[2] + '_records_number'] = records_count
            metadata[repository[2] + '_deleted_number'] = deleted_count
            meta_file.write(tostring(dict_to_xml('metadata', metadata)))
            meta_file.close()

            record_file = open(''.join(
                ['files/', repository_name_normalized, '/', repository_name_normalized, '_', repository[2], '.xml']),
                'w')
            record_file.write(''.join(parsed_records_list))
            record_file.close()

        except NoRecordsMatchError, nrme:
            # NOTE(review): repository_name is unbound here if identify()
            # raised before it was assigned — confirm this path only fires
            # after a successful identify.
            self.logger.error(u'{0} on repository {1}'.format(nrme.message, repository_name))

            # add url to unvisited_url and ask retrieval to try to crawl them again
            if nrme.message == 'No matches for the query':
                self.unvisited_repository.append(repository)
Example #45
0
# Top-level setup (Python 2): bind namespace prefixes on the module-level
# graph g, parse the command line, and open the MARC21 record stream.
g.namespace_manager.bind('dc', DC)
g.namespace_manager.bind('dct', DCT)


# Usage: <oai-pmh-provider> <set-name> <namespace-URI>
if len(sys.argv) != 4:
    print >>sys.stderr, "Usage: %s <oai-pmh-provider> <set-name> <namespace-URI>" % sys.argv[0]
    sys.exit(1)

provider, setname, urins = sys.argv[1:]
# Companion "-meta/" namespace derived from the main namespace URI.
metans = urins[:-1] + "-meta/"

g.namespace_manager.bind(metans.split('/')[-2], Namespace(metans))

oai = Client(provider, registry)
#recs = oai.listRecords(metadataPrefix='marc21', set=setname, from_=datetime(2014,10,1))
recs = oai.listRecords(metadataPrefix='marc21', set=setname)

# 3-letter to 2-letter language-code mapping ('fin' -> 'fi', 'swe' -> 'sv').
LANGMAP = {
    'fin': 'fi',
    'swe': 'sv',
}

# temporary dicts to store label/URI mappings between passes
labelmap = {}    # key: prefLabel, val: URIRef
relationmap = {} # key: prefLabel, val: [ (property, prefLabel), ... ]

# NOTE(review): the RELMAP literal continues beyond this chunk.
RELMAP = { # MARC21 control field w value to RDF property + inverse
    'g': (SKOS.broader, SKOS.narrower),
    'h': (SKOS.narrower, SKOS.broader),
#    'a': (DCT.replaces, DCT.isReplacedBy),
#    'b': (DCT.isReplacedBy, DCT.replaces),
def oai_metadata(oai_endpoint):
    """Harvest every oai_dc record from *oai_endpoint* and graph it.

    Returns whatever make_graphs() builds from the record iterator.
    """
    reader_registry = MetadataRegistry()
    reader_registry.registerReader('oai_dc', oai_dc_reader)
    harvester = Client(oai_endpoint, reader_registry)
    records = harvester.listRecords(metadataPrefix='oai_dc')
    return make_graphs(records)
def get_names(dataname):
    """Collect creator/contributor names from the JHN EDM OAI-PMH feed.

    For dataset *dataname*, returns a list of [name, objectId] pairs (one
    per record field that has a value) and prints the bare name list.
    The harvesting loop only runs when the module is executed as a script;
    otherwise the empty lists are returned unchanged.
    """
    prefix = "rdf:RDF/edm:ProvidedCHO"
    # Modidy/add Xpath mappings to get other fields and other objects (agent, place etc)
    # All plain dc:* elements share the same XPath shape, so build those
    # mappings in bulk instead of spelling each one out.
    dc_elements = (
        'title', 'creator', 'subject', 'description', 'publisher',
        'contributor', 'date', 'type', 'format', 'identifier', 'source',
        'language', 'relation', 'coverage', 'rights', 'spatial',
    )
    field_map = {
        name: ('textList', prefix + '/dc:' + name + '/text()')
        for name in dc_elements
    }
    # The record URI lives on the rdf:about attribute, not in an element.
    field_map['objectId'] = ('textList', prefix + '/@rdf:about')

    edm_reader = MetadataReader(
        fields=field_map,
        namespaces={
            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms/',
            'dct': 'http://purl.org/dc/terms/',
            'edm': 'http://www.europeana.eu/schemas/edm/',
            'foaf': 'http://xmlns.com/foaf/0.1/',
            'owl': 'http://www.w3.org/2002/07/owl#',
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
            'skos': 'http://www.w3.org/2004/02/skos/core#',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'ore': 'http://www.openarchives.org/ore/terms/'
        })

    names = []
    identifier = []

    if __name__ == "__main__":

        URL = 'https://data.jhn.ngo/oai'

        registry = MetadataRegistry()
        registry.registerReader('edm', edm_reader)
        client = Client(URL, registry)
        # To harvest specific dataset, use "set" parameter: set='AIUJE1_MARC21'

        for item in client.listRecords(metadataPrefix='edm', set=dataname):
            record_map = item[1].getMap()

            # creator first, then contributor — same order as before.
            for role in ('creator', 'contributor'):
                values = record_map[role]
                if values != []:
                    names.append([values[0]])
                    identifier.append([values[0], record_map['objectId'][0]])

    print(names)

    return identifier
Example #48
0
def oai_metadata(oai_endpoint):
    # Wire an oai_dc reader onto the endpoint, then graph all its records.
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    return make_graphs(
        Client(oai_endpoint, registry).listRecords(metadataPrefix='oai_dc'))
def transfer_experiment(source):
    """
    Pull public experiments from source into current mytardis.

    ``source`` is the base URL (scheme://host) of the producing MyTardis.
    Verifies the feed's identity, lists its public experiments over
    OAI-PMH, and for each one fetches public state, owner ACLs, METS
    metadata and the experiment key; experiments whose key already exists
    locally are skipped as duplicates.  Returns the list of local
    experiment ids created, or None when no records match or the key
    service is not ready yet.

    Raises ReposReadError, OAIPMHError, BadAccessError or MetsParseError
    on the corresponding failure.
    """

    #TODO: Cleanup error messages
    #TODO: does not transfer licences as not part of METS format.
    #NOTE: As this is a pull we trust the data from the other tardis
    # Check identity of the feed
    from oaipmh.client import Client
    from oaipmh import error
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    from django.core.cache import cache
    from django.utils.hashcompat import md5_constructor as md5

    # The cache key consists of the task name and the MD5 digest
    # of the feed URL.
    cache_key = md5("token").hexdigest()
    lock_id = "%s-lock-%s" % ("consume_experiment", cache_key)
    LOCK_EXPIRE = 60 * 5
    # cache.add fails if if the key already exists
    acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE)
    # memcache delete is very slow, but we have to use it to take
    # advantage of using add() for atomic locking
    release_lock = lambda: cache.delete(lock_id)

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    source_url = "%s/apps/oaipmh/?verb=Identify" % source

    client = Client(source_url, registry)
    try:
        identify = client.identify()
    except AttributeError as e:
        msg = "Error reading repos identity: %s:%s" % (source, e)
        logger.error(msg)
        raise ReposReadError(msg)
    except error.ErrorBase as e:
        msg = "OAIPMH error: %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except URLError as e:
        logger.error(e)
        raise
    # The feed must report the same host we asked for, otherwise someone
    # could serve us records on behalf of another repository.
    repos = identify.baseURL()
    import urlparse
    repos_url = urlparse.urlparse(repos)
    dest_name = "%s://%s" % (repos_url.scheme, repos_url.netloc)
    if dest_name != source:
        msg = "Source directory reports incorrect name: %s" % dest_name
        logger.error(msg)
        raise BadAccessError(msg)
    # Get list of public experiments at sources
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(
        source + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc",
        registry)
    try:
        exps_metadata = [
            meta for (header, meta,
                      extra) in client.listRecords(metadataPrefix='oai_dc')
        ]
    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        msg = "no public records found on source %s" % e
        logger.warn(msg)
        return

    local_ids = []
    for exp_metadata in exps_metadata:
        exp_id = exp_metadata.getField('identifier')[0]
        user = exp_metadata.getField('creator')[0]

        found_user = _get_or_create_user(source, user)

        #make sure experiment is publicish
        try:
            xmldata = getURL("%s/apps/reposproducer/expstate/%s/" %
                             (source, exp_id))
        except HTTPError as e:
            msg = "cannot get public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        try:
            exp_state = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        # 'not ... in' replaced with the idiomatic 'not in'.
        if exp_state not in [
                Experiment.PUBLIC_ACCESS_FULL,
                Experiment.PUBLIC_ACCESS_METADATA
        ]:
            # BUG FIX: the original string had no conversion specifier, so
            # the '%' raised "not all arguments converted" instead of
            # producing the intended message.
            msg = 'cannot ingest private experiment %s.' % exp_id
            logger.error(msg)
            raise BadAccessError(msg)

        # Get the usernames of isOwner django_user ACLs for the experiment
        try:
            xmldata = getURL("%s/apps/reposproducer/acls/%s/" %
                             (source, exp_id))

        except HTTPError as e:
            msg = "Cannot get acl list of experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)
        try:
            acls = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse acl list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        owners = []
        for acl in acls:
            if acl['pluginId'] == 'django_user' and acl['isOwner']:
                user = _get_or_create_user(source, acl['entityId'])
                owners.append(user.username)
            else:
                # FIXME: skips all other types of acl for now
                pass

        # Get the METS for the experiment
        metsxml = ""
        try:
            metsxml = getURL("%s/experiment/metsexport/%s/?force_http_urls" %
                             (source, exp_id))
            #metsxml = getURL("%s/experiment/metsexport/%s/"
            #% (source, exp_id))

        except HTTPError as e:
            msg = "cannot get METS for experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)

        # load schema and parametername for experiment keys
        try:
            key_schema = Schema.objects.get(namespace=settings.KEY_NAMESPACE)
        except Schema.DoesNotExist as e:
            msg = "No ExperimentKeyService Schema found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            key_name = ParameterName.objects.get(name=settings.KEY_NAME)
        except ParameterName.DoesNotExist as e:
            msg = "No ExperimentKeyService ParameterName found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            xmldata = getURL("%s/apps/reposproducer/key/%s/" %
                             (source, exp_id))
        except HTTPError as e:
            msg = "cannot get key of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not xmldata:
            logger.warn(
                "Unable to retrieve experiment %s key.  Will try again later" %
                exp_id)
            return

        try:
            key_value = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse key list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not key_value:
            logger.warn(
                "Unable to retrieve experiment %s key value.  Will try again later"
                % exp_id)
            return

        logger.debug("retrieved key %s from experiment %s" %
                     (key_value, exp_id))
        exps = Experiment.objects.all()

        got_lock = True
        if not acquire_lock():
            logger.warning("another worker has access to consume experiment")
            return

        # Duplicate detection: an experiment whose key parameter matches
        # the remote key has already been ingested.
        duplicate_exp = 0
        for exp in exps:
            #logger.warn("exp = %s" % exp.id)
            params = ExperimentParameter.objects.filter(
                name=key_name,
                parameterset__schema=key_schema,
                parameterset__experiment=exp)
            #logger.warn("params.count() = %s" % params.count())
            if params.count() >= 1:
                key = params[0].string_value
                if key == key_value:
                    duplicate_exp = exp.id
                    #logger.warn("found duplicate for %s" % duplicate_exp)
                    break

        if duplicate_exp:
            logger.warn(
                "Found duplicate experiment form %s exp %s to  exp %s" %
                (source, exp_id, duplicate_exp))
            if got_lock:
                release_lock()
            return

        # TODO: Need someway of updating and existing experiment.  Problem is
        # that copy will have different id from original, so need unique identifier
        # to allow matching

        # We have not pulled everything we need from producer and are ready to create
        # experiment.

        # Make placeholder experiment and ready metadata
        e = Experiment(
            title='Placeholder Title',
            approved=True,
            created_by=found_user,
            public_access=exp_state,
            locked=False  # so experiment can then be altered.
        )
        e.save()

        # store the key
        #eps, was_created = ExperimentParameterSet.objects.\
        #    get_or_create(experiment=e, schema=key_schema)
        #if was_created:
        #    logger.warn("was created")
        #ep, was_created = ExperimentParameter.objects.get_or_create(parameterset=eps,
        #    name=key_name,
        #    string_value=key_value)
        #if was_created:
        #    logger.warn("was created again")
        #ep.save()

        if got_lock:
            release_lock()

        local_id = e.id
        filename = path.join(e.get_or_create_directory(), 'mets_upload.xml')
        f = open(filename, 'wb+')
        f.write(metsxml)
        f.close()

        # Ingest this experiment META data and isOwner ACLS
        eid = None
        try:
            eid, sync_path = _registerExperimentDocument(filename=filename,
                                                         created_by=found_user,
                                                         expid=local_id,
                                                         owners=owners)
            logger.info('=== processing experiment %s: DONE' % local_id)
        except:
            # FIXME: what errors can mets return?
            msg = '=== processing experiment %s: FAILED!' \
                % local_id
            logger.error(msg)
            raise MetsParseError(msg)

        # FIXME: if METS parse fails then we should go back and delete the placeholder experiment

        exp = Experiment.objects.get(id=eid)

        # so that tardis does not copy the data
        for datafile in exp.get_datafiles():
            datafile.stay_remote = True
            datafile.save()

        #import nose.tools
        #nose.tools.set_trace()
        # FIXME: reverse lookup of URLs seem quite slow.
        # TODO: put this information into specific metadata schema attached to experiment
        exp.description += get_audit_message(source, exp_id)
        exp.save()

        local_ids.append(local_id)
    return local_ids
Example #50
0
    es.index(
        index="hhs",
        doc_type="oai",
        id=record['id'],
        body={
            "title": record['title'],
            "url": getUrl(record['url']),
            "genre": record['genre'],
            "name": _getNames(record['name']),
            "language": record['language'],
            "topics": record['topics'],
            "abstract": record['abstract'],
            "date": datestamp,
        }
    )

# Top-level harvesting loop (Python 2): walk every MODS record from the
# module-level OAI client and fetch a thumbnail for each record that still
# carries metadata.  Each 'record' is a (header, metadata, about) tuple.
for record in client.listRecords(metadataPrefix='mods'):
    #print record
    if record[1] is not None:
        # record[0] is the OAI header, record[1] the metadata payload;
        # 'record' is rebound to the plain field map below.
        datestamp = record[0].datestamp()
        record = record[1].getMap()

        print datestamp, record
        #print {record['title']}, {record['url'][1]}, record['genre'], ', '.join(record['name']), record['language'], ', '.join(record['topics']), record['abstract']

        doc_url = getUrl(record['url'])
        if doc_url is not None:
            getThumb(doc_url)
        #esIndex(record, datestamp)
        #raw_input("Press Enter to continue...")
def harvest(metadata_set, dest_folder, log_file, content_type,
            from_date, until_date):
    """Harvest EFG OAI-PMH records for one set and save them as XML files.

    Records are downloaded from the EFG endpoint, filtered on the
    "IMediaCities" keyword (and, optionally, on *content_type*), then each
    surviving record is written to *dest_folder* as
    ``<set>_<sourceID>_<timestamp>.xml``.  A JSON report of the run is
    dumped to *log_file*.

    Parameters:
        metadata_set:  OAI set name to harvest; must exist on the endpoint.
        dest_folder:   directory for the XML output (created if missing).
        log_file:      path of the JSON report file (overwritten at the end).
        content_type:  if not None, keep only records whose item/type
                       matches it (case-insensitive).
        from_date:     optional start of the harvest window; parsed by
                       ``parse_date``.
        until_date:    optional end of the harvest window; parsed by
                       ``parse_date``.
    """

    #############################
    # ### FILESYSTEM CHECKS ### #
    #############################
    try:
        if not os.path.isdir(dest_folder):
            os.makedirs(dest_folder)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create destination folder: %s" % dest_folder)

    # Verify write permission inside the folder by creating and removing
    # a scratch directory.
    try:
        test_path = os.path.join(dest_folder, '__test_permissions__')
        os.makedirs(test_path)
        os.rmdir(test_path)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to use destination folder: %s" % dest_folder)

    # Make sure the report file is writable before doing any real work.
    try:
        log_handle = open(log_file, 'a+')
        log_handle.close()
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create log_file: %s" % log_file)

    #################################
    # ### OAI-PMH CONFIGURATION ### #
    #################################
    URL = 'https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do'
    metadata_prefix = 'efg'

    ###################################
    # ### OPEN OAI-PMH CONNECTION ### #
    ###################################
    registry = MetadataRegistry()
    registry.registerReader(metadata_prefix, oai_dc_reader)

    client = Client(URL, registry)

    ####################################
    # ### CHECK IF THIS SET EXISTS ### #
    ####################################
    set_found = False
    for s in client.listSets():
        if metadata_set == s[0]:
            set_found = True

    if not set_found:
        log.exit("Unable to find this set: %s" % metadata_set)

    #############################
    # ### RETRIEVE METADATA ### #
    #############################

    if from_date is not None:
        from_date = parse_date(from_date)
        if from_date is None:
            log.exit("Unable to convert from date")

    if until_date is not None:
        until_date = parse_date(until_date)
        if until_date is None:
            log.exit("Unable to convert until date")

    # Normalize the requested content type once, outside the record loop
    # (the original re-lowered it on every record).
    if content_type is not None:
        content_type = content_type.lower()

    report_data = {
        'downloaded': 0,
        'filtered': 0,
        'saved': 0,
        'saved_files': [],
        'missing_sourceid': [],
        'wrong_content_type': []
    }
    timestamp = int(1000 * time.time())
    log.info("Retrieving records for %s..." % metadata_set)
    try:
        records = client.listRecords(
            metadataPrefix=metadata_prefix,
            set=metadata_set,
            from_=from_date,
            until=until_date)
    except NoRecordsMatchError as e:
        log.exit(e)

    log.info("Records retrieved, extracting...")
    try:

        for record in records:
            # The obtained eTree is namespaced XML; see
            # "Parsing XML with Namespaces",
            # https://docs.python.org/2/library/xml.etree.elementtree.html
            element = record[1].element()

            report_data['downloaded'] += 1

            # Progress indicator: one dot per 100 records, a summary line
            # every 5000.
            if report_data['downloaded'] % 100 == 0:
                print('.', end='', flush=True)

                if report_data['downloaded'] % 5000 == 0:
                    print(
                        ' %s downloaded - %s saved' % (
                            report_data['downloaded'],
                            report_data['saved']
                        ), flush=True)

            efgEntity = element.find(tag("efgEntity"))
            if efgEntity is None:
                # log.warning("efgEntity not found, skipping record")
                continue
            avcreation = efgEntity.find(tag("avcreation"))
            nonavcreation = efgEntity.find(tag("nonavcreation"))

            # AV and non-AV creations carry the same information under
            # differently named child elements.
            if avcreation is not None:
                manifestation = avcreation.find(tag("avManifestation"))
                recordSource = avcreation.find(tag("recordSource"))
                keywords = avcreation.findall(tag("keywords"))
                title_el = avcreation.find(tag("identifyingTitle"))
                title = (title_el.text
                         if title_el is not None
                         else "Unknown title")
            elif nonavcreation is not None:
                manifestation = nonavcreation.find(tag("nonAVManifestation"))
                recordSource = nonavcreation.find(tag("recordSource"))
                keywords = nonavcreation.findall(tag("keywords"))
                title_el = nonavcreation.find(tag("title"))
                title = (title_el.find(tag("text")).text
                         if title_el is not None
                         else "Unknown title")
            else:
                title = "Unknown title"
                # log.warning("(non)avcreation not found, skipping record")
                continue

            # Keep only records tagged with the project keyword.
            filter_keyword = "IMediaCities"
            is_good = False
            for keyword in keywords:
                term = keyword.find(tag("term"))
                if term.text == filter_keyword:
                    is_good = True
                    break

            if not is_good:
                continue

            report_data['filtered'] += 1

            if manifestation is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("avManifestation not found, skipping record")
                continue

            if content_type is not None:
                item = manifestation.find(tag("item"))
                if item is None:
                    # missing <item> => type cannot be found
                    report_data['wrong_content_type'].append(title)
                    continue

                item_type = item.find(tag("type"))
                if item_type is None:
                    # missing <type>
                    report_data['wrong_content_type'].append(title)
                    continue

                if item_type.text.lower() != content_type:
                    # wrong type
                    report_data['wrong_content_type'].append(title)
                    continue

            # NOTE: the sourceID must be taken from the recordSource that
            # sits under avcreation/nonavcreation, NOT the one under
            # avManifestation/nonAVManifestation.
            if recordSource is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("recordSource not found, skipping record")
                continue

            sourceID = recordSource.find(tag("sourceID"))
            if sourceID is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("sourceID not found, skipping record")
                continue

            content = etree.tostring(efgEntity, pretty_print=False)

            # Replace non alpha-numeric characters with a dash so the
            # sourceID is safe to embed in a file name.
            id_text = re.sub(r'[\W_]+', '-', sourceID.text.strip())

            filename = "%s_%s_%s.xml" % (
                metadata_set,
                id_text,
                timestamp
            )
            filepath = os.path.join(dest_folder, filename)
            with codecs.open(filepath, 'wb', "utf-8") as f:
                f.write(content.decode('utf-8'))

            report_data['saved'] += 1
            report_data['saved_files'].append(filename)

    except NoRecordsMatchError as e:
        # Filtering may exhaust the resumption tokens before the server
        # reports the end of the list.
        log.warning("No more records after filtering?")
        log.warning(e)

    # Write the report file: harvest counters plus the titles of records
    # missing a sourceID or having the wrong content type.  (The explicit
    # f.close() after the with-block was redundant and has been removed.)
    with open(log_file, 'w+') as f:
        json.dump(report_data, f)

    # Just to close previous dot line
    print("")

    log.info("""

%s records from set [%s] downloaded
open log file [%s] for details
""" % (report_data['saved'], metadata_set, log_file)
    )
Example #52
0
def oaiSpider(subject="hep-ex", section="physics", start=None, end=None, sleep_time=0):
    '''
    Pull articles using the Open Archives Initiative protocol.

    subject    - String defining the subset of the main section
    section    - String defining the main section (typically physics or nothing)
    start      - A datetime.datetime object restricting the starting date of returned articles
    end        - A datetime.datetime object restricting the ending date of the returned articles
    sleep_time - Seconds to wait between the record queries
                 (passed to time.sleep, which takes seconds, not ms)

    Examples

       oaiSpider("hep-ex", "physics")
       ==> returns all HEP experiment articles

       oaiSpider("cs", "", datetime(2011, 6, 24))
       ==> returns all computer science articles submitted after June 24th, 2011

       oaiSpider("hep-ph", "physics", None, datetime(2011, 6, 24))
       ==> returns all HEP phenomenology articles submitted before June 24th, 2011

    Returns a list of dictionaries containing the article metadata
    '''

    from oaipmh.client import Client
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    base_url = "http://export.arxiv.org/oai2"
    output = []

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    # Ask the server for its datestamp granularity so from_/until values
    # are formatted the way it expects.
    client.updateGranularity()

    # Normalize the section into a "section:" prefix (possibly empty).
    # BUG FIX: the original compared with '== None'; identity check is
    # the correct idiom for None.
    if section is None:
        section = ""
    if len(section) > 0 and section[-1] != ":":
        section += ":"

    # NOTE(review): pyoai has been reported to error out with some
    # from_/until values against this endpoint.
    records = client.listRecords(metadataPrefix='oai_dc'
                                 , set='%s%s' % (section, subject)
                                 , from_=start
                                 , until=end
                                 )

    counter = 0

    for (header, metadata, aux) in records:

        print(counter)

        output.append({"title"    : cleanText(metadata["title"][0]),
                       "abstract" : cleanText(metadata["description"][0]),
                       "date"     : convertDate(max(metadata["date"])),
                       "subject"  : subject,
                       "url"      : metadata["identifier"][0],
                       "authors"  : "; ".join( metadata['creator']),
                       })

        print(output[-1])
        counter += 1

        # Be polite to the arXiv servers between successive record fetches.
        time.sleep(sleep_time)

    return output
Example #53
0
    """Returns the PyMARC record from the OAI structure for MARC XML"""
    def __call__(self, element):
        """Convert one OAI metadata element into a pymarc Record."""
        # Debug output (py2 print statement): text of the second child of
        # the element's first sub-element.
        print element[0][1].text
        handler = marcxml.XmlHandler()
        # Serialize the MARCXML fragment back to a string and re-parse it
        # with pymarc's SAX handler, which collects Record objects.
        marcxml.parse_xml(StringIO(tostring(element[0])), handler)
        # Exactly one record is expected per OAI element here.
        return handler.records[0]


marcxml_reader = MARCXMLReader()

# Defining of metadata Readers in the Registry

from oaipmh import metadata

registry = metadata.MetadataRegistry()
registry.registerReader('marc21', marcxml_reader)

#### OAI-PMH Client processing

oai = Client('http://snape.mzk.cz/OAI-script', registry)

recs = oai.listRecords(metadataPrefix='marc21', set='MZK03')

for rec in recs:
    print rec[0].identifier()
    r = rec[1]  # Get pyMARC representation
    print r['856']
    print r['034']
    print r['008']
    print
Example #54
0
# Register the MARCXML reader for the 'marc21' prefix.
registry = metadata.MetadataRegistry()
registry.registerReader("marc21", marcxml_reader)

# RDF graph that will collect the converted authority data.
g = Graph()
g.namespace_manager.bind("skos", SKOS)
g.namespace_manager.bind("cn", CN)
g.namespace_manager.bind("dc", DC)
g.namespace_manager.bind("dct", DCT)
g.namespace_manager.bind("rdaa", RDAA)
g.namespace_manager.bind("rdac", RDAC)


oai = Client("https://fennica.linneanet.fi/cgi-bin/oai-pmh-fennica-asteri-aut.cgi", registry)

# recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames', from_=datetime(2013,1,1))
recs = oai.listRecords(metadataPrefix="marc21", set="corporateNames")

# Preload the language-guess cache: one "lang<TAB>text" entry per line,
# an empty lang meaning "unknown" (stored as None).
# IMPROVEMENT: use a context manager so the file is closed even if a
# malformed line raises during parsing (the original used open/close).
lang_cache = {}
with codecs.open(LANG_CACHE_FILE, "r", "utf-8") as lcf:
    for line in lcf:
        lang, text = line.rstrip("\r\n").split("\t")
        if lang == "":
            lang = None
        lang_cache[text] = lang

label_to_uri = {}


def guess_language(text):
    """return the most likely language for the given unicode text string"""
Example #55
0
def indexCollection(URL, url_base, metadata_prefix, collection, action):
    """Harvest one OAI-PMH collection and bulk-index it into Elasticsearch.

    Parameters:
        URL:             OAI-PMH endpoint to harvest from.
        url_base:        base URL used to build each record's IIIF image URL.
        metadata_prefix: metadata format to request (e.g. 'oai_dc').
        collection:      OAI set spec naming the collection.
        action:          'reindex' drops and recreates the target index with
                         an explicit mapping before indexing.

    Returns:
        The string "success" once the bulk indexing has been submitted.
    """
    # pull data from OAI endpoint
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry, force_http_get=True)

    harvested_data = []
    for record in client.listRecords(metadataPrefix=metadata_prefix,
                                     set=collection):
        if not record[0].isDeleted():
            fields = record[1].getMap()
            # The first 'subject' value is a ';'-separated string; expose
            # it as a list under 'subjects' instead.
            if fields['subject']:
                fields['subjects'] = fields['subject'][0].split(';')
                del fields['subject']
            fields['set'] = record[0].setSpec()
            # OAI identifiers look like oai:<domain>:<id>; keep the id part.
            identifier = record[0].identifier().split(':')[2]
            fields[
                'image_url_base'] = url_base + '/digital/iiif/' + identifier + '/'
            harvested_data.append(fields)

    # BUG FIX: the original used "action is 'reindex'", which tests object
    # identity and only works by accident of string interning (and is a
    # SyntaxWarning on Python 3.8+); value equality is intended.
    if action == 'reindex':
        es.indices.delete(index='digital_collection_recs', ignore=[400, 404])

        mapping = {
            "mappings": {
                "_doc": {
                    "properties": {
                        "title": {
                            "type": "text"
                        },
                        "creator": {
                            "type": "text"
                        },
                        "subjects": {
                            "type": "text"
                        },
                        "description": {
                            "type": "text"
                        },
                        "publisher": {
                            "type": "text"
                        },
                        "contributor": {
                            "type": "text"
                        },
                        "date": {
                            "type": "text"
                        },
                        "type": {
                            "type": "text",
                            "fielddata": "true"
                        },
                        "format": {
                            "type": "text",
                            "fielddata": "true"
                        },
                        "identifier": {
                            "type": "text"
                        },
                        "source": {
                            "type": "text"
                        },
                        "language": {
                            "type": "text",
                            "fielddata": "true"
                        },
                        "relation": {
                            "type": "text"
                        },
                        "coverage": {
                            "type": "text"
                        },
                        "rights": {
                            "type": "text"
                        },
                        "set": {
                            "type": "text",
                            "fielddata": "true"
                        },
                        "image_url_base": {
                            "type": "text"
                        }
                    }
                }
            }
        }
        es.indices.create(index='digital_collection_recs', body=mapping)

    helpers.bulk(es,
                 harvested_data,
                 index='digital_collection_recs',
                 doc_type='_doc')

    return "success"
Example #56
0
# arXiv OAI url we will query
URL = "http://export.arxiv.org/oai2"
# Create OAI client; now we're all set for listing some records
client = Client(URL, registry)

# Open files for writing (the abstract output file is currently disabled)
titlef = open(title_file, 'w')
#abstractf = open(abstr_file, 'w')

# Keep track of run-time and number of papers
start_time = time.time()
count = 0

# Harvest
for record in client.listRecords(metadataPrefix='oai_dc', set=section):
    try:
        # Extract the title
        title = record[1].getField('title')[0]
        # Extract the abstract
        abstract = record[1].getField('abstract')[0]
        # And get the date (this is stored as yyyy-mm-dd in the arXiv metadata)
        date = record[1].getField('date')[0]
        year = int(date[0:4])
        month = int(date[5:7])

        # Write to file (add year info to the titles)
        titlef.write("%d %d " % (year, month) + title + "\n")
        #    abstractf.write(abstract + "\n")

        count += 1
Example #57
0
from rdflib import URIRef, Graph, Literal, Namespace
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

easy_id = Namespace('https://easy.dans.knaw.nl/ui/datasets/id/easy-dataset:')

def easy_url(oai_id):
    """Resolve an EASY OAI record identifier to its dataset URI.

    Splits off the trailing dataset id, verifies the remaining namespace
    prefix, and looks the id up in the ``easy_id`` namespace.
    """
    prefix, dataset = oai_id.rsplit(':', 1)
    if prefix == 'oai:easy.dans.knaw.nl:easy-dataset':
        return easy_id[dataset]
    raise Exception("Unknown namespace: {0}".format(prefix))

# Harvest the whole EASY repository as Dublin Core and build an RDF graph.
registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client('http://easy.dans.knaw.nl/oai/', registry)
graph = Graph()
graph.namespace_manager.bind('dc11', 'http://purl.org/dc/elements/1.1/')
dc11 = Namespace('http://purl.org/dc/elements/1.1/')
# max_count = 30000
for count, (header, metadata, _) in enumerate(client.listRecords(metadataPrefix='oai_dc')):
    # if count >= max_count:
    #   break
    # metadata is None for deleted records; skip them.
    if metadata is not None:
        metadata_fields = metadata.getMap().iteritems()  # NOTE: py2-only .iteritems()
        s = easy_url(header.identifier())
        # Each DC field name becomes a predicate; each value a literal.
        for p, vv in metadata_fields:
            for v in vv:
                graph.add((s, dc11[p], Literal(v)))

# Dump everything collected above as N-Triples.
graph.serialize('easy-lod.nt', format='nt')
Example #58
0
#!/usr/bin/env python

# Dependencies
# pioai - OAI-PMH Python Module - http://infrae.com/download/OAI/pyoai

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

oaiSourceURL = 'http://digitalrepository.unm.edu/do/oai/'

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(oaiSourceURL, registry)

# Dump the raw repr of each harvested record, one per line.
# NOTE(review): pyoai's listRecords() has no documented 'max' argument;
# confirm this kwarg is actually accepted by the client version in use.
with open("output.txt","w") as outfile:
    for record in client.listRecords(metadataPrefix='oai_dc', max=10):
        outfile.write(repr(record)+"\n")