def list_records(target, date_from, date_until, setspec):
    #logging.debug("list_records")
    if target is not None:
        client = Client(target['url'], registry)
        # todo : clean this, find simplified cases
        if date_from is not None and date_until is not None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, until=date_until, set=setspec)
        elif date_from is not None and date_until is not None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, until=date_until)
        elif date_from is not None and date_until is None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, set=setspec)
        elif date_from is None and date_until is not None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], until=date_until, set=setspec)
        elif date_from is not None and date_until is None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from)
        elif date_from is None and date_until is not None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], until=date_until)
        elif date_from is None and date_until is None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], set=setspec)
        elif date_from is None and date_until is None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'])
        if records is not None:
            for record in records:
                yield convert_record(record, target['metadata_prefix'], target['title'])

def list_records(target, date_from, date_until, setspec):
    if target is not None:
        client = Client(target['url'], registry)
        # todo : clean this, find simplified cases
        if date_from is not None and date_until is not None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, until=date_until, set=setspec)
        elif date_from is not None and date_until is not None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, until=date_until)
        elif date_from is not None and date_until is None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, set=setspec)
        elif date_from is None and date_until is not None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], until=date_until, set=setspec)
        elif date_from is not None and date_until is None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from)
        elif date_from is None and date_until is not None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], until=date_until)
        elif date_from is None and date_until is None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], set=setspec)
        elif date_from is None and date_until is None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'])
        results = []
        if records is not None:
            for record in records:
                results.append(convert_record(record, target['metadata_prefix'], target['title']))
        return results

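The combinatorial if/elif chain above (flagged by its own "todo : clean this" comment) can be collapsed by building the optional selective-harvesting arguments as a dict and unpacking them into listRecords, the same pattern the scrape() example further down uses. A minimal, untested sketch, assuming the same module-level registry and convert_record helpers as the variants above:

# Possible simplification of list_records; a sketch, not taken from the original repositories.
def list_records(target, date_from=None, date_until=None, setspec=None):
    if target is None:
        return
    client = Client(target['url'], registry)
    kwargs = {'metadataPrefix': target['metadata_prefix']}
    # only pass the selective-harvesting arguments that were actually given
    if date_from is not None:
        kwargs['from_'] = date_from
    if date_until is not None:
        kwargs['until'] = date_until
    if setspec is not None:
        kwargs['set'] = setspec
    for record in client.listRecords(**kwargs):
        yield convert_record(record, target['metadata_prefix'], target['title'])
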
def scrape(self):
    raise Exception("not finished")
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    url = self.setting('pmh-endpoint')
    client = Client(url, registry)

    print "  OAI Repository", url
    print "  Available sets:"
    for s in client.listSets():
        print "   ", s

    oai_set = self.setting('set')
    oai_from = self.setting('from')
    oai_until = self.setting('until')

    kwargs = {}

    if oai_set:
        kwargs['set'] = oai_set

    if oai_from is not None:
        date_args = [int(arg) for arg in oai_from.split("-")]
        kwargs['from_'] = datetime.datetime(*date_args)

    if oai_until is not None:
        date_args = [int(arg) for arg in oai_until.split("-")]
        kwargs['until'] = datetime.datetime(*date_args)

    records = [r for r in client.listRecords(metadataPrefix='oai_dc', **kwargs)]

    data_filepath = os.path.join(self.work_dir(), self.setting('data-file'))
    with open(data_filepath, 'wb') as f:
        print "  pickling", len(records), "records"
        pickle.dump(records, f)

def insertAll(time, time2):
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    client.updateGranularity()
    list = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for a in list:
        #a = list.next()
        try:
            title = '\n'.join(a[1]['title'])
            sr2 = str(' '.join(a[1]['categories']).replace('-', '_')).split(' ')
            abstract = '\n'.join(a[1]['abstract'])
            url = 'http://arxiv.org/abs/' + a[1]['id'][0]
            date = datetime.strptime(a[1]['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = a[1]['authors'][0]  # '; '.join(a[1]['keynames'])
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print title
            print sr2
            print abstract
            print url
            print date
            print authors
            insert(title + ' (' + authors + ')', str("fullarxiv"), url, abstract, date=date, cross_srs=sr2)
        except:
            print 'ERROR'
            print a
            errors = errors + 1
    print 'Completed with %s errors' % errors

def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    base_url = "http://export.arxiv.org/oai2"
    output = list()

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    records = client.listRecords(metadataPrefix='oai_dc', set="{}".format(subject), from_=start, until=end)

    for _, md, _ in records:
        # print md.getField("title")
        # checks for the case in 2010 when there is no title for something
        if md is not None:
            txt_dict = {"title": md["title"],
                        "abstract": md["description"],
                        "date": md["date"],
                        "subject": md["subject"],
                        "url": md["identifier"],
                        "authors": md['creator']}
            output.append(txt_dict)

        time.sleep(sleep_time)

    return output

def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, metadata_registry)
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record

def pull_data(source):
    list_of_records = []
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    # Get list of public experiments at sources
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(source + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc", registry)
    try:
        exps_date = []
        exps_metadata = []
        for (header, meta, extra) in client.listRecords(metadataPrefix='oai_dc'):
            exps_date.append(str(header._datestamp))
            exps_metadata.append(meta)
            logger.debug('Date=%s' % header._datestamp)
    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        msg = "no public records found on source %s" % e
        logger.warn(msg)
        return

    exp_counter = 0
    for exp_metadata in exps_metadata:
        user_id = exp_metadata.getField('creator')[0]
        user_profile = json.loads(_get_user(source, user_id))
        data_tobe_indexed = dict(user_profile)
        data_tobe_indexed['user_id'] = user_id

        exp_id = exp_metadata.getField('identifier')[0]
        description = exp_metadata.getField('description')[0]
        title = exp_metadata.getField('title')[0]
        if settings.EXPERIMENT_PATH[0] == '/':
            settings.EXPERIMENT_PATH = settings.EXPERIMENT_PATH[1:]
        experiment_url = os.path.join(source, settings.EXPERIMENT_PATH % exp_id)
        data_tobe_indexed['experiment_id'] = exp_id
        data_tobe_indexed['experiment_title'] = title
        data_tobe_indexed['experiment_description'] = description
        data_tobe_indexed['experiment_url'] = experiment_url
        data_tobe_indexed['id'] = experiment_url
        data_tobe_indexed['experiment_date'] = exps_date[exp_counter]
        exp_counter += 1

        for k, v in data_tobe_indexed.items():
            logger.debug('%s = %s' % (k, v))
        logger.debug('')
        list_of_records.append(json.dumps(data_tobe_indexed))
    return list_of_records

def harvest_oai_collection_records(self, collection):
    records = []
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(collection.community.repository.base_url, registry)
        records = client.listRecords(metadataPrefix='oai_dc', set=collection.identifier)
    except:
        return
    return records

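The bare except above also swallows configuration and network errors. If only the empty-result case is expected, pyoai's NoRecordsMatchError (which several of the other snippets here import from oaipmh.error) can be caught explicitly instead. A sketch under the same assumptions about the collection object, not taken from the original project:

# Variant that lets unexpected errors propagate and only treats "no records" as empty.
# Assumes: from oaipmh.error import NoRecordsMatchError
def harvest_oai_collection_records(self, collection):
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(collection.community.repository.base_url, registry)
    try:
        return list(client.listRecords(metadataPrefix='oai_dc', set=collection.identifier))
    except NoRecordsMatchError:
        return []
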
def index_documents(main_url, database_name, url, reader, prefix, format):
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        r = record[1]
        value = format(r, record[0].identifier())
        if value != None:
            return_stuff.append(value)
        if len(return_stuff) >= 10000:
            sync_files(main_url, database_name, return_stuff)
            return_stuff = []
    sync_files(main_url, database_name, return_stuff)

def read_base_records(self):
    registry = MetadataRegistry()
    registry.registerReader('base_dc', base_dc_reader)
    client = Client('http://doai.io/oai', registry)
    for header, record, _ in client.listRecords(metadataPrefix='base_dc'):
        # only process records for which base was unsure
        if '2' not in record['oa']:
            continue
        # extract splash_url
        for link in record['identifier']:
            metadata = {'base_oa': ''.join(record['oa']),
                        'splash_url': link,
                        'from_identifier': header.identifier()}
            yield self.filter_url(link, metadata, looking_for='any')

def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.
    """
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")

    #
    # Set up metadata readers
    #
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)
    # registry.registerReader('rdf', rdf_reader)    # no reader yet
    # registry.registerReader('ore', ore_reader)    # no reader yet
    # registry.registerReader('mets', mets_reader)  # no reader yet

    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc', from_=start, until=end, set=set)

    for (h, m, a) in records:
        print h, m, a
        if not m:
            sys.stderr.write("o")
            continue
        total = total + 1
        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue
        r = dict({'handle': handle[0]})
        for key in qdc_reader._fields.keys():
            r[key] = m.getField(key)
        RECORDS.append(r)
        sys.stderr.write('.')
        sys.stderr.flush()
        num = num + 1

    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n')

    if options.store:
        pickle.dump(RECORDS, open(options.store, "wb"))

def update(self, from_date=None):
    self._log.info('Harvesting oai server: %s' % self._url)
    registry = MetadataRegistry()
    registry.registerReader(self._prefix, lambda el: el)
    client = Client(self._url, registry)
    try:
        for header, element, about in client.listRecords(
                metadataPrefix=self._prefix, from_=from_date):
            added = self._process_record(header, element)
            if added:
                yield self._get_id(header)
    except NoRecordsMatchError:
        pass
    super(OAIBasedContentProvider, self).update()

def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl))
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record

def processItems():
    oai_oi_reader = MetadataReader(
        fields={
            'title': ('textList', 'oai_oi:oi/oi:title/text()'),
            'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
            'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
            'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
            'description': ('textList', 'oai_oi:oi/oi:description/text()'),
            'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
            'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
            'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
            'date': ('textList', 'oai_oi:oi/oi:date/text()'),
            'type': ('textList', 'oai_oi:oi/oi:type/text()'),
            'extent': ('textList', 'oai_oi:oi/oi:extend/text()'),
            'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
            'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
            'source': ('textList', 'oai_oi:oi/oi:source/text()'),
            'language': ('textList', 'oai_oi:oi/oi:language/text()'),
            'references': ('textList', 'oai_oi:oi/oi:references/text()'),
            'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
            'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'),
            'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
            'license': ('textList', 'oai_oi:oi/oi:license/text()'),
            # Not included in the data:
            #'rights': ('textList', 'oai_oi:oi/oi:rights/text()'),
            #'relation': ('textList', 'oai_oi:oi/oi:relation/text()'),
            #'coverage': ('textList', 'oai_oi:oi/oi:coverage/text()'),
            #'format': ('textList', 'oai_oi:oi/oi:format/text()'),
        },
        namespaces={
            'oi': 'http://www.openbeelden.nl/oai/',
            'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms',
        })

    url = u'http://www.openbeelden.nl/feeds/oai/'

    registry = MetadataRegistry()
    registry.registerReader('oai_oi', oai_oi_reader)
    client = Client(url, registry)

    for record in client.listRecords(metadataPrefix='oai_oi'):
        processItem(record)

def iter_items(self, partition):
    """ Partition is an OAI-PMH endpoint """
    # source = "oai:%s" % partition
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(partition, registry)
    for record in client.listRecords(metadataPrefix='oai_dc'):
        header, metadata, _ = record
        if header.isDeleted():
            continue
        # _id = header.identifier()
        # date = header.datestamp()
        meta = metadata.getMap()
        # TODO: there is much validation and heuristics to be done here!
        # format0 = (meta.get("format") or [None])[0]
        # if not format0:
        #     continue
        # if format0 not in ("application/pdf", ):
        #     continue
        url0 = (meta.get("identifier") or [None])[0]
        if not url0:
            continue
        title0 = (meta.get("title") or [""])[0].encode("utf-8")
        desc0 = (meta.get("description") or [""])[0].encode("utf-8")
        # TODO: validate that url0 is not on another domain?!
        yield url0, {}, "html", 2, """
            <html><head><title>%s</title></head><body>%s</body></html>
            """ % (title0, desc0)

def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl))
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        # Unit test hotfix
        header, metadata, about = record
        # Fix pyoai returning a "b'...'" string for py3k
        if isinstance(metadata, str) and metadata.startswith("b'"):
            metadata = ast.literal_eval(metadata).decode("utf-8")
        yield (header, metadata, about)

def acquire_and_publish_documents(oai_url, publish_url, reader, prefix, pwd):
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(oai_url, registry)
    documents = []
    count = 0
    for record in client.listRecords(metadataPrefix=prefix):
        header = record[0]
        metadata = record[1]
        rawMetadata = urllib2.urlopen(
            "{0}?verb=GetRecord&metadataPrefix={1}&identifier={2}".format(
                oai_url, prefix, header.identifier())).read()

        # re-format Jorum id
        identifier = header.identifier()
        identifier = identifier.replace("oai:dspace.jorum.ac.uk:", "")
        uri = "http://dspace.jorum.ac.uk/xmlui/handle/" + identifier
        print(uri)

        # create keys from dc.subject terms
        fo = StringIO.StringIO(rawMetadata)
        tree = parse(fo)  # can only parse files or file objects
        keys = []
        for elem in tree.getiterator():
            # print("tag " + str(elem.tag))
            # print("text " + str(elem.text))
            if elem.tag == "{http://purl.org/dc/elements/1.1/}subject":
                keys.append(elem.text)
        fo.close()
        print(keys)
        print("\n")

        value = convert_to_envelope(metadata, rawMetadata, uri, keys)
        # print (value)
        # print(dir(header))
        if value != None:
            documents.append(value)
            count += 1
        if (count % 10 == 0) or (count == 3):
            publish_documents(publish_url, documents, pwd)
            documents = []
    publish_documents(publish_url, documents, pwd)

#try this and see if it works; if it does resumption tokens right, this should work fine.
chunk = timedelta(days=1)
oneday = timedelta(days=1)

#TODO: clearly they don't do this whole "ordered" thing. Grab records by month or year or something instead of all at once.
#TODO: luckily, once we've done a full slurp, we only need to remember when the last full slurp was and start since then.
#      But if interrupted, we need to start back from where the last *full* slurp was, due to the ordering problem.
#TODO: structure this better, with the try effectively moved much further above. Really, move a lot more into functions.
try:
    current = start
    #TODO: make a nice little generator so I can use a for loop
    while current <= datetime.now():
        print >>sys.stderr, "fetching records @", now(), "starting with", current.strftime('%Y-%m-%d')
        try:
            records = client.listRecords(metadataPrefix='oai_dc', from_=current, until=(current + chunk))
        except NoRecordsMatchError:
            print >>sys.stderr, "no records for this chunk, continuing to next"
            current += chunk
            store.write_last(current)
            continue
        print >>sys.stderr, "record fetch finished @", now()
        for index, (header, metadata, _) in enumerate(records, start=1):
            store.write_record(header, metadata)
            if index == 1 or index % 1000 == 0:
                print >>sys.stderr, "  wrote record", index, "of", header.datestamp().strftime('%Y-%m-%d'), "with id", header.identifier()
        current += chunk
        store.write_last(current)
finally:
    print >>sys.stderr, "closing store"
    store.close()

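The TODO above asks for "a nice little generator so I can use a for loop". A minimal sketch of what that could look like, assuming the same pyoai client, NoRecordsMatchError import, and datetime/timedelta imports as the loop above; the storage calls stay in the caller:

# Hypothetical helper, not from the original script: yields (header, metadata) pairs
# one day-sized chunk at a time, silently skipping chunks with no matching records.
def iter_chunked_records(client, start, chunk=timedelta(days=1)):
    current = start
    while current <= datetime.now():
        try:
            records = client.listRecords(metadataPrefix='oai_dc', from_=current, until=current + chunk)
            for header, metadata, _ in records:
                yield header, metadata
        except NoRecordsMatchError:
            pass  # nothing in this chunk, move on to the next one
        current += chunk
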
class NSDLDCImport(object):
    '''
    Class exports the required fields from the UCAR OAI-PMH data repository using NSDL_DC.
    '''

    def __init__(self, url, prefix=nsdl.LR_NSDL_PREFIX, reader=None,
                 fields=None, namespaces=None, fieldMap=None):
        '''
        Constructor
        '''
        if fields == None:
            self._fields = nsdl.LR_NSDL_DC_FIELDS
        else:
            self._fields = fields

        if fieldMap == None:
            self._fieldMap = nsdl.NSDL_TO_LR_MAP
        else:
            self._fieldMap = fieldMap

        if namespaces == None:
            self._namespaces = nsdl.LR_NSDL_DC_NAMESPACES
        else:
            self._namespaces = namespaces

        if reader == None:
            reader = MetadataReader(fields=self._fields, namespaces=self._namespaces)

        self._url = url
        self._registry = MetadataRegistry()
        self._prefix = prefix
        self._registry.registerReader(prefix, reader)
        self._client = Client(url, self._registry)

    def _format(self, doc):
        value = {}
        # merge all the fields
        for (fieldname, fieldconfig) in self._fieldMap.items():
            if fieldconfig["type"] == "const" and "const" in fieldconfig:
                value[fieldname] = fieldconfig["const"]
            elif fieldconfig["type"] == "[string]" and len(fieldconfig["fields"]) > 0:
                value[fieldname] = []
                for field in fieldconfig["fields"]:
                    value[fieldname].extend(doc.getField(field))
            elif fieldconfig["type"] == "string" and len(fieldconfig["fields"]) > 0:
                value[fieldname] = ""
                for field in fieldconfig["fields"]:
                    value[fieldname] += ", ".join(doc.getField(field))
            elif fieldconfig["type"] == "boolean" and len(fieldconfig["fields"]) > 0:
                value[fieldname] = True
                for field in fieldconfig["fields"]:
                    value[fieldname] &= doc.getField(field)
        return value

    def fetch_documents(self, range=10000):
        return_stuff = []
        for record in self._client.listRecords(metadataPrefix=self._prefix):
            r = record[1]
            value = self._format(r)
            if value != None:
                return_stuff.append(value)
            if len(return_stuff) >= range:
                yield return_stuff
                return_stuff = []

if os.path.exists(LANG_CACHE_FILE):
    lcf = codecs.open(LANG_CACHE_FILE, 'r', 'utf-8')
    for line in lcf:
        lang, text = line.rstrip("\r\n").split("\t")
        if lang == '':
            lang = None
        lang_cache[text] = lang
    lcf.close()

label_to_uri = {}

# pass 1: convert MARC data to basic RDF
oai = Client('https://fennica.linneanet.fi/cgi-bin/oai-pmh-fennica-asteri-aut.cgi', registry)
#recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames', from_=datetime(2019,05,15))
recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames')
for oaipmhrec in recs:
    convert_record(oaipmhrec)

recs = oai.listRecords(metadataPrefix='marc21', set='meetingNames')
for oaipmhrec in recs:
    convert_record(oaipmhrec)

# pass 2: convert literal values to resources
for prop in (relatedCorporateBody, predecessor, successor, hierarchicalSuperior):
    for s, o in g.subject_objects(prop):
        if isinstance(o, Literal):
            g.remove((s, prop, o))  # remove original
            res = label_to_uri.get(u"%s" % o, None)
            if res is None:

    SETSPEC = sys.argv[3]
else:
    SETSPEC = None

registry = MetadataRegistry()
registry.registerReader('mods', mods_reader)
#registry.registerReader('didl', didl_reader)
#registry.registerReader('oac_dc', oai_dc_reader)
client = Client(URL, registry)

record_count = 0
deleted_count = 0

if SETSPEC:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX, set=SETSPEC)
else:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX)

for num, record in enumerate(records):
    record_count += 1
    delinfo = ''
    if record[0].isDeleted():
        deleted_count += 1
        delinfo = '(deleted)'
    print '%0.6d %s %s' % (num, record[0].identifier(), delinfo)
    if record[1] is not None:
        # metadata = client.getMetadata(metadataPrefix='mods', identifier=record[0].identifier())
        # print type(metadata), metadata.tag
        print "MAP: ", record[1].getMap()
        # print '   %s' % ';'.join(record[0].setSpec())

class Repository(object):
    """
    Repository handles interaction with the various interfaces provided by
    the dspace repository.
    """

    def __init__(self, url=None, **kwargs):
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            warn(
                'The url parameter will not be supported in version 3, '
                'use base_url and oai_path instead',
                DeprecationWarning)
            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if not 'metadata_registry' in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets', dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}
            for key in kwargs.keys():
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)
            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client('/'.join((self.base_url, self.oai_path,)), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """ From a given URL, extract the OAI identifier base (hostname) """
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """ Determine the OAI set from a collection handle """
        if not isinstance(handle, basestring):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """ Get the configured name of the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """ Get a list of the collections in the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return map(lambda c: c[0:2], self.oai.listSets())

    def getItemHandles(self, collection=None, **kw):
        """ Get item handles from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        for item in self.getItemIdentifiers(collection=collection, **kw):
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """ Get item identifiers from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """ Get full items from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """ Get a single item from the OAI-PMH interface either by handle or identifier """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')
        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')
        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')
            identifier = 'oai:%s:%s' % (self.identifier_base, handle,)
        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        return 'oai:%s:%s' % (self._extractIdentifierBase(self.base_url), handle)

    def getSwordCollections(self):
        pass

    def getSwordCollection(self, args):
        pass

#!/usr/bin/env python
# uses http://www.infrae.com/download/OAI/pyoai

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

URL = 'http://researchcommons.waikato.ac.nz/dspace-oai/request'

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)

for record in client.listRecords(metadataPrefix='oai_dc'):
    print record[0].identifier()

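The same minimal client can be restricted to a harvesting window with the from_ and until keyword arguments, as several of the larger examples here do. A sketch reusing the client above; the date range is purely illustrative:

# Selective harvesting with the minimal setup above (illustrative date window).
from datetime import datetime

for record in client.listRecords(metadataPrefix='oai_dc',
                                 from_=datetime(2015, 1, 1),
                                 until=datetime(2015, 12, 31)):
    print record[0].identifier()
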
class ZoraAPI:
    METADATA_PREFIX = 'oai_dc'

    # In the constructor, we register to the ZORA API and initialize the necessary class variables
    def __init__(self, url):
        registry = MetadataRegistry()
        registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
        self.client = Client(url, registry)
        self.institutes = {}
        self.resource_types = []
        self.load_institutes_and_types()

    # Returns the hierarchical dictionary of institutes
    def get_institutes(self):
        return self.institutes

    # Returns the list of resource types
    def get_resource_types(self):
        return self.resource_types

    # Loads all institutes and resource types. The institutes also get parsed into a hierarchical dictionary.
    def load_institutes_and_types(self):
        institutes_list = []
        resource_type_list = []
        for item in self.client.listSets():
            split = item[1].split(' = ')
            if len(split) != 2:
                continue
            set_type, set_value = split
            if set_type == 'Subjects':
                institutes_list.append(set_value)
            elif set_type == 'Type':
                resource_type_list.append(set_value)
        institutes_dict = self.parse_institutes(institutes_list)
        self.institutes = institutes_dict
        self.resource_types = resource_type_list

    # Parses a list of institutes into a hierarchical dictionary
    @staticmethod
    def parse_institutes(institute_list_raw):
        institutes_dict = {}
        for institute_raw in institute_list_raw:
            institutes = institute_raw.split(': ')
            parent = institutes_dict
            for institute in institutes:
                if parent.get(institute) is None:
                    parent[institute] = {}
                parent = parent[institute]
        return institutes_dict

    # Get all metadata dictionaries from ZORA
    def get_metadata_dicts(self, from_):
        record_list = self.get_records(from_)
        metadata_dict_list = self.parse_records(record_list)
        return metadata_dict_list

    # Gets one specific paper from the ZORA repository and returns the record of it
    def get_record(self, uid):
        record = self.client.getRecord(identifier=uid, metadataPrefix=ZoraAPI.METADATA_PREFIX)
        return record

    # Gets the papers from the ZORA repository and returns their records in form of a list
    def get_records(self, from_):
        args = {'metadataPrefix': ZoraAPI.METADATA_PREFIX}

        # Add the from_ argument if it is defined (this is used to get only the most recent papers/changes)
        if from_:
            args['from_'] = from_

        # Get the relevant papers from ZORA and parse them
        record_list = []
        try:
            print('Loading records from ZORA API...')
            record_iterator = self.client.listRecords(**args)
            record_list = []
            count = 0
            for record in record_iterator:
                record_list.append(record)
                count += 1
                if is_debug() and count % 1000 == 0:
                    print(str(count))
            print(count)
            print('Done')
        except NoRecordsMatchError:
            print('No records were found')
        except RemoteDisconnected as error:
            print(error)
        except Exception as error:
            print(error)
        finally:
            return record_list

    # This method parses a list of records from ZORA into an easier-to-use metadata dictionary.
    def parse_records(self, record_list):
        metadata_dict_list = []
        print('Parsing records...')
        for record in record_list:
            metadata_dict = self.parse_record(record)
            if metadata_dict:
                metadata_dict_list.append(metadata_dict)
        print('Done')
        return metadata_dict_list

    # This function parses a record into a dictionary with a structure similar to the Paper database object.
    # To do so, it turns some unnecessary lists into single values and parses the 'subject' field into 'ddcs'
    # (dewey decimal classifications), 'keywords' and 'institutes'.
    #
    # NOTE: It is not possible to parse the 'subject' field properly since we lack the ability to distinguish between
    # keywords and institutes (some institutes contain commas --> they will get recognized as lists of keywords).
    @staticmethod
    def parse_record(record):
        metadata_dict = {}
        metadata_dict['uid'] = record[0].identifier()

        # If there is no metadata, we assume that the paper has been deleted and store that information in the dict
        if not record[1]:
            metadata_dict['deleted'] = True
            return metadata_dict

        # If there is metadata available, we parse it into a convenient form
        metadata_dict = {**metadata_dict, **dict(record[1].getMap())}
        metadata_dict['title'] = metadata_dict['title'][0] if 'title' in metadata_dict and len(metadata_dict['title']) > 0 else None
        metadata_dict['creators'] = metadata_dict.pop('creator') if 'creator' in metadata_dict else []

        # If the field 'subject' starts with three digits, it is a ddc (dewey decimal classification). If it contains a
        # comma-separated list, it is a list of keywords. Otherwise it is an institute.
        #
        # NOTE: There are some dewey decimal classifications that contain commas, therefore we check for the three
        # digits before we look for comma-separated lists. Some institutes contain commas as well. This leads to some
        # institutes getting recognized as a list of keywords. With the information available this problem
        # unfortunately cannot be solved properly.
        institute_list = []
        ddc_list = []
        keyword_list = []
        if 'subject' in metadata_dict:
            for item in metadata_dict['subject']:
                # If the subject starts with three digits and a space, we assume it's a dewey decimal classification
                regex = re.compile('^\d\d\d\s+\w')
                if regex.match(item):
                    ddc_list.append(item)
                # If the subject has the same name as an institute, we assume it is an institute
                elif db.session.query(Institute).filter(Institute.name == item).first():
                    institute_list.append(item)
                # If it is none of the above, we assume that it is a comma-separated list of keywords
                else:
                    for keyword in item.split(','):
                        keyword_list.append(keyword)
        metadata_dict['institutes'] = institute_list
        metadata_dict['ddcs'] = ddc_list
        metadata_dict['keywords'] = keyword_list

        metadata_dict['description'] = metadata_dict['description'][0] if 'description' in metadata_dict and len(metadata_dict['description']) > 0 else None
        metadata_dict['publisher'] = metadata_dict['publisher'][0] if 'publisher' in metadata_dict and len(metadata_dict['publisher']) > 0 else None
        metadata_dict['date'] = metadata_dict['date'][0] if 'date' in metadata_dict and len(metadata_dict['date']) > 0 else None

        # We filter the 'type' field and only store the paper type
        type_list = metadata_dict.pop('type') if 'type' in metadata_dict else []
        resource_type_list = []
        for resource_type in type_list:
            if db.session.query(ResourceType).filter(ResourceType.name == resource_type).first():
                resource_type_list.append(resource_type)
        metadata_dict['resource_types'] = resource_type_list

        metadata_dict['language'] = metadata_dict['language'][0] if 'language' in metadata_dict and len(metadata_dict['language']) > 0 else None
        metadata_dict['relation'] = metadata_dict['relation'][0] if 'relation' in metadata_dict and len(metadata_dict['relation']) > 0 else None

        return metadata_dict

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from urllib2 import Request, urlopen
from StringIO import StringIO

# url = 'http://export.arxiv.org/api/query?search_query=all:cs&start=0&max_results=10'
# data = urllib.urlopen(url).read()
# urlpdf = 'http://arxiv.org/pdf/1510.02262v1.pdf'
# remoteFile = urlopen(Request(urlpdf)).read()
# memoryFile = StringIO(remoteFile)
# pdfFile = pyPdf.PdfFileReader(memoryFile)
# data = pyPdf.PdfFileReader(file('http://arxiv.org/pdf/1510.02262v1.pdf','r'))
# singles = [stemmer.stem(plural) for plural in plurals]

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
URL = "http://export.arxiv.org/oai2"
registry.registerReader("oai_dc", oai_dc_reader)
clt = Client(URL, registry)

ic = 0
for record in clt.listRecords(metadataPrefix="oai_dc"):
    if ic > 10:
        break
    print record[1]["title"][0]
    record[1]["identifier"][0]  # arxiv_id link
    ic += 1

class OaiPaperSource(PaperSource):  # TODO: this should not inherit from PaperSource
    """
    A paper source that fetches records from the OAI-PMH proxy
    (typically: proaixy).

    It uses the ListRecord verb to fetch records from the OAI-PMH
    source. Each record is then converted to a :class:`BarePaper`
    by an :class:`OaiTranslator` that handles the format the
    metadata is served in.
    """

    def __init__(self, endpoint, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param endpoint: the address of the OAI-PMH endpoint to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.registry.registerReader('citeproc', citeproc_reader)
        self.client = Client(endpoint, self.registry)
        self.client._day_granularity = day_granularity
        if settings.PROAIXY_API_KEY:
            self.client.extra_parameters = {
                'key': settings.PROAIXY_API_KEY}
        self.translators = {}

    # Translator management

    def add_translator(self, translator):
        """
        Adds the given translator to the paper source,
        so that we know how to translate papers in the given format.

        The paper source cannot hold more than one translator
        per OAI format (it decides what translator to use
        solely based on the format) so if there is already a translator
        for that format, it will be overridden.
        """
        self.translators[translator.format()] = translator

    # Record ingestion

    def ingest(self, from_date=None, metadataPrefix='any', resumptionToken=None):
        """
        Main method to fill Dissemin with papers!

        :param from_date: only fetch papers modified after that date in
            the proxy (useful for incremental fetching)
        :param metadataPrefix: restrict the ingest for this metadata format
        """
        args = {'metadataPrefix': metadataPrefix}
        if from_date:
            args['from_'] = from_date
        if resumptionToken:
            args['resumptionToken'] = resumptionToken
        records = self.client.listRecords(**args)
        self.process_records(records)

    def create_paper_by_identifier(self, identifier, metadataPrefix):
        """
        Queries the OAI-PMH proxy for a single paper.

        :param identifier: the OAI identifier to fetch
        :param metadataPrefix: the format to use (a translator
            has to be registered for that format, otherwise
            we return None with a warning message)
        :returns: a Paper or None
        """
        record = self.client.getRecord(
            metadataPrefix=metadataPrefix,
            identifier=identifier)
        return self.process_record(record[0], record[1]._map)

    # Record search utilities

    def listRecords_or_empty(self, source, *args, **kwargs):
        """
        pyoai raises :class:`NoRecordsMatchError` when no records match,
        we would rather like to get an empty list in that case.
        """
        try:
            return source.listRecords(*args, **kwargs)
        except NoRecordsMatchError:
            return []

    def process_record(self, header, metadata):
        """
        Saves the record given by the header and metadata (as returned by
        pyoai) into a Paper, or None if anything failed.
        """
        translator = self.translators.get(header.format())
        if translator is None:
            print("Warning: unknown metadata format %s, skipping" % header.format())
            return

        paper = translator.translate(header, metadata)
        if paper is not None:
            try:
                with transaction.atomic():
                    saved = Paper.from_bare(paper)
                return saved
            except ValueError as e:
                print "Ignoring invalid paper:"
                print header.identifier()
                print e

    def process_records(self, listRecords):
        """
        Save as :class:`Paper` all the records contained in this list
        """
        # check that we have at least one translator, otherwise
        # it's not really worth trying…
        if not self.translators:
            raise ValueError("No OAI translators have been set up: " +
                             "We cannot save any record.")

        last_report = datetime.now()
        processed_since_report = 0

        for record in listRecords:
            header = record[0]
            metadata = record[1]._map

            self.process_record(header, metadata)

            # rate reporting
            processed_since_report += 1
            if processed_since_report >= 1000:
                td = datetime.now() - last_report
                rate = 'infty'
                if td.seconds:
                    rate = unicode(processed_since_report / td.seconds)
                print("current rate: %s records/s" % rate)
                processed_since_report = 0
                last_report = datetime.now()

#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:
#
# Author:      dd
#
# Created:     06/05/2014
# Copyright:   (c) dd 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

URL = 'http://www.pubmedcentral.nih.gov/oai/oai.cgi'
bla = "set=pmc-open"

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)

for record in client.listRecords(metadataPrefix='oai_dc', set='pmc-open'):
    print(record)

registry = MetadataRegistry()
registry.registerReader('marc21', MarcXML)
client = Client(URL, registry)

start = valid_date(start_date)
stop = valid_date(stop_date)

# main
while start < stop:
    from_date = start
    start = start + timedelta(days=1)  # increase days one by one
    until_date = start
    try:
        records = client.listRecords(metadataPrefix='marc21', set='SKC', from_=from_date, until=until_date)
        saverecords(records)
    except:
        pass  # skipping deleted entries

print('Done.')

#%% processing mrc to df

mrc_to_mrk('C:/Users/User/Desktop/nkp_nkc_2021-04-07.marc', 'C:/Users/User/Desktop/nkp_nkc_2021-04-07.mrk')

fiction_types = ['1', 'd', 'f', 'h', 'j', 'p', 'u', '|', '\\']
filter_fiction_type = get_bool('Filter with a fiction type? ')

else:
    SETSPEC = None

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
registry.registerReader(METADATA_PREFIX, oai_dc_reader)
client = Client(URL, registry)

record_count = 0
deleted_count = 0

if SETSPEC:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX, set=SETSPEC)
else:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX)

for num, record in enumerate(records):
    record_count += 1
    delinfo = ''
    if record[0].isDeleted():
        deleted_count += 1
        delinfo = '(deleted)'
    print '%0.6d %s %s' % (num, record[0].identifier(), delinfo)
    print '   %s' % ';'.join(record[0].setSpec())

print 'Harvested %s records, of which %s were deleted' % (record_count, deleted_count)

#aka oaijson
import sys

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
import simplejson as json
import couchdb

server = couchdb.Server()
db = server['dcat']

URL = 'http://cardinalscholar.bsu.edu/cgi/oai2'

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)
records = client.listRecords(metadataPrefix='oai_dc')

i = 0
for hdr, metadata, _ in records:
    i = i + 1
    print hdr.identifier()
    print hdr.datestamp()
    map = metadata.getMap()
    map.update({'cdmcollection': 'cardinalscholar'})
    db.save(map)
    print 'saved ' + str(i)

def get_names(dataname):
    record_prefix = "rdf:RDF/edm:ProvidedCHO"
    edm_reader = MetadataReader(
        fields={
            'objectId': ('textList', record_prefix + '/@rdf:about'),
            'spatial': ('textList', record_prefix + '/dcterms:spatial/text()'),
        },
        namespaces={
            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms/',
            'dct': 'http://purl.org/dc/terms/',
            'edm': 'http://www.europeana.eu/schemas/edm/',
            'foaf': 'http://xmlns.com/foaf/0.1/',
            'owl': 'http://www.w3.org/2002/07/owl#',
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
            'skos': 'http://www.w3.org/2004/02/skos/core#',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'ore': 'http://www.openarchives.org/ore/terms/'
        }
    )

    dictnames = {}
    identifier = []

    if __name__ == "__main__":
        URL = 'https://data.jhn.ngo/oai'
        registry = MetadataRegistry()
        registry.registerReader('edm', edm_reader)
        client = Client(URL, registry)

        k = 0
        for record in client.listRecords(metadataPrefix='edm', set=dataname):
            output = record[1].getMap()
            k = k + 1
            print(k)
            if output['spatial'] != []:
                if len(output['spatial']) == 1:
                    if len(output['spatial'][0]) > 3:
                        if [output['spatial'][0], output['objectId'][0]] not in identifier:
                            identifier.append([output['spatial'][0], output['objectId'][0]])
                            if output['spatial'][0] not in dictnames.keys():
                                key = output['spatial'][0]
                                dictnames.setdefault(key, [])
                                dictnames[key].append(output['objectId'][0])
                            else:
                                key = output['spatial'][0]
                                dictnames[key].append(output['objectId'][0])
                else:
                    for j in range(0, len(output['spatial'])):
                        if len(output['spatial'][j]) > 3:
                            if [output['spatial'][j], output['objectId'][0]] not in identifier:
                                identifier.append([output['spatial'][j], output['objectId'][0]])
                                if output['spatial'][j] not in dictnames.keys():
                                    key = output['spatial'][j]
                                    dictnames.setdefault(key, [])
                                    dictnames[key].append(output['objectId'][0])
                                else:
                                    key = output['spatial'][j]
                                    dictnames[key].append(output['objectId'][0])
        #print (identifier)
    return dictnames

def main():
    #RDF graph initialization
    g = Graph()
    g.bind("dc", "http://purl.org/dc/elements/1.1/")
    g.bind("bibo", "http://purl.org/ontology/bibo/")
    g.bind("foaf", "http://xmlns.com/foaf/0.1/")
    g.bind("owl", "http://www.w3.org/2002/07/owl#")

    #OAI2 access initialization
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(HEDATUZ_URL, registry)

    creator_dict = {}
    creator_id_count = 1

    #print dir(client.listRecords)
    #Iterate over each record in hedatuz database
    for record in client.listRecords(metadataPrefix='oai_dc'):
        for item in record:
            if type(item) == Metadata:
                item_dict = dict(item.getMap())
                ##print item_dict
                record_creator_list = []
                creator_list = item_dict['creator']

                #Get record identifier
                record_id_url = urlparse(item_dict['identifier'][0])
                record_id = record_id_url.path.replace('/', '')

                #Iterate over each creator of the current record
                for creator in creator_list:
                    creator_orig = creator
                    if creator_orig not in creator_dict.keys():
                        creator = creator.replace(' ', '%20')
                        creator_params = urllib.urlencode({'query': creator.encode('utf-8')})
                        req = urllib2.Request('http://viaf.org/viaf/AutoSuggest?' + creator_params)
                        f = urllib2.urlopen(req)
                        try:
                            json_item = simplejson.load(f, strict=False)
                        except Exception as e:
                            print e
                            break

                        #Generate creator id
                        #id_len = len(str(creator_id_count))
                        #digits = CREATOR_ID_DIGITS - id_len
                        #id_formatter = '%0' + str(digits) + 'd'
                        creator_id = creator_id_count
                        creator_id_count = creator_id_count + 1

                        #Get results from VIAF (if any)
                        if json_item['result']:
                            viaf_id = json_item['result'][0]['viafid']
                            #Create new Creator instance
                            creator = Creator(creator_orig, creator_id, viaf_id)
                        else:
                            #Create new Creator instance
                            creator = Creator(creator_orig, creator_id)
                        creator_dict[creator_orig] = creator
                        record_creator_list.append(creator)
                    else:
                        record_creator_list.append(creator_dict[creator_orig])

                item_dict['creator'] = record_creator_list
                item_type_list = item_dict['type']
                if type(item_type_list) == list:
                    for item_type in item_type_list:
                        if item_type.encode('utf-8') == 'Artículo':
                            #print 'Articulo'
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Article'))
                        elif item_type.encode('utf-8') == 'Sección de Libro':
                            #print 'Seccion'
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/BookSection'))
                        elif item_type == u'Libro':
                            #print 'Libro'
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Book'))
                        elif item_type == u'PeerReviewed':
                            #print 'Peer'
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://purl.org/ontology/bibo/DocumentStatus', u'http://purl.org/ontology/bibo/status/peerReviewed'))
                        elif item_type.encode('utf-8') == 'Monografía':
                            g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Document'))
                else:
                    item_type = item_dict['type']
                    if item_type.encode('utf-8') == 'Artículo':
                        #print 'Articulo'
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Article'))
                    elif item_type.encode('utf-8') == 'Sección de Libro':
                        #print 'Seccion'
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/BookSection'))
                    elif item_type == u'Libro':
                        #print 'Libro'
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Book'))
                    elif item_type == u'PeerReviewed':
                        #print 'Peer'
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://purl.org/ontology/bibo/DocumentStatus', u'http://purl.org/ontology/bibo/status/peerReviewed'))
                    elif item_type.encode('utf-8') == 'Monografía':
                        g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://purl.org/ontology/bibo/Document'))

                for key in item_dict:
                    obj = item_dict[key]
                    if type(obj) == list:
                        for creator_item in obj:
                            if key == 'creator':
                                g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://purl.org/dc/elements/1.1/creator', RDF_DOMAIN + u'resource/author/' + str(creator_item.id)))
                            else:
                                g.add((RDF_DOMAIN + u'resource/biblio/' + record_id, u'http://purl.org/dc/elements/1.1/' + key, Literal(creator_item)))

    for key in creator_dict.keys():
        creator = creator_dict[key]
        g.add((RDF_DOMAIN + u'resource/author/' + str(creator.id), u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', u'http://xmlns.com/foaf/0.1/Person'))
        g.add((RDF_DOMAIN + u'resource/author/' + str(creator.id), u'http://xmlns.com/foaf/0.1/name', Literal(creator.name)))
        if creator.viaf_id != None:
            g.add((RDF_DOMAIN + u'resource/author/' + str(creator.id), u'http://www.w3.org/2002/07/owl#sameAs', VIAF_URL + creator.viaf_id))

    print len(g)
    #for s, p, o in g:
    ##print s, p, o

    f = open('hedatuz.rdf', 'w')
    f.write(g.serialize(format='pretty-xml'))
    g.close()
    f.close()

"""Put record in ElasticSearch""" es.index(index="hhs", doc_type="oai", id=record['id'], body={ "title": record['title'], "url": getUrl(record['url']), "genre": record['genre'], "name": _getNames(record['name']), "language": record['language'], "topics": record['topics'], "abstract": record['abstract'], "date": datestamp, }) for record in client.listRecords(metadataPrefix='mods'): #print record if record[1] is not None: datestamp = record[0].datestamp() record = record[1].getMap() print datestamp, record #print {record['title']}, {record['url'][1]}, record['genre'], ', '.join(record['name']), record['language'], ', '.join(record['topics']), record['abstract'] doc_url = getUrl(record['url']) if doc_url is not None: getThumb(doc_url) #esIndex(record, datestamp) #raw_input("Press Enter to continue...")
# Connect to the OAI-PMH provider
registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(url_provider, registry)
print("Connection established")

sets = client.listSets()  # list the sets
print("Sets found")
for setSpec, setName, setDescription in sets:  # iterate over each of the provider's sets
    try:
        records = client.listRecords(
            metadataPrefix='oai_dc', set=setSpec)  # list the records
        print("Collecting data from set {}, provider {} \n".format(setName, provider_name))
        count = 1
        for record in records:  # iterate over the records
            header, metadata, about = record
            if metadata:
                # getMap returns a dictionary with all metadata fields
                doc = metadata.getMap()
                # build an index-safe _id from the OAI identifier
                # (the argument to re.sub was cut off in the original; header.identifier() is the most likely value)
                doc['_id'] = re.sub('[:/.]', '-', header.identifier())
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
import json

URL = 'http://oai.narcis.nl/oai'
#?verb=GetRecord&metadataPrefix=oai_dc&identifier='

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)

for header, record, other in client.listRecords(metadataPrefix='oai_dc'):
    if not record:
        continue
    datarecord = record.getMap()
    datarecord['id'] = header.identifier()
    datarecord['datestamp'] = str(header.datestamp())
    print(json.dumps(datarecord))
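# A full ListRecords crawl of a large endpoint such as NARCIS can take hours.
# If only a sample is needed, the lazy iterator returned by pyoai can be
# truncated with itertools.islice; a minimal sketch re-using the URL, registry
# and client defined above (the limit of 100 is an arbitrary choice):
from itertools import islice

for header, record, other in islice(client.listRecords(metadataPrefix='oai_dc'), 100):
    if not record:
        continue
    print(json.dumps({'id': header.identifier(), 'datestamp': str(header.datestamp())}))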
def retrieval(self, repository): self.logger.info(u'Trying to retrieve url {0}'.format(repository[1]).encode(ENCODE)) registry = MetadataRegistry() registry.registerReader(METADATA, oai_dc_reader) try: client = Client(repository[1], registry) self.logger.info(SEPARATOR) self.logger.info(u'Connection established successfully...') # identify info identify = client.identify() repository_name = identify.repositoryName() repository_name_normalized = re.sub(re.compile(FILE_ESCAPE_CHARS), '', repository_name).strip() \ .replace(' ', '_').lower() base_url = identify.baseURL().encode(ENCODE) protocol_version = identify.protocolVersion().encode(ENCODE) granularity = identify.granularity().encode(ENCODE) compression = identify.compression() deleted_record = identify.deletedRecord().encode(ENCODE) metadata = {'repository_name': repository_name, 'base_url': base_url, 'latest_url': repository[1], 'protocol_version': protocol_version, 'granularity': granularity, 'compression': str(compression).strip('[]'), 'deleted_record': deleted_record} self.logger.info(u'Repository name: {0}'.format(repository_name)) self.logger.info(u'URL connected: {0}'.format(repository[1])) self.logger.info(u'Base URL: {0}'.format(base_url)) self.logger.info(u'Protocol version: {0}'.format(protocol_version)) self.logger.info(u'Granularity: {0}'.format(granularity)) self.logger.info(u'Compression: {0}'.format(compression)) self.logger.info(u'Deleted record: {0}'.format(deleted_record)) records_count = 0 deleted_count = 0 records_list = list() parsed_records_list = list() # we're not interested in all sets, so we must iterate over the ones we have and want to crawl if repository[2] is not None: self.logger.info(u'Fetching set {0}...'.format(repository[2])) records_list = client.listRecords(metadataPrefix=METADATA, set=repository[2]) else: records_list = client.listRecords(metadataPrefix=METADATA) if records_list is not None: for record in records_list: records_count += 1 if record[0].isDeleted(): deleted_count += 1 if record[1] is not None: parsed_records_list.append(tostring(record[1].element())) self.logger.info( u'Retrieved {0} records from set {1} where {2} were deleted'.format(records_count, repository[2], deleted_count)) if not exists(''.join(['files/', repository_name_normalized, '/'])): self.logger.info('Creating storage folder for {0}...'.format(repository_name)) makedirs(''.join(['files/', repository_name_normalized, '/'])) self.logger.info(u'Creating storage files...') meta_file = open(''.join(['files/', repository_name_normalized, '/metadata.xml']), 'w') metadata[repository[2] + '_records_number'] = records_count metadata[repository[2] + '_deleted_number'] = deleted_count meta_file.write(tostring(dict_to_xml('metadata', metadata))) meta_file.close() record_file = open(''.join( ['files/', repository_name_normalized, '/', repository_name_normalized, '_', repository[2], '.xml']), 'w') record_file.write(''.join(parsed_records_list)) record_file.close() except NoRecordsMatchError, nrme: self.logger.error(u'{0} on repository {1}'.format(nrme.message, repository_name)) # add url to unvisited_url and ask retrieval to try to crawl them again if nrme.message == 'No matches for the query': self.unvisited_repository.append(repository)
g.namespace_manager.bind('dc', DC)
g.namespace_manager.bind('dct', DCT)

if len(sys.argv) != 4:
    print >>sys.stderr, "Usage: %s <oai-pmh-provider> <set-name> <namespace-URI>" % sys.argv[0]
    sys.exit(1)

provider, setname, urins = sys.argv[1:]
metans = urins[:-1] + "-meta/"
g.namespace_manager.bind(metans.split('/')[-2], Namespace(metans))

oai = Client(provider, registry)
#recs = oai.listRecords(metadataPrefix='marc21', set=setname, from_=datetime(2014,10,1))
recs = oai.listRecords(metadataPrefix='marc21', set=setname)

LANGMAP = {
    'fin': 'fi',
    'swe': 'sv',
}

# temporary dicts to store label/URI mappings between passes
labelmap = {}     # key: prefLabel, val: URIRef
relationmap = {}  # key: prefLabel, val: [ (property, prefLabel), ... ]

RELMAP = {  # MARC21 control field w value to RDF property + inverse
    'g': (SKOS.broader, SKOS.narrower),
    'h': (SKOS.narrower, SKOS.broader),
    # 'a': (DCT.replaces, DCT.isReplacedBy),
    # 'b': (DCT.isReplacedBy, DCT.replaces),
def oai_metadata(oai_endpoint):
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(oai_endpoint, registry)
    return make_graphs(client.listRecords(metadataPrefix='oai_dc'))
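# Hypothetical usage of oai_metadata(); the endpoint is a placeholder and
# make_graphs() is assumed to be defined elsewhere in the same module:
#
#   graphs = oai_metadata('https://example.org/oai')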
def get_names(dataname): record_prefix = "rdf:RDF/edm:ProvidedCHO" # Modidy/add Xpath mappings to get other fields and other objects (agent, place etc) edm_reader = MetadataReader( fields={ 'title': ('textList', record_prefix + '/dc:title/text()'), 'creator': ('textList', record_prefix + '/dc:creator/text()'), 'subject': ('textList', record_prefix + '/dc:subject/text()'), 'description': ('textList', record_prefix + '/dc:description/text()'), 'publisher': ('textList', record_prefix + '/dc:publisher/text()'), 'contributor': ('textList', record_prefix + '/dc:contributor/text()'), 'date': ('textList', record_prefix + '/dc:date/text()'), 'type': ('textList', record_prefix + '/dc:type/text()'), 'format': ('textList', record_prefix + '/dc:format/text()'), 'identifier': ('textList', record_prefix + '/dc:identifier/text()'), 'source': ('textList', record_prefix + '/dc:source/text()'), 'language': ('textList', record_prefix + '/dc:language/text()'), 'relation': ('textList', record_prefix + '/dc:relation/text()'), 'coverage': ('textList', record_prefix + '/dc:coverage/text()'), 'rights': ('textList', record_prefix + '/dc:rights/text()'), 'spatial': ('textList', record_prefix + '/dc:spatial/text()'), 'objectId': ('textList', record_prefix + '/@rdf:about'), }, namespaces={ 'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/', 'dc': 'http://purl.org/dc/elements/1.1/', 'dcterms': 'http://purl.org/dc/terms/', 'dct': 'http://purl.org/dc/terms/', 'edm': 'http://www.europeana.eu/schemas/edm/', 'foaf': 'http://xmlns.com/foaf/0.1/', 'owl': 'http://www.w3.org/2002/07/owl#', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'skos': 'http://www.w3.org/2004/02/skos/core#', 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', 'ore': 'http://www.openarchives.org/ore/terms/' }) names = [] identifier = [] if __name__ == "__main__": URL = 'https://data.jhn.ngo/oai' registry = MetadataRegistry() registry.registerReader('edm', edm_reader) client = Client(URL, registry) # To harvest specific dataset, use "set" parameter: set='AIUJE1_MARC21' for record in client.listRecords(metadataPrefix='edm', set=dataname): output = record[1].getMap() if output['creator'] != []: names.append([output['creator'][0]]) identifier.append( [output['creator'][0], output['objectId'][0]]) if output['contributor'] != []: names.append([output['contributor'][0]]) identifier.append( [output['contributor'][0], output['objectId'][0]]) print(names) return identifier
def transfer_experiment(source): """ Pull public experiments from source into current mytardis. """ #TODO: Cleanup error messages #TODO: does not transfer liences as not part of METS format. #NOTE: As this is a pull we trust the data from the other tardis # Check identity of the feed from oaipmh.client import Client from oaipmh import error from oaipmh.metadata import MetadataRegistry, oai_dc_reader from django.core.cache import cache from django.utils.hashcompat import md5_constructor as md5 # The cache key consists of the task name and the MD5 digest # of the feed URL. cache_key = md5("token").hexdigest() lock_id = "%s-lock-%s" % ("consume_experiment", cache_key) LOCK_EXPIRE = 60 * 5 # cache.add fails if if the key already exists acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE) # memcache delete is very slow, but we have to use it to take # advantage of using add() for atomic locking release_lock = lambda: cache.delete(lock_id) registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) source_url = "%s/apps/oaipmh/?verb=Identify" % source client = Client(source_url, registry) try: identify = client.identify() except AttributeError as e: msg = "Error reading repos identity: %s:%s" % (source, e) logger.error(msg) raise ReposReadError(msg) except error.ErrorBase as e: msg = "OAIPMH error: %s" % e logger.error(msg) raise OAIPMHError(msg) except URLError as e: logger.error(e) raise repos = identify.baseURL() import urlparse repos_url = urlparse.urlparse(repos) dest_name = "%s://%s" % (repos_url.scheme, repos_url.netloc) if dest_name != source: msg = "Source directory reports incorrect name: %s" % dest_name logger.error(msg) raise BadAccessError(msg) # Get list of public experiments at sources registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client( source + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc", registry) try: exps_metadata = [ meta for (header, meta, extra) in client.listRecords(metadataPrefix='oai_dc') ] except AttributeError as e: msg = "Error reading experiment %s" % e logger.error(msg) raise OAIPMHError(msg) except error.NoRecordsMatchError as e: msg = "no public records found on source %s" % e logger.warn(msg) return local_ids = [] for exp_metadata in exps_metadata: exp_id = exp_metadata.getField('identifier')[0] user = exp_metadata.getField('creator')[0] found_user = _get_or_create_user(source, user) #make sure experiment is publicish try: xmldata = getURL("%s/apps/reposproducer/expstate/%s/" % (source, exp_id)) except HTTPError as e: msg = "cannot get public state of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) try: exp_state = json.loads(xmldata) except ValueError as e: msg = "cannot parse public state of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) if not exp_state in [ Experiment.PUBLIC_ACCESS_FULL, Experiment.PUBLIC_ACCESS_METADATA ]: msg = 'cannot ingest private experiments.' 
% exp_id logger.error(msg) raise BadAccessError(msg) # Get the usernames of isOwner django_user ACLs for the experiment try: xmldata = getURL("%s/apps/reposproducer/acls/%s/" % (source, exp_id)) except HTTPError as e: msg = "Cannot get acl list of experiment %s" % exp_id logger.error(msg) raise ReposReadError(msg) try: acls = json.loads(xmldata) except ValueError as e: msg = "cannot parse acl list of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) owners = [] for acl in acls: if acl['pluginId'] == 'django_user' and acl['isOwner']: user = _get_or_create_user(source, acl['entityId']) owners.append(user.username) else: # FIXME: skips all other types of acl for now pass # Get the METS for the experiment metsxml = "" try: metsxml = getURL("%s/experiment/metsexport/%s/?force_http_urls" % (source, exp_id)) #metsxml = getURL("%s/experiment/metsexport/%s/" #% (source, exp_id)) except HTTPError as e: msg = "cannot get METS for experiment %s" % exp_id logger.error(msg) raise ReposReadError(msg) # load schema and parametername for experiment keys try: key_schema = Schema.objects.get(namespace=settings.KEY_NAMESPACE) except Schema.DoesNotExist as e: msg = "No ExperimentKeyService Schema found" logger.error(msg) raise BadAccessError(msg) try: key_name = ParameterName.objects.get(name=settings.KEY_NAME) except ParameterName.DoesNotExist as e: msg = "No ExperimentKeyService ParameterName found" logger.error(msg) raise BadAccessError(msg) try: xmldata = getURL("%s/apps/reposproducer/key/%s/" % (source, exp_id)) except HTTPError as e: msg = "cannot get key of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) if not xmldata: logger.warn( "Unable to retrieve experiment %s key. Will try again later" % exp_id) return try: key_value = json.loads(xmldata) except ValueError as e: msg = "cannot parse key list of experiment %s" % exp_id logger.error(msg) raise BadAccessError(msg) if not key_value: logger.warn( "Unable to retrieve experiment %s key value. Will try again later" % exp_id) return logger.debug("retrieved key %s from experiment %s" % (key_value, exp_id)) exps = Experiment.objects.all() got_lock = True if not acquire_lock(): logger.warning("another worker has access to consume experiment") return duplicate_exp = 0 for exp in exps: #logger.warn("exp = %s" % exp.id) params = ExperimentParameter.objects.filter( name=key_name, parameterset__schema=key_schema, parameterset__experiment=exp) #logger.warn("params.count() = %s" % params.count()) if params.count() >= 1: key = params[0].string_value if key == key_value: duplicate_exp = exp.id #logger.warn("found duplicate for %s" % duplicate_exp) break if duplicate_exp: logger.warn( "Found duplicate experiment form %s exp %s to exp %s" % (source, exp_id, duplicate_exp)) if got_lock: release_lock() return # TODO: Need someway of updating and existing experiment. Problem is # that copy will have different id from original, so need unique identifier # to allow matching # We have not pulled everything we need from producer and are ready to create # experiment. # Make placeholder experiment and ready metadata e = Experiment( title='Placeholder Title', approved=True, created_by=found_user, public_access=exp_state, locked=False # so experiment can then be altered. 
) e.save() # store the key #eps, was_created = ExperimentParameterSet.objects.\ # get_or_create(experiment=e, schema=key_schema) #if was_created: # logger.warn("was created") #ep, was_created = ExperimentParameter.objects.get_or_create(parameterset=eps, # name=key_name, # string_value=key_value) #if was_created: # logger.warn("was created again") #ep.save() if got_lock: release_lock() local_id = e.id filename = path.join(e.get_or_create_directory(), 'mets_upload.xml') f = open(filename, 'wb+') f.write(metsxml) f.close() # Ingest this experiment META data and isOwner ACLS eid = None try: eid, sync_path = _registerExperimentDocument(filename=filename, created_by=found_user, expid=local_id, owners=owners) logger.info('=== processing experiment %s: DONE' % local_id) except: # FIXME: what errors can mets return? msg = '=== processing experiment %s: FAILED!' \ % local_id logger.error(msg) raise MetsParseError(msg) # FIXME: if METS parse fails then we should go back and delete the placeholder experiment exp = Experiment.objects.get(id=eid) # so that tardis does not copy the data for datafile in exp.get_datafiles(): datafile.stay_remote = True datafile.save() #import nose.tools #nose.tools.set_trace() # FIXME: reverse lookup of URLs seem quite slow. # TODO: put this information into specific metadata schema attached to experiment exp.description += get_audit_message(source, exp_id) exp.save() local_ids.append(local_id) return local_ids
def harvest(metadata_set, dest_folder, log_file, content_type, from_date, until_date): ############################# # ### FILESYSTEM CHECKS ### # ############################# try: if not os.path.isdir(dest_folder): os.makedirs(dest_folder) # Verify write permission inside the folder: except BaseException as e: log.error(str(e)) log.exit("Unable to create destination folder: %s" % dest_folder) try: test_path = os.path.join(dest_folder, '__test_permissions__') os.makedirs(test_path) os.rmdir(test_path) except BaseException as e: log.error(str(e)) log.exit("Unable to use destination folder: %s" % dest_folder) try: log_handle = open(log_file, 'a+') log_handle.close() except BaseException as e: log.error(str(e)) log.exit("Unable to create log_file: %s" % log_file) ################################# # ### OAI-PMH CONFIGURATION ### # ################################# URL = 'https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do' metadata_prefix = 'efg' ################################### # ### OPEN OAI-PMH CONNECTION ### # ################################### registry = MetadataRegistry() registry.registerReader(metadata_prefix, oai_dc_reader) #print ("URL=" + str(URL)) client = Client(URL, registry) #################################### # ### CHECK IF THIS SET EXISTS ### # #################################### set_found = False for s in client.listSets(): if metadata_set == s[0]: set_found = True if not set_found: log.exit("Unable to find this set: %s" % metadata_set) ############################# # ### RETRIEVE METADATA ### # ############################# if from_date is not None: from_date = parse_date(from_date) if from_date is None: log.exit("Unable to convert from date") if until_date is not None: until_date = parse_date(until_date) if until_date is None: log.exit("Unable to convert until date") report_data = { 'downloaded': 0, 'filtered': 0, 'saved': 0, 'saved_files': [], 'missing_sourceid': [], 'wrong_content_type': [] } timestamp = int(1000 * time.time()) log.info("Retrieving records for %s..." % metadata_set) try: records = client.listRecords( metadataPrefix=metadata_prefix, set=metadata_set, from_=from_date, until=until_date) except NoRecordsMatchError as e: log.exit(e) log.info("Records retrieved, extracting...") try: for record in records: element = record[1].element() # Obtained eTree is based on namespaced XML # Read: 19.7.1.6. Parsing XML with Namespaces # https://docs.python.org/2/library/xml.etree.elementtree.html # find(match) # Finds the first subelement matching match. # match may be a tag name or path. # Returns an element instance or None. # findall(match) # Finds all matching subelements, by tag name or path. # Returns a list containing all matching elements # in document order. 
report_data['downloaded'] += 1 if report_data['downloaded'] % 100 == 0: print('.', end='', flush=True) if report_data['downloaded'] % 5000 == 0: print( ' %s downloaded - %s saved' % ( report_data['downloaded'], report_data['saved'] ), flush=True) efgEntity = element.find(tag("efgEntity")) if efgEntity is None: # log.warning("efgEntity not found, skipping record") continue avcreation = efgEntity.find(tag("avcreation")) nonavcreation = efgEntity.find(tag("nonavcreation")) if avcreation is not None: manifestation = avcreation.find(tag("avManifestation")) recordSource = avcreation.find(tag("recordSource")) keywords = avcreation.findall(tag("keywords")) title_el = avcreation.find(tag("identifyingTitle")) title = (title_el.text if title_el is not None else "Unknown title") elif nonavcreation is not None: manifestation = nonavcreation.find(tag("nonAVManifestation")) recordSource = nonavcreation.find(tag("recordSource")) keywords = nonavcreation.findall(tag("keywords")) title_el = nonavcreation.find(tag("title")) title = (title_el.find(tag("text")).text if title_el is not None else "Unknown title") else: title = "Unknown title" # log.warning("(non)avcreation not found, skipping record") continue filter_keyword = "IMediaCities" is_good = False for keyword in keywords: term = keyword.find(tag("term")) if term.text == filter_keyword: is_good = True break if not is_good: continue report_data['filtered'] += 1 if manifestation is None: report_data['missing_sourceid'].append(title) # log.warning("avManifestation not found, skipping record") continue if content_type is not None: content_type = content_type.lower() item = manifestation.find(tag("item")) if item is None: # missing <item> => type cannot be found report_data['wrong_content_type'].append(title) continue item_type = item.find(tag("type")) if item_type is None: # missing <type> report_data['wrong_content_type'].append(title) continue if item_type.text.lower() != content_type: # wrong type report_data['wrong_content_type'].append(title) continue # ATTENZIONE: il sourceID va preso dal recordSource che sta # sotto avcreation/nonavcreation e NON sotto # avManifestation/nonAVManifestation #recordSource = manifestation.find(tag("recordSource")) if recordSource is None: report_data['missing_sourceid'].append(title) # log.warning("recordSource not found, skipping record") continue sourceID = recordSource.find(tag("sourceID")) if sourceID is None: report_data['missing_sourceid'].append(title) # log.warning("sourceID not found, skipping record") continue content = etree.tostring(efgEntity, pretty_print=False) # id_text = urllib.parse.quote_plus(sourceID.text.strip()) # replace non alpha-numeric characters with a dash id_text = re.sub(r'[\W_]+', '-', sourceID.text.strip()) # fine cinzia filename = "%s_%s_%s.xml" % ( metadata_set, id_text, timestamp ) filepath = os.path.join(dest_folder, filename) # with open(filepath, 'wb') as f: with codecs.open(filepath, 'wb', "utf-8") as f: f.write(content.decode('utf-8')) # OLD #with codecs.open(filepath, 'wb', "utf-8") as f: # f.write(html.unescape(content.decode('utf-8'))) report_data['saved'] += 1 report_data['saved_files'].append(filename) except NoRecordsMatchError as e: log.warning("No more records after filtering?") log.warning(e) # ################### # Write report file # ################### # the procedure writes a report file containing the results # of the harvesting: # the list of records that do not contain the record ID # (by writing the content of the element title) with open(log_file, 'w+') as f: 
json.dump(report_data, f) f.close() # Just to close previous dot line print("") log.info(""" %s records from set [%s] downloaded open log file [%s] for details """ % (report_data['saved'], metadata_set, log_file) )
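# Hypothetical invocation of the harvest() function defined above; the set name,
# destination folder, log file, content type and dates are placeholders, not
# values taken from the original project:
harvest(
    metadata_set='imediacities',
    dest_folder='./harvested',
    log_file='./harvest_report.json',
    content_type='video',
    from_date='2018-01-01',
    until_date=None,
)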
def oaiSpider(subject="hep-ex", section="physics", start=None, end=None, sleep_time = 0): ''' Pull articles using the Open Archives Initiative protocol subject - String defining the subset of the main section section - String defining the main section (typically physics or nothing) start - A datetime.datetime object restricting the starting date of returned articles end - A datetime.datetime object restricting the ending date of the returned articles sleep_time - A number specifying how many ms to wait between the record queries Examples oaiSpider("hep-ex", "physics") ==> returns all HEP experiment articles oaiSpider("cs", "", datetime(2011,06,24)) ==> returns all computer science articles submitted after June 24th, 2011 oaiSpider("hep-ph", "physics", None, datetime(2011,06, 24)) ==> returns all HEP phenomenology articles submitted before June 24th, 2011 Returns a list of dictionaries containing the article metadata ''' from oaipmh.client import Client from oaipmh.metadata import MetadataRegistry, oai_dc_reader base_url = "http://export.arxiv.org/oai2" output = [] registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client(base_url, registry) client.updateGranularity() if section == None: section = "" if len(section) > 0 and section[-1] != ":": section += ":" # sets = client.listSets() # for entry in sets: # print entry ### OAIPMH module sucks donkey balls # Causes some error when I use the from_ or until keys records = client.listRecords(metadataPrefix='oai_dc' , set='%s%s' % (section, subject) , from_=start #, from_=datestamp , until=end ) counter = 0 for (header, metadata, aux) in records: print counter # for key in metadata._map.keys(): # print key, metadata[key] output.append({"title" : cleanText(metadata["title"][0]), "abstract" : cleanText(metadata["description"][0]), "date" : convertDate(max(metadata["date"])), "subject" : subject, "url" : metadata["identifier"][0], "authors" : "; ".join( metadata['creator']), }) print output[-1] counter += 1 # break # if counter > 15: # break time.sleep(sleep_time) return output
"""Returns the PyMARC record from the OAI structure for MARC XML""" def __call__(self, element): print element[0][1].text handler = marcxml.XmlHandler() marcxml.parse_xml(StringIO(tostring(element[0])), handler) return handler.records[0] marcxml_reader = MARCXMLReader() # Defining of metadata Readers in the Registry from oaipmh import metadata registry = metadata.MetadataRegistry() registry.registerReader('marc21', marcxml_reader) #### OAI-PMH Client processing oai = Client('http://snape.mzk.cz/OAI-script', registry) recs = oai.listRecords(metadataPrefix='marc21', set='MZK03') for rec in recs: print rec[0].identifier() r = rec[1] # Get pyMARC representation print r['856'] print r['034'] print r['008'] print
registry = metadata.MetadataRegistry()
registry.registerReader("marc21", marcxml_reader)

g = Graph()
g.namespace_manager.bind("skos", SKOS)
g.namespace_manager.bind("cn", CN)
g.namespace_manager.bind("dc", DC)
g.namespace_manager.bind("dct", DCT)
g.namespace_manager.bind("rdaa", RDAA)
g.namespace_manager.bind("rdac", RDAC)

oai = Client("https://fennica.linneanet.fi/cgi-bin/oai-pmh-fennica-asteri-aut.cgi", registry)
# recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames', from_=datetime(2013,1,1))
recs = oai.listRecords(metadataPrefix="marc21", set="corporateNames")

lang_cache = {}
lcf = codecs.open(LANG_CACHE_FILE, "r", "utf-8")
for line in lcf:
    lang, text = line.rstrip("\r\n").split("\t")
    if lang == "":
        lang = None
    lang_cache[text] = lang
lcf.close()

label_to_uri = {}

def guess_language(text):
    """return the most likely language for the given unicode text string"""
def indexCollection(URL, url_base, metadata_prefix, collection, action): #pull data from OAI endpoint registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client(URL, registry, force_http_get=True) harvested_data = [] for record in client.listRecords(metadataPrefix=metadata_prefix, set=collection): if not record[0].isDeleted(): fields = record[1].getMap() if fields['subject']: fields['subjects'] = fields['subject'][0].split(';') del fields['subject'] fields['set'] = record[0].setSpec() identifier = record[0].identifier().split(':')[2] fields[ 'image_url_base'] = url_base + '/digital/iiif/' + identifier + '/' harvested_data.append(fields) if action is 'reindex': es.indices.delete(index='digital_collection_recs', ignore=[400, 404]) mapping = { "mappings": { "_doc": { "properties": { "title": { "type": "text" }, "creator": { "type": "text" }, "subjects": { "type": "text" }, "description": { "type": "text" }, "publisher": { "type": "text" }, "contributor": { "type": "text" }, "date": { "type": "text" }, "type": { "type": "text", "fielddata": "true" }, "format": { "type": "text", "fielddata": "true" }, "identifier": { "type": "text" }, "source": { "type": "text" }, "language": { "type": "text", "fielddata": "true" }, "relation": { "type": "text" }, "coverage": { "type": "text" }, "rights": { "type": "text" }, "set": { "type": "text", "fielddata": "true" }, "image_url_base": { "type": "text" } } } } } es.indices.create(index='digital_collection_recs', body=mapping) helpers.bulk(es, harvested_data, index='digital_collection_recs', doc_type='_doc') return "success"
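# Hypothetical call to indexCollection() as defined above; the OAI endpoint,
# image base URL and collection alias are placeholders, not real values.
# Note that the function compares the action with `is 'reindex'`, which relies
# on string interning; an equality check (==) would be more robust.
indexCollection(
    URL='https://server.example.org/oai/oai.php',
    url_base='https://digital.example.org',
    metadata_prefix='oai_dc',
    collection='p12345coll1',
    action='reindex',
)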
# arXiv OAI url we will query
URL = "http://export.arxiv.org/oai2"

# Create OAI client; now we're all set for listing some records
client = Client(URL, registry)

# Open files for writing
titlef = open(title_file, 'w')
#abstractf = open(abstr_file, 'w')

# Keep track of run-time and number of papers
start_time = time.time()
count = 0

# Harvest
for record in client.listRecords(metadataPrefix='oai_dc', set=section):
    try:
        # Extract the title
        title = record[1].getField('title')[0]
        # Extract the abstract (oai_dc exposes the abstract as the 'description' field)
        abstract = record[1].getField('description')[0]
        # And get the date (this is stored as yyyy-mm-dd in the arXiv metadata)
        date = record[1].getField('date')[0]
        year = int(date[0:4])
        month = int(date[5:7])
        # Write to file (add year info to the titles)
        titlef.write("%d %d " % (year, month) + title + "\n")
        # abstractf.write(abstract + "\n")
        count += 1
from rdflib import URIRef, Graph, Literal, Namespace
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

easy_id = Namespace('https://easy.dans.knaw.nl/ui/datasets/id/easy-dataset:')

def easy_url(oai_id):
    namespace, dataset = oai_id.rsplit(':', 1)
    if namespace != 'oai:easy.dans.knaw.nl:easy-dataset':
        raise Exception("Unknown namespace: {0}".format(namespace))
    return easy_id[dataset]

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client('http://easy.dans.knaw.nl/oai/', registry)

graph = Graph()
graph.namespace_manager.bind('dc11', 'http://purl.org/dc/elements/1.1/')
dc11 = Namespace('http://purl.org/dc/elements/1.1/')

# max_count = 30000
for count, (header, metadata, _) in enumerate(client.listRecords(metadataPrefix='oai_dc')):
    # if count >= max_count:
    #     break
    if metadata is not None:
        metadata_fields = metadata.getMap().iteritems()
        s = easy_url(header.identifier())
        for p, vv in metadata_fields:
            for v in vv:
                graph.add((s, dc11[p], Literal(v)))

graph.serialize('easy-lod.nt', format='nt')
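# pyoai also supports selective (incremental) harvesting; a minimal sketch that
# re-uses the client, graph and helpers above and only asks for records changed
# since a given date (the date itself is an arbitrary example):
from datetime import datetime

for header, metadata, _ in client.listRecords(metadataPrefix='oai_dc',
                                              from_=datetime(2015, 1, 1)):
    if metadata is not None:
        for title in metadata.getMap().get('title', []):
            graph.add((easy_url(header.identifier()), dc11['title'], Literal(title)))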
#!/usr/bin/env python
# Dependencies
# pyoai - OAI-PMH Python Module - http://infrae.com/download/OAI/pyoai
from itertools import islice

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

oaiSourceURL = 'http://digitalrepository.unm.edu/do/oai/'

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(oaiSourceURL, registry)

with open("output.txt", "w") as outfile:
    # listRecords() has no "max" argument; truncate the record iterator instead
    for record in islice(client.listRecords(metadataPrefix='oai_dc'), 10):
        outfile.write(repr(record) + "\n")