def insertAll(time, time2):
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    client.updateGranularity()
    records = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for a in records:
        # a = records.next()
        try:
            title = '\n'.join(a[1]['title'])
            sr2 = str(' '.join(a[1]['categories']).replace('-', '_')).split(' ')
            abstract = '\n'.join(a[1]['abstract'])
            url = 'http://arxiv.org/abs/' + a[1]['id'][0]
            date = datetime.strptime(a[1]['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = a[1]['authors'][0]  # '; '.join(a[1]['keynames'])
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print title
            print sr2
            print abstract
            print url
            print date
            print authors
            insert(title + ' (' + authors + ')', "fullarxiv", url, abstract,
                   date=date, cross_srs=sr2)
        except Exception:
            print 'ERROR'
            print a
            errors = errors + 1
    print 'Completed with %s errors' % errors
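# Usage sketch for insertAll() above, assuming the module already defines URL,
# arXivRaw_reader and insert() as referenced in the function body; the one-day
# window below is illustrative only.
from datetime import datetime, timedelta

if __name__ == '__main__':
    until = datetime.utcnow()
    insertAll(until - timedelta(days=1), until)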
def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    base_url = "http://export.arxiv.org/oai2"
    output = list()

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    records = client.listRecords(metadataPrefix='oai_dc', set="{}".format(subject),
                                 from_=start, until=end)

    for _, md, _ in records:
        # print md.getField("title")
        # checks for the case in 2010 when there is no title for something
        if md is not None:
            txt_dict = {"title": md["title"],
                        "abstract": md["description"],
                        "date": md["date"],
                        "subject": md["subject"],
                        "url": md["identifier"],
                        "authors": md['creator']}
            output.append(txt_dict)

        time.sleep(sleep_time)

    return output
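# Usage sketch for arxiv_oai_scraper() above, assuming the oaipmh imports
# (Client, MetadataRegistry, oai_dc_reader) and the time module are available
# in this module; the set name and date window are illustrative only.
from datetime import datetime

if __name__ == '__main__':
    papers = arxiv_oai_scraper("physics:hep-ex",
                               datetime(2015, 1, 1), datetime(2015, 1, 7),
                               sleep_time=1)
    print("%d records harvested" % len(papers))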
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, metadata_registry)
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl))
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl)
        )
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl))
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        # Unit test hotfix
        header, metadata, about = record
        # Fix pyoai returning a "b'...'" string for py3k
        if isinstance(metadata, str) and metadata.startswith("b'"):
            metadata = ast.literal_eval(metadata).decode("utf-8")
        yield (header, metadata, about)
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl)
        )
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        # Unit test hotfix
        header, metadata, about = record
        # Fix pyoai returning a "b'...'" string for py3k
        if isinstance(metadata, str) and metadata.startswith("b'"):
            metadata = ast.literal_eval(metadata).decode("utf-8")
        yield (header, metadata, about)
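# Sketch of how the _listRecords() generator variants above are typically
# consumed. The 'harvester' argument stands in for an instance of whichever
# class defines them (that class is not shown here), and the base URL is
# illustrative.
def print_identifiers(harvester, base_url="http://export.arxiv.org/oai2"):
    for header, metadata, about in harvester._listRecords(base_url,
                                                          metadataPrefix="oai_dc"):
        print(header.identifier())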
def now():
    return datetime.now().ctime()


print >>sys.stderr, "beginning @", now()

URL = "http://citeseerx.ist.psu.edu/oai2"

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)
client.updateGranularity()

store = Store()

if len(sys.argv) > 1:
    start = datetime.strptime(sys.argv[1], '%Y-%m-%d')  # 2011-10-27, for instance
elif store.last():
    start = store.last()
else:
    start = client.identify().earliestDatestamp()

# Try this and see if it works; if it does resumption tokens right, this should work fine.
chunk = timedelta(days=1)
oneday = timedelta(days=1)
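# The harvesting loop itself is not part of the snippet above. A minimal
# day-by-day sketch using the variables it defines; since the Store API is not
# shown here, records are only counted (an assumption, not the original code).
current = start
while current < datetime.now():
    count = 0
    for header, metadata, about in client.listRecords(metadataPrefix='oai_dc',
                                                      from_=current,
                                                      until=current + chunk):
        count += 1
    print("%s - %s: %d records" % (current, current + chunk, count))
    current += oneday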
def run(self):
    # Check that ElasticSearch is alive
    self.check_index()

    # If the user specified the --REBUILD flag, recreate the index
    if self.options['rebuild']:
        self.rebuild_index()

    # Connect to the repository
    registry = MetadataRegistry()
    registry.registerReader(self.settings["metadata_format"],
                            self.settings["metadata_reader"])
    client = Client(self.settings["uri"], registry)
    identity = client.identify()
    print "Connected to repository: %s" % identity.repositoryName()

    # Got to update granularity or we barf with:
    # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
    client.updateGranularity()

    # Initialise some variables
    batcher = Batch.Batch()
    total_records = 0
    start = time.time()

    # Now do the synchronisation.
    # If the user specified an identifier, then synchronise this record
    if self.options['identifier'] is not None:
        total_records += self.synchronise_record(client, batcher,
                                                 self.options['identifier'])
    else:
        # Else, synchronise using the date-range provided by the user, or
        # failing that, the date-range based on the last sync

        # Get the synchronisation config record
        synchronisation_config = self.get_synchronisation_config()

        if self.options["from_date"] is not None:
            # If the user specified a from-date argument, use it
            from_date = self.options["from_date"]  # already a date (not a datetime)
        elif synchronisation_config is not None and "to_date" in synchronisation_config:
            # Else read the last synchronised to_date from the config, and add on a day
            from_date = dateutil.parser.parse(
                synchronisation_config["to_date"]).date() + timedelta(days=1)
        else:
            # Else use the default_from_date in the config
            from_date = dateutil.parser.parse(
                self.settings['default_from_date']).date()

        if self.options["to_date"] is not None:
            to_date = self.options["to_date"]  # already a date (not a datetime)
        else:
            to_date = (date.today() - timedelta(days=1))

        # Force the from_date to use time 00:00:00
        from_date = datetime.combine(
            from_date, _time(hour=0, minute=0, second=0, microsecond=0))

        # Force the to_date to use time 23:59:59
        to_date = datetime.combine(
            to_date, _time(hour=23, minute=59, second=59, microsecond=0))

        print "Synchronising from %s - %s" % (from_date, to_date)

        while from_date < to_date:
            next_date = datetime.combine(
                from_date.date() + timedelta(days=(self.settings['delta_days'] - 1)),
                _time(hour=23, minute=59, second=59, microsecond=0))
            number_of_records = self.synchronise_period(client, batcher,
                                                        from_date, next_date)
            batcher.clear()  # store the records in elasticsearch
            self.put_synchronisation_config(from_date, next_date, number_of_records)
            from_date += timedelta(days=(self.settings['delta_days']))
            total_records += number_of_records

            # Pause so as not to get banned.
            to = 20
            print "Sleeping for %i seconds so as not to get banned." % to
            time.sleep(to)

    # Store the records in the index
    batcher.clear()

    # Print out some statistics
    time_spent = time.time() - start
    print 'Total time spent: %d seconds' % (time_spent)

    if time_spent > 0.001:  # careful as it's not an integer
        print 'Total records synchronised: %i records (%d records/second)' % (
            total_records, (total_records / time_spent))
    else:
        print 'Total records synchronised: %i records' % (total_records)
    return total_records

    sys.exit()
def oaiSpider(subject="hep-ex", section="physics", start=None, end=None, sleep_time=0):
    '''
    Pull articles using the Open Archives Initiative protocol

    subject    - String defining the subset of the main section
    section    - String defining the main section (typically physics or nothing)
    start      - A datetime.datetime object restricting the starting date of returned articles
    end        - A datetime.datetime object restricting the ending date of the returned articles
    sleep_time - A number specifying how many ms to wait between the record queries

    Examples:
        oaiSpider("hep-ex", "physics")
            ==> returns all HEP experiment articles
        oaiSpider("cs", "", datetime(2011, 6, 24))
            ==> returns all computer science articles submitted after June 24th, 2011
        oaiSpider("hep-ph", "physics", None, datetime(2011, 6, 24))
            ==> returns all HEP phenomenology articles submitted before June 24th, 2011

    Returns a list of dictionaries containing the article metadata
    '''
    from oaipmh.client import Client
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    base_url = "http://export.arxiv.org/oai2"
    output = []

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    if section is None:
        section = ""
    if len(section) > 0 and section[-1] != ":":
        section += ":"

    # sets = client.listSets()
    # for entry in sets:
    #     print entry

    # The oaipmh module has raised errors in the past when the from_ or until
    # keys are used.
    records = client.listRecords(metadataPrefix='oai_dc',
                                 set='%s%s' % (section, subject),
                                 from_=start,
                                 until=end)

    counter = 0
    for (header, metadata, aux) in records:
        print counter
        # for key in metadata._map.keys():
        #     print key, metadata[key]
        output.append({"title": cleanText(metadata["title"][0]),
                       "abstract": cleanText(metadata["description"][0]),
                       "date": convertDate(max(metadata["date"])),
                       "subject": subject,
                       "url": metadata["identifier"][0],
                       "authors": "; ".join(metadata['creator']),
                       })
        print output[-1]
        counter += 1
        # break
        # if counter > 15:
        #     break
        time.sleep(sleep_time)

    return output
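# Usage sketch for oaiSpider(), following its docstring examples; cleanText()
# and convertDate() are assumed to be defined in the surrounding module.
if __name__ == '__main__':
    from datetime import datetime
    articles = oaiSpider("hep-ex", "physics", start=datetime(2011, 6, 24))
    print("fetched %d articles" % len(articles))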
class OpenBeeldenDataLoader(DataLoader):

    def __init__(self):
        self.ES_INDEX = 'et_openbeelden'
        self.ES_DOC_TYPE = 'mediaresource'
        self.es_local = Elasticsearch(host=LTV_ES_SETTINGS['host'],
                                      port=LTV_ES_SETTINGS['port'])

    def loadMediaResourceData(self, resourceUri, clientIP, loadAnnotations):
        mediaResource = MediaResource(resourceUri)
        # load the annotations (only named entities in this case)
        mediaResource = self.__getAllAnnotationsOfResource(mediaResource)
        # fetch the video metadata
        mediaResource = self.__getAllVideoMetadata(mediaResource, clientIP)
        # transform the mediaresource object to JSON and return it
        resp = simplejson.dumps(mediaResource, default=lambda obj: obj.__dict__)
        return resp

    def loadMediaResources(self, provider):  # ignores provider
        return self.loadOpenBeeldenItemsFromES(0, [])

    def loadOpenBeeldenItemsFromES(self, offset, videos):
        query = {
            "query": {"match_all": {}},
            "fields": [],
            "from": offset,
            "size": 300
        }
        resp = self.es_local.search(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE,
                                    body=query, timeout="10s")
        if resp and len(resp['hits']['hits']) > 0:
            print len(resp['hits']['hits'])
            vids = []
            for hit in resp['hits']['hits']:
                vid = self.es_local.get(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE,
                                        id=hit['_id'])
                vids.append(vid['_source'])
            for vd in vids:
                video = {
                    'id': vd['id'].replace(':', '_'),
                    'title': '; '.join(vd['title']),
                    'date': '; '.join(vd['date']),
                    'locator': self.__getMediumByExtension(vd['medium'], 'mp4'),
                    'thumbUrl': self.__getMediumByExtension(vd['medium'], 'png'),
                    'thumbBaseUrl': ''
                }
                videos.append(video)
            self.loadOpenBeeldenItemsFromES(offset + 300, videos)
        return {'videos': videos}

    def __getMediumByExtension(self, mediums, extension):
        poster = None
        for m in mediums:
            if m.find('.%s' % extension) != -1:
                poster = m
                break
        return poster

    def __getAllAnnotationsOfResource(self, mediaResource):
        nes = []
        """
        nes.append(NamedEntity(
            label,
            entityType=LinkedTVDataUtils.getNEType(DCType, RDFType, OWLSameAs),
            subTypes=LinkedTVDataUtils.getDCTypes(DCType),
            disambiguationURL=OWLSameAs,
            start=start,
            end=end,
            annotationURI=annotationURI,
            relevance=r,
            confidence=c
        ))
        """
        mediaResource.setNamedEntities(nes)
        return mediaResource

    def __getAllVideoMetadata(self, mediaResource, clientIP):
        print mediaResource.getId()
        vd = self.es_local.get(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE,
                               id=mediaResource.getId().replace('_', ':'))
        if vd:
            vd = vd['_source']
            mediaResource.setVideoMetadata(vd)
            mediaResource.setPlayoutUrl(self.__getMediumByExtension(vd['medium'], 'mp4'))
            # set the video metadata in the mediaresource
            mediaResource.setTitle('; '.join(vd['title']))
            mediaResource.setDate('; '.join(vd['date']))
            mediaResource.setThumbBaseUrl(None)
            mediaResource.setSrtUrl(None)
            mediaResource.setSubtitles(None)
        return mediaResource

    def setupOAIPMHConnection(self):
        oai_oi_reader = MetadataReader(
            fields={
                'title': ('textList', 'oai_oi:oi/oi:title/text()'),
                'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
                'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
                'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
                'description': ('textList', 'oai_oi:oi/oi:description/text()'),
                'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
                'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
                'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
                'date': ('textList', 'oai_oi:oi/oi:date/text()'),
                'type': ('textList', 'oai_oi:oi/oi:type/text()'),
                'extent': ('textList', 'oai_oi:oi/oi:extent/text()'),
                'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
                'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
                'source': ('textList', 'oai_oi:oi/oi:source/text()'),
                'language': ('textList', 'oai_oi:oi/oi:language/text()'),
                'references': ('textList', 'oai_oi:oi/oi:references/text()'),
                'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
                'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'),
                'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
                'license': ('textList', 'oai_oi:oi/oi:license/text()')
            },
            namespaces={
                'oai_oi': 'http://www.openbeelden.nl/feeds/oai/',  # 'http://www.openarchives.org/OAI/2.0/oai_oi/',
                'oi': 'http://www.openbeelden.nl/oai/'
            }
        )
        URL = 'http://www.openbeelden.nl/feeds/oai/'

        # Initialize the OAI client
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_oi', oai_oi_reader)
        self.client = Client(URL, self.registry)

        # Test if the connection to the OAI-PMH provider works
        x = self.client.updateGranularity()
        x = self.client.identify()
        print 'identity %s' % x.repositoryName()
        print 'identity %s' % x.protocolVersion()
        print 'identity %s' % x.baseURL()
        # for s in client.listSets():
        #     print s

        # initialize the OpenSKOSHandler
        self.openSKOSHandler = OpenSKOSHandler()

    def reindex(self, provider=None):
        self.setupOAIPMHConnection()
        i = 0
        extent = None
        item = None
        identifier = None
        # available sets include: stichting_natuurbeelden, beeldengeluid
        for rec in self.client.listRecords(metadataPrefix=u'oai_oi', set=u'beeldengeluid'):
            header, metadata, about = rec
            extent = metadata.getField('extent')[0]
            item = {
                'id': header.identifier(),
                'identifier': self.getFieldData(metadata, 'identifier'),
                'title': self.getFieldData(metadata, 'title'),
                'alternative': self.getFieldData(metadata, 'alternative'),
                'creator': self.getFieldData(metadata, 'creator'),
                'subject': self.getFieldData(metadata, 'subject'),
                'description': self.getFieldData(metadata, 'description'),
                'abstract': self.getFieldData(metadata, 'abstract'),
                'publisher': self.getFieldData(metadata, 'publisher'),
                'contributor': self.getFieldData(metadata, 'contributor'),
                'date': self.getFieldData(metadata, 'date'),
                'date2': header.datestamp(),
                'type': self.getFieldData(metadata, 'type'),
                'extent': extent,
                'medium': self.getFieldData(metadata, 'medium'),
                'source': self.getFieldData(metadata, 'source'),
                'language': self.getFieldData(metadata, 'language'),
                'references': self.getFieldData(metadata, 'references'),
                'spatial': self.getFieldData(metadata, 'spatial'),
                'attributionName': self.getFieldData(metadata, 'attributionName'),
                'attributionURL': self.getFieldData(metadata, 'attributionURL'),
                'license': self.getFieldData(metadata, 'license'),
                'durationSecs': self.getExtentInSeconds(extent)
            }
            self.es_local.index(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE,
                                id=header.identifier(), body=item)
        print 'Done'
        return True

    def getGTAATermsBySubjects(self, subject, spatial):
        """Get the GTAA terms related to the subject"""
        gtaaTerms = self.getGTAATermsBasedOnSubjectAndLocation(subject, spatial)
        # If there is no identifier, try to fetch the taakID from iMMix ES.
        # ('identifier' and 'source' come from the surrounding context and are
        # not defined in this snippet.)
        if identifier == '' and source != '':
            print 'No taakID!'
            taakID = self.getTaakIDBasedOnSource(source)
            if taakID:
                print 'assigning taakID to the identifier'
                identifier = taakID
        return gtaaTerms

    def getFieldData(self, metadata, fn):
        # return '; '.join(metadata.getField(fn))
        return metadata.getField(fn)

    def getExtentInSeconds(self, ext):
        secs = 0
        if ext and ext.find('PT') != -1:
            ext = ext[2:len(ext)]
            if ext.find('H') != -1:
                secs += int(ext[0:ext.find('H')]) * 3600
                ext = ext[ext.find('H') + 1:len(ext)]
            if ext.find('M') != -1:
                secs += int(ext[0:ext.find('M')]) * 60
                ext = ext[ext.find('M') + 1:len(ext)]
            if ext.find('S') != -1:
                secs += int(ext[0:ext.find('S')])
        return secs

    def secsToTimeString(self, secs):
        h = m = s = 0
        while secs - 3600 >= 0:
            h += 1
            secs -= 3600
        while secs - 60 >= 0:
            m += 1
            secs -= 60
        s = secs
        return '%d:%d:%d' % (h, m, s)

    # Run the main function
    def getGTAATermsBasedOnSubjectAndLocation(self, subject, spatial):
        subs = None
        locs = None
        os_res = None
        gtaaExact = []
        gtaaFuzzy = []
        # First add GTAA terms based on the subject(s)
        if subject:
            subs = subject.split(';')
            for s in subs:
                os_res = self.openSKOSHandler.autoCompleteTable(s)
                if os_res:
                    if len(os_res) == 1:
                        gtaaExact.append('%s,%s' % (os_res[0]['label'], os_res[0]['value']))
                    elif len(os_res) > 1:
                        for r in os_res:
                            gtaaFuzzy.append('%s,%s' % (r['label'], r['value']))
        # Append the GTAA terms based on the location(s)
        if spatial:
            locs = spatial.split(';')
            for l in locs:
                os_res = self.openSKOSHandler.autoCompleteTable(
                    l, 'http://data.beeldengeluid.nl/gtaa/GeografischeNamen')
                if os_res:
                    if len(os_res) == 1:
                        gtaaExact.append('%s,%s' % (os_res[0]['label'], os_res[0]['value']))
                    elif len(os_res) > 1:
                        for r in os_res:
                            gtaaFuzzy.append('%s,%s' % (r['label'], r['value']))
        return (gtaaExact, gtaaFuzzy)

    def getImmixMetadataBasedOnDrager(self, drager):
        global tot
        query = {"query": {"bool": {
            "must": [{"query_string": {"default_field": "positie.dragernummer",
                                       "query": "\"%s\"" % drager}}],
            "must_not": [],
            "should": []
        }}}
        # print query
        resp = self.es_local.search(index="search_expressie",
                                    doc_type="searchable_expressie",
                                    body=query, timeout="10s")
        # print resp
        if resp and resp['hits']['total'] == 1:
            for hit in resp['hits']['hits']:
                return hit
        elif resp and resp['hits']['total'] > 1:
            print 'more than one hit...'
            print resp
        return None

    def getTaakIDBasedOnSource(self, source):
        dragernrs = str(source).split('; ')
        drager = None
        # Get the drager from the source (sometimes there are two, but most of
        # the time they are the same)
        if len(dragernrs) == 2:
            if dragernrs[0] != dragernrs[1]:
                print dragernrs
                print '>>>>>>>>>> There are two dragers...'
            else:
                drager = dragernrs[0]
        else:
            drager = dragernrs[0]
        # Try to find the taakID related to the drager
        if drager:
            md = self.getImmixMetadataBasedOnDrager(drager)
            if md:
                taakID = md['_source']['expressie']['niveau']['taakID']
                if taakID:
                    print 'Found a taakID: %s\t%s' % (drager, taakID)
                    return taakID
        return None
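# Usage sketch for OpenBeeldenDataLoader, assuming a reachable Elasticsearch
# node configured via LTV_ES_SETTINGS and an existing 'et_openbeelden' index.
if __name__ == '__main__':
    loader = OpenBeeldenDataLoader()
    listing = loader.loadMediaResources(None)  # the provider argument is ignored
    print('%d videos loaded' % len(listing['videos']))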
def run(self):
    # Check that ElasticSearch is alive
    self.check_index()

    # If the user specified the --REBUILD flag, recreate the index
    if self.options['rebuild']:
        self.rebuild_index()

    # Connect to the repository
    registry = MetadataRegistry()
    registry.registerReader(self.settings["metadata_format"],
                            self.settings["metadata_reader"])
    client = Client(self.settings["uri"], registry)
    identity = client.identify()
    print "Connected to repository: %s" % identity.repositoryName()

    # Got to update granularity or we barf with:
    # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
    client.updateGranularity()

    # Initialise some variables
    batcher = Batch.Batch()
    total_records = 0
    start = time.time()

    # Now do the synchronisation.
    # If the user specified an identifier, then synchronise this record
    if self.options['identifier'] is not None:
        total_records += self.synchronise_record(client, batcher,
                                                 self.options['identifier'])
    else:
        # Else, synchronise using the date-range provided by the user, or
        # failing that, the date-range based on the last sync

        # Get the synchronisation config record
        synchronisation_config = self.get_synchronisation_config()

        if self.options["from_date"] is not None:
            # If the user specified a from-date argument, use it
            from_date = self.options["from_date"]  # already a date (not a datetime)
        elif synchronisation_config is not None and "to_date" in synchronisation_config:
            # Else read the last synchronised to_date from the config, and add on a day
            from_date = dateutil.parser.parse(
                synchronisation_config["to_date"]).date() + timedelta(days=1)
        else:
            # Else use the default_from_date in the config
            from_date = dateutil.parser.parse(
                self.settings['default_from_date']).date()

        if self.options["to_date"] is not None:
            to_date = self.options["to_date"]  # already a date (not a datetime)
        else:
            to_date = (date.today() - timedelta(days=1))

        # Force the from_date to use time 00:00:00
        from_date = datetime.combine(
            from_date, _time(hour=0, minute=0, second=0, microsecond=0))

        # Force the to_date to use time 23:59:59
        to_date = datetime.combine(
            to_date, _time(hour=23, minute=59, second=59, microsecond=0))

        print "Synchronising from %s - %s" % (from_date, to_date)

        while from_date < to_date:
            next_date = datetime.combine(
                from_date.date() + timedelta(days=(self.settings['delta_days'] - 1)),
                _time(hour=23, minute=59, second=59, microsecond=0))
            number_of_records = self.synchronise_period(client, batcher,
                                                        from_date, next_date)
            batcher.clear()  # store the records in elasticsearch
            self.put_synchronisation_config(from_date, next_date, number_of_records)
            from_date += timedelta(days=(self.settings['delta_days']))
            total_records += number_of_records

            # Pause so as not to get banned.
            to = 20
            print "Sleeping for %i seconds so as not to get banned." % to
            time.sleep(to)

    # Store the records in the index
    batcher.clear()

    # Print out some statistics
    time_spent = time.time() - start
    print 'Total time spent: %d seconds' % (time_spent)

    if time_spent > 0.001:  # careful as it's not an integer
        print 'Total records synchronised: %i records (%d records/second)' % (
            total_records, (total_records / time_spent))
    else:
        print 'Total records synchronised: %i records' % (total_records)
    return total_records

    sys.exit()
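# An illustrative example of the options/settings dictionaries that run()
# reads. The keys are taken from the method body above; the values are
# assumptions, not a documented configuration schema.
from oaipmh.metadata import oai_dc_reader

EXAMPLE_SETTINGS = {
    'uri': 'http://export.arxiv.org/oai2',   # OAI-PMH endpoint (assumed)
    'metadata_format': 'oai_dc',
    'metadata_reader': oai_dc_reader,        # any pyoai MetadataReader
    'default_from_date': '2013-01-01',
    'delta_days': 1,
}

EXAMPLE_OPTIONS = {
    'rebuild': False,       # --REBUILD flag
    'identifier': None,     # synchronise a single record when set
    'from_date': None,      # datetime.date or None
    'to_date': None,        # datetime.date or None
}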