def _sru_lookup(self):
    """Query every configured SRU server for ``self.isbn`` and store the hits.

    Each key of ``self.SRU_SERVERS`` is a URL template that is filled with
    the ISBN and a record schema.  Templates containing ``GGC`` are asked
    for ``dcx`` records, all others for ``mods``.  For the first record of
    each answer, every non-blank element text is handed to ``self._add``
    under the ``"sru"`` source, keyed by the element's local tag name
    (``parent_child`` for nested elements).

    Returns:
        ``False`` as soon as ``self.get`` reports a failed fetch,
        ``None`` otherwise.
    """
    srw = "{http://www.loc.gov/zing/srw/}"
    for server in self.SRU_SERVERS:
        schema = 'dcx' if 'GGC' in server else 'mods'
        data = self.get(server % (self.isbn, schema))
        if data is False:
            return False

        root = lxml_lib.fromstring(data)
        found = int(root.find(srw + "numberOfRecords").text)
        if found <= 0:
            continue

        record_data = (root.find(srw + "records")
                       .find(srw + "record")
                       .find(srw + "recordData"))
        if schema == 'dcx':
            container = record_data.find("{info:srw/schema/1/dc-v1.1}dc")
        else:
            container = record_data.find("{http://www.loc.gov/mods/v3}mods")

        for item in container.getchildren():
            parent_tag = item.tag.split('}')[1]
            for name in item.getchildren():
                # In the ElementTree API an element without text content has
                # ``text`` set to None, so test it before calling strip().
                if name.text and name.text.strip():
                    nname = parent_tag + "_" + name.tag.split('}')[1]
                    self._add("sru", name.text, nname)
            if item.text and item.text.strip():
                self._add("sru", item.text, parent_tag)
def _worldcat_lookup(self):
    """Fetch the WorldCat answer for ``self.isbn`` into ``self.data["worldcat"]``.

    The URL is built from ``self.worldcat`` with the ISBN and
    ``self.worldcat_key``.  When the answer reports at least one result,
    the children of the first Atom ``entry`` are stored as
    ``{local_tag: {text: True}}``; grandchildren are stored under
    ``parent_child`` keys.  Blank or missing texts are skipped.

    ``self.data["worldcat"]`` is always reset to an empty dict, also when
    the fetch yields nothing.

    Returns:
        None.
    """
    url = self.worldcat % (self.isbn, self.worldcat_key)
    raw = url_cache.getURLdata(url)
    if not raw:
        # A falsy fetch result cannot be parsed; leave an empty result set
        # instead of crashing inside fromstring().
        self.data["worldcat"] = {}
        return

    feed = lxml_lib.fromstring(raw)
    self.data["worldcat"] = {}
    results = self.data["worldcat"]

    total = feed.find('{http://a9.com/-/spec/opensearch/1.1/}totalResults')
    if int(total.text) <= 0:
        return

    entry = feed.find('{http://www.w3.org/2005/Atom}entry')
    for item in entry.getchildren():
        if item.text and item.text.strip():
            key = item.tag.split('}')[1]
            results.setdefault(key, {})[item.text] = True
        for name in item.getchildren():
            if name.text and name.text.strip():
                nname = item.tag.split('}')[1] + "_" + name.tag.split('}')[1]
                results.setdefault(nname, {})[name.text] = True
def _sru_lookup(self):
    """Look ``self.isbn`` up on all SRU servers and record what they return.

    The keys of ``self.SRU_SERVERS`` are URL templates taking
    ``(isbn, schema)``.  A template containing ``GGC`` is queried with the
    ``dcx`` schema, any other with ``mods``.  From the first record of an
    answer every element with non-blank text is passed to ``self._add``
    with source ``"sru"``; nested elements are keyed ``parent_child``.

    Returns:
        ``False`` if ``self.get`` returned ``False`` for a server
        (remaining servers are not queried), otherwise ``None``.
    """
    ns_srw = "{http://www.loc.gov/zing/srw/}"
    for server in self.SRU_SERVERS:
        if 'GGC' in server:
            schema = 'dcx'
        else:
            schema = 'mods'

        response = self.get(server % (self.isbn, schema))
        if response is False:
            return False

        tree = lxml_lib.fromstring(response)
        if int(tree.find(ns_srw + "numberOfRecords").text) <= 0:
            continue

        record_data = tree.find(ns_srw + "records")
        record_data = record_data.find(ns_srw + "record")
        record_data = record_data.find(ns_srw + "recordData")
        if schema == 'dcx':
            payload = record_data.find("{info:srw/schema/1/dc-v1.1}dc")
        else:
            payload = record_data.find("{http://www.loc.gov/mods/v3}mods")

        for item in payload.getchildren():
            item_key = item.tag.split('}')[1]
            for name in item.getchildren():
                # ``text`` is None for elements without text content;
                # calling strip() on it unguarded raised AttributeError.
                if name.text and name.text.strip():
                    self._add("sru", name.text,
                              item_key + "_" + name.tag.split('}')[1])
            if item.text and item.text.strip():
                self._add("sru", item.text, item_key)
def _sameas(self, sameas):
    """Record identifiers that are equivalent to ``sameas`` via ``self._add``.

    Three kinds of input are handled:

    * a string starting with ``http`` is looked up through
      ``self.URL_SAMEAS``;
    * a valid ISBN (per ``isbn_check.isValid``) is looked up through
      ``self.URL_SAMEAS_ISBN``; every ISBN in the XML answer that differs
      from ``sameas`` is stored as ``http://data.kbresearch.nl/isbn:<isbn>``;
    * anything else is looked up through ``self.URL_SAMEASQ``.

    For the first and last case the ``duplicates`` of the first answer
    entry are stored, with Freebase and DBpedia URIs rewritten to their
    ``data.kbresearch.nl`` counterparts.

    Returns:
        ``False`` when the ``http`` or ISBN lookup yields no usable answer,
        otherwise ``None``.
    """
    is_url = sameas.startswith('http')

    if not is_url and isbn_check.isValid(sameas):
        data = self.get(self.URL_SAMEAS_ISBN % sameas)
        if isinstance(data, bool) or not data:
            # Failed fetch: nothing to parse.
            return False
        xml = lxml_lib.fromstring(data)
        for item in xml.iterchildren():
            # Skip elements without text; str(None) would otherwise be
            # stored as the bogus identifier "isbn:None".
            if not item.text or not item.text.strip():
                continue
            isbn = str(item.text)
            if sameas != isbn and sameas != isbn_check.convert(isbn):
                self._add(sameas, "http://data.kbresearch.nl/isbn:" + isbn, "same")
        return None

    if is_url:
        url = self.URL_SAMEAS % sameas
        sameas = sameas.replace('http%3A//', 'http://')
    else:
        url = self.URL_SAMEASQ % sameas

    data = self.get(url)
    if not isinstance(data, list) or not data:
        # Only the ``http`` branch signals failure with False.
        return False if is_url else None

    for item in data[0]["duplicates"]:
        if item.find('http://rdf.freebase.com/ns/') > -1:
            # NOTE(review): the 'en.' / 'guid.' replacements apply to the
            # whole URI, not only to the leading namespace segment.
            item = item.replace('http://rdf.freebase.com/ns/', 'http://data.kbresearch.nl/fb:/').replace('en.', 'en/').replace('guid.', 'guid/')
        self._add(sameas, item.replace('http://dbpedia.org/resource/', 'http://data.kbresearch.nl/DBP:'), "same")
    return None
def _sameas(self):
    """Store the ISBNs that the ``URL_THING`` service lists for ``self.isbn``.

    The XML answer fetched from ``self.URL_THING % self.isbn`` is walked
    child by child.  Every child text that differs from ``self.isbn`` (both
    as-is and after ``isbn_check.convert``) is added under
    ``"librarything_sameas"`` as ``http://data.kbresearch.nl/isbn:<isbn>``.

    Returns:
        ``False`` when the fetch produced nothing to parse, otherwise
        ``None``.
    """
    data = self.get(self.URL_THING % self.isbn)
    if isinstance(data, bool) or not data:
        # Failed fetch: fromstring() would raise on this.
        return False

    xml = lxml_lib.fromstring(data)
    for item in xml.iterchildren():
        # Elements without text would become the literal string "None".
        if not item.text or not item.text.strip():
            continue
        isbn = str(item.text)
        if self.isbn != isbn and self.isbn != isbn_check.convert(isbn):
            self._add("librarything_sameas", "http://data.kbresearch.nl/isbn:" + isbn, "isbn")
    return None
def _ryerson_lookup(self):
    """Fetch the Ryerson answer for ``self.isbn`` and store its fields.

    Resets ``self["ryerson"]`` to an empty dict, then passes every
    non-blank child text of each ``record`` element to ``self._add`` under
    the ``"ryerson"`` source, keyed by the child's tag.

    Returns:
        ``False`` when ``self.get`` returned a boolean, otherwise ``None``.
    """
    response = self.get(self.URL_RYERSON % self.isbn)
    if isinstance(response, bool):
        return False

    self["ryerson"] = {}
    tree = lxml_lib.fromstring(response)
    for record in tree.findall('record'):
        for field in record.getchildren():
            value = field.text
            if not (value and value.strip()):
                continue
            self._add("ryerson", value, field.tag)
def _ryerson_lookup(self):
    """Fetch the Ryerson answer for ``self.isbn`` and store its fields.

    Resets ``self["ryerson"]`` to an empty dict.  For every non-blank child
    text of each ``record`` element, ``&zoom=5`` is replaced by ``&zoom=1``
    in the value and ``_url`` is removed from the tag before both are
    passed to ``self._add`` under the ``"ryerson"`` source.

    Returns:
        ``False`` when ``self.get`` returned a boolean, otherwise ``None``.
    """
    response = self.get(self.URL_RYERSON % self.isbn)
    if isinstance(response, bool):
        return False

    self["ryerson"] = {}
    tree = lxml_lib.fromstring(response)
    for record in tree.findall('record'):
        for field in record.getchildren():
            raw = field.text
            if not (raw and raw.strip()):
                continue
            value = raw.replace('&zoom=5', '&zoom=1')
            key = field.tag.replace('_url', '')
            self._add("ryerson", value, key)
def getSRUdata(self, cisbn):
    """Return the parsed answer of the first SRU server that knows *cisbn*.

    The ISBN is normalised with ``self.fix_isbn`` and validated with
    ``isbn.isValid``.  Each key of ``self.SRU_servers`` is a URL template
    filled with ``(cisbn, "dc")``.  The first answer whose
    ``numberOfRecords`` element holds a positive count is handed to
    ``self._parse_dc_awnser``.

    Returns:
        The result of ``self._parse_dc_awnser(data, cisbn)`` for the first
        server with a hit, or ``False`` when the ISBN is invalid or no
        server reports a record.
    """
    cisbn = self.fix_isbn(cisbn)
    if not isbn.isValid(cisbn):
        return False

    for server in self.SRU_servers:
        data = url_cache.getURLdata(server % (cisbn, "dc"))
        if not data:
            continue
        data = lxml_lib.fromstring(data)
        for child in data.getchildren():
            if not child.tag.endswith("numberOfRecords"):
                continue
            # int() replaces the deprecated string.atoi(); an element
            # without text counts as "no records" instead of raising.
            if child.text and int(child.text) > 0:
                return self._parse_dc_awnser(data, cisbn)
            # Count found but not positive: try the next server.
            break
    return False
def _isbndb_lookup(self):
    """Fetch the ISBNdb answer and store it in ``self.data["isbndb"]``.

    The children of ``BookList/BookData`` are stored as
    ``{tag: {text: True}}``.  ``self.data["isbndb"]`` is left as an empty
    dict when the fetch yields nothing or the answer has no
    ``BookList/BookData`` element.

    Returns:
        None.
    """
    # NOTE(review): ``url`` is not defined inside this method; presumably
    # it is a module-level name -- confirm, otherwise this raises NameError.
    data = url_cache.getURLdata(url)
    if not data:
        # Nothing fetched: fromstring() cannot parse a falsy value.
        self.data["isbndb"] = {}
        return

    data = lxml_lib.fromstring(data)
    self.data["isbndb"] = {}
    fields = self.data["isbndb"]

    # Explicit None checks replace the former bare ``except: pass`` that
    # silently covered a missing BookList/BookData element.
    book_list = data.find('BookList')
    if book_list is None:
        return
    book_data = book_list.find('BookData')
    if book_data is None:
        return

    for item in book_data.getchildren():
        fields.setdefault(item.tag, {})[item.text] = True
def _worldcat_lookup(self):
    """Store the WorldCat answer for ``self.isbn`` in ``self.data["worldcat"]``.

    The request URL is ``self.worldcat`` filled with the ISBN and
    ``self.worldcat_key``.  If the answer's ``totalResults`` is positive,
    each child of the first Atom ``entry`` with non-blank text is stored as
    ``{local_tag: {text: True}}``, and each grandchild with non-blank text
    under the key ``parent_child``.

    ``self.data["worldcat"]`` is reset to an empty dict in every case,
    including a fetch that returns nothing.

    Returns:
        None.
    """
    url = self.worldcat % (self.isbn, self.worldcat_key)
    data = url_cache.getURLdata(url)
    if not data:
        # A falsy fetch result would make fromstring() raise.
        self.data["worldcat"] = {}
        return

    data = lxml_lib.fromstring(data)
    self.data["worldcat"] = {}
    store = self.data["worldcat"]

    def remember(key, text):
        # Keep ``text`` under ``key`` unless it is missing or blank.
        if text and text.strip():
            store.setdefault(key, {})[text] = True

    hits = data.find('{http://a9.com/-/spec/opensearch/1.1/}totalResults')
    if int(hits.text) > 0:
        entry = data.find('{http://www.w3.org/2005/Atom}entry')
        for item in entry.getchildren():
            if item.text and item.text.strip():
                remember(item.tag.split('}')[1], item.text)
            for name in item.getchildren():
                if name.text and name.text.strip():
                    remember(item.tag.split('}')[1] + "_" + name.tag.split('}')[1],
                             name.text)
def loop_once(self):
    """Fetch one ListIdentifiers page and mirror it into the set's collection.

    The request URL is built from ``self.serverURL``: with
    ``self.resumptionToken`` when one is set, otherwise with
    ``self.metadataPrefix``, ``self.setName`` and (if set) ``self.fromDate``.
    The verb and parameter names look like OAI-PMH.

    Every element of the answer is visited in document order:

    * an element carrying ``status="deleted"`` marks the next identifier
      as deleted;
    * an element whose tag contains ``identifier`` is inserted into
      ``self.mongoDB[self.setName]`` (keyed by the MD5 hex digest of its
      text, status ``"new"``) unless a document with that key exists, or,
      when marked deleted, removed if its stored status is ``"new"`` or
      ``"done"``;
    * an element whose tag contains ``resumptionToken`` and has text ends
      the call, provided at least one identifier was counted before it:
      the token is kept on ``self`` and saved in the collection.

    If no such token ends the call, the stored token is reset to ``False``,
    today's date is saved under ``"from"`` and ``self.resumptionToken`` is
    cleared.

    Returns:
        An empty tuple when a resumption token was stored, otherwise
        ``None``.  If the URL cannot be read the process is terminated
        with ``os._exit(-1)``.
    """
    if self.resumptionToken:
        url = self.serverURL + "?verb=ListIdentifiers&resumptionToken=" + self.resumptionToken
    else:
        if self.fromDate:
            url = self.serverURL + "?verb=ListIdentifiers&metadataPrefix=" + self.metadataPrefix + "&from=" + self.fromDate + "&set=" + self.setName
        else:
            url = self.serverURL + "?verb=ListIdentifiers&metadataPrefix=" + self.metadataPrefix + "&set=" + self.setName
    if DEBUG:
        print(url)
    try:
        data = urllib.urlopen(url).read()
    # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit;
    # os._exit() then ends the process without running cleanup handlers.
    except:
        sys.stdout.write("Could not read data from " + url)
        os._exit(-1)
    data = lxml_lib.fromstring(data)
    # ``add`` is True while the identifier being read is a live record;
    # it is flipped to False by a preceding status="deleted" attribute.
    add = True
    deleted = 0
    added = 0
    for item in data.iter():
        # A token is only followed when this page contained at least one
        # counted identifier.
        if item.tag.find("resumptionToken") > -1 and item.text and (
                added > 0 or deleted > 0):
            self.resumptionToken = item.text
            self.mongoDB[self.setName].save({
                "_id": "resumptionToken",
                "resumptionToken": item.text
            })
            if DEBUG:
                print(added, deleted)
            return ()
        if "status" in item.attrib:
            if item.attrib["status"] == "deleted":
                add = False
        if item.tag.find("identifier") > -1:
            if add:
                """ toMongo """
                doc = {}
                doc = {
                    "id": item.text,
                    "status": "new",
                    "_id": hashlib.md5(item.text).hexdigest()
                }
                record = self.mongoDB[self.setName].find_one(
                    {"_id": hashlib.md5(item.text).hexdigest()})
                if record:
                    """record is allready there so don't mess with it"""
                    pass
                else:
                    self.mongoDB[self.setName].insert(doc)
                # NOTE(review): as laid out here every live identifier is
                # counted, whether or not it was newly inserted -- confirm
                # this matches the intended nesting.
                added += 1
            else:
                """ deletefromMongo """
                delete = self.mongoDB[self.setName].find_one(
                    {"_id": hashlib.md5(item.text).hexdigest()})
                if delete:
                    if delete["status"] == "new" or delete[
                            "status"] == "done":
                        # NOTE(review): save/insert/find_one look like the
                        # legacy PyMongo Collection API, which offers
                        # remove() rather than delete() -- verify this call.
                        self.mongoDB[self.setName].delete(
                            {"_id": hashlib.md5(item.text).hexdigest()})
                # Reset the flag so the next identifier is treated as live.
                add = True
                deleted += 1
    # No resumption token ended the call: clear the stored token and
    # remember today's date under "from".
    self.mongoDB[self.setName].save({
        "_id": "resumptionToken",
        "resumptionToken": False
    })
    self.mongoDB[self.setName].save({
        "_id": "from",
        "from": datetime.datetime.now().strftime("%Y-%m-%d")
    })
    self.resumptionToken = False