# Shared imports assumed by the functions below. In the original BUB layout each
# function lives in its own library module (gb, ht, mdc, gal, usp, man); helpers
# such as get_id_from_string, OAI_metadata_content, get_meta, store_output_file_name,
# redis_py and keys are BUB project modules, not shown here.
import difflib
import hashlib
import re
import subprocess
from urllib import quote_plus

import requests
from bs4 import BeautifulSoup
from dateutil import parser


def download_book(Id, id_for_key):
    """Download the book PDF from Brasiliana USP"""
    Id_raw = sanitize_id_to_url(Id, action='desanitize')
    url = "http://www.brasiliana.usp.br/bbd/handle/%s" % Id_raw
    r = requests.get(url)
    source = r.text
    pdf_url = extract_downloadURL(source)
    pdf = requests.get(pdf_url, stream=True)
    output_file = "/data/scratch/BUB_downloads/bub_br_%s.pdf" % Id    ###
    store_output_file_name(id_for_key, output_file)
    with open(output_file, 'wb') as f:
        for chunk in pdf.iter_content(1024):
            f.write(chunk)
    return 0

def metadata(Id):
    """Return book information and meta-data"""
    Id = get_id_from_string(Id)
    url = 'https://www.googleapis.com/books/v1/volumes/%s?key=%s' % (Id, key)
    r = requests.get(url, headers={'referer': "tools.wmflabs.org/bub"})
    if r.status_code == 404:
        return 1
    if r.status_code == 403:
        return 7
    if r.status_code != 200:
        return 10
    book_info = r.json()
    if book_info['accessInfo']['viewability'] != "ALL_PAGES":
        return 2
    keys1 = book_info['volumeInfo'].keys()
    return dict(
        image_url=book_info['volumeInfo']['imageLinks']['small']
            if 'small' in book_info['volumeInfo']['imageLinks'].keys() else "",
        thumbnail_url=book_info['volumeInfo']['imageLinks']['thumbnail']
            if 'thumbnail' in book_info['volumeInfo']['imageLinks'].keys() else "",
        printType=book_info['volumeInfo']['printType'] if 'printType' in keys1 else "",
        title=book_info['volumeInfo']['title'] if 'title' in keys1 else "",
        subtitle=book_info['volumeInfo']['subtitle'] if 'subtitle' in keys1 else "",
        author=book_info['volumeInfo']['authors'][0] if 'authors' in keys1 else "",
        publisher=book_info['volumeInfo']['publisher'] if 'publisher' in keys1 else "",
        publishedDate=book_info['volumeInfo']['publishedDate']
            if 'publishedDate' in keys1 else "",
        description=re.sub('<[^<]+?>', '', book_info['volumeInfo']['description'])
            if 'description' in keys1 else "",
        infoLink=book_info['volumeInfo']['infoLink'] if 'infoLink' in keys1 else "",
        publicDomain=book_info['accessInfo']['publicDomain']
            if 'publicDomain' in book_info['accessInfo'].keys() else "",
        language=book_info['volumeInfo']['language'] if 'language' in keys1 else "",
        scanner="google",
        sponser="Google")

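# The repeated "value if 'key' in keys1 else ''" guards above could be collapsed
# with dict.get(); a minimal sketch of the same lookup pattern (a hypothetical
# helper, not something the module defines):
def volume_field(book_info, field, default=""):
    # Fetch a volumeInfo field with a fallback, mirroring the guards above.
    return book_info['volumeInfo'].get(field, default)
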
def verify_id(Id_string):
    """Verify the Id and public-domain status for the book"""
    Id = get_id_from_string(Id_string)
    if Id == None:
        return 1
    unique_id = re.search('(.+)_(\d+)', Id)
    if not unique_id:
        return 1
    collection, identifier = unique_id.group(1), unique_id.group(2)
    try:
        r = requests.get(
            "http://mdc.cbuc.cat/cdm/compoundobject/collection/%s/id/%s/rec/"
            % (collection, identifier))
    except:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    else:
        source = r.text
        soup = BeautifulSoup(source)
        if 'Text' not in get_meta('metadata_object_type', soup):
            return 10
        else:
            return 0

def verify_id(Id_string):
    """Verify the Id and public-domain status for the book"""
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id == None:
        return 1
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" % (redis_key3, 'gal', Id_string)
    library_url_key = book_key + ":library_url"
    url = "http://gallica.bnf.fr/%s" % (Id)
    try:
        r = requests.get(url)
    except:
        return 10
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    else:
        source = r.text
        soup = BeautifulSoup(source)
        rights = OAI_metadata_content("DC.rights", soup, burst=True)
        for i in rights:
            if not i:    # skip empty rights entries
                continue
            if i.strip().lower().encode('utf-8') in ('domaine public', 'public domain'):
                return 0
        return 2

def metadata(Id):
    """Return book information and meta-data"""
    Id = get_id_from_string(Id)
    r = requests.get(
        "http://catalog.hathitrust.org/api/volumes/full/recordnumber/%s.json" % Id)
    if r.status_code != 200:
        return 10
    else:
        book_info = r.json()
        items = book_info["items"]
        records = book_info["records"][book_info['records'].keys()[0]]
        if items == []:
            return 10
        if items[0]["usRightsString"] != "Full view":
            return 2
        xml = records["marc-xml"]
        soup = BeautifulSoup(xml)
        htid = items[0]["htid"]
        return dict(
            image_url="http://babel.hathitrust.org/cgi/imgsrv/image?id=%s;seq=1;width=300" % htid,
            thumbnail_url="http://babel.hathitrust.org/cgi/imgsrv/image?id=%s;seq=1;width=128" % htid,
            printType="BOOK",
            title=records['titles'][0],
            subtitle="",
            author=extract_author(soup),
            publisher=extract_publisher(soup),
            publishedDate=records["publishDates"][0] if "publishDates" in records.keys() else "",
            description=records['titles'][0] if "titles" in records.keys() else "",
            infoLink=records["recordURL"] if "recordURL" in records.keys() else "",
            publicDomain=True if items[0]["rightsCode"] in ("pd", "pdus") else "",
            language=extract_language(soup),
            scanner="Hathitrust",
            sponser="HathiTrust")

def verify_id(Id_string):
    """Verify the Id and public-domain status for the book"""
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id == None:
        return 1
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" % (redis_key3, 'usp', Id_string)
    library_url_key = book_key + ":library_url"
    url = redis_py.get(library_url_key, True)
    url = get_absolute_url_of_book(url)
    try:
        r = requests.get(url)
    except:
        return 10
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    else:
        source = r.text
        soup = BeautifulSoup(source)
        #public_domain = OAI_metadata_content("DC.relation", soup)
        #if public_domain != "Domínio público".decode('utf-8') and public_domain != "":
        #    return 2
        #else:
        tld = extract_base_domain(url)
        if tld[-1:] == '/':
            tld = tld[:-1]
        pdf_url = get_pdf_link(tld, soup)
        if pdf_url == False:
            return 8
        return 0

def extract_total_pages(Id_raw):
    """Extract and return total pages in the book"""
    r = requests.get('http://gallica.bnf.fr/%s' % Id_raw)
    no = re.findall('Nombre total de vues : ([\d]+)', r.text)
    if len(no) > 0:
        return int(no[0])
    else:
        return None

def get_id_from_record_key(Id):
    """Extract and return htid associated with the book"""
    r = requests.get(
        "http://catalog.hathitrust.org/api/volumes/brief/recordnumber/%s.json" % Id)
    book_info = r.json()
    htid = book_info["items"][0]["htid"]
    return htid

def download_book(Id, id_for_key):
    """Download book pages from mdc.cbuc.cat and tar them to one file"""
    Id = get_id_from_string(Id)
    unique_id = re.search('(.+)_(\d+)', Id)
    collection, identifier = unique_id.group(1), unique_id.group(2)
    url = "http://mdc.cbuc.cat/cdm/compoundobject/collection/%s/id/%s/rec/" % (
        collection, identifier)
    r = requests.get(url)
    source = r.text
    soup = BeautifulSoup(source)
    file_type = 'pdf' if 'pdf' in get_meta('metadata_tipus', soup) else 'image'
    if file_type == 'pdf':
        tld = extract_base_domain(url)
        if tld[-1:] == '/':
            tld = tld[:-1]
        pdf_url = "http://mdc.cbuc.cat/utils/getfile/collection/%s/id/%s/filename/1.pdf" % (
            collection, identifier)
        pdf = requests.get(pdf_url, stream=True)
        output_file = "/data/scratch/BUB_downloads/bub_mdc_%s.pdf" % Id    ###
        store_output_file_name(Id, output_file)
        with open(output_file, 'wb') as f:
            for chunk in pdf.iter_content(1024):
                f.write(chunk)
        return 0
    total_pages = extract_total_pages(soup)
    start_page_no = int(identifier) - total_pages
    s = requests.Session()
    for page_no in range(0, total_pages):
        image_url = "http://mdc.cbuc.cat/utils/ajaxhelper/?CISOROOT=%s&CISOPTR=%s"\
            "&action=2&DMSCALE=100&DMWIDTH=5000&DMHEIGHT=5000&DMX=0&DMY=0&DMTEXT=&DMROTATE=0" % (
                collection, start_page_no + page_no)
        output_file = add_serial_number_to_name(
            "/data/scratch/BUB_downloads/mdc_%s_" % Id, page_no)
        status = download_image_to_file(image_url, output_file)
        print "Downloaded %s," % output_file
        if status == 1:
            return 1
    final_output_file = "./downloads/bub_mdc_%s_images.tar" % Id
    command = "tar -cf %s --directory=/data/scratch/BUB_downloads/ $(ls /data/scratch/BUB_downloads/mdc_%s_*| xargs -n1 basename)" % (
        final_output_file, Id)
    status = subprocess.check_call(command, shell=True)
    store_output_file_name(id_for_key, final_output_file)
    if status == 0:
        command = "rm /data/scratch/BUB_downloads/mdc_%s_*" % (Id)
        status = subprocess.check_call(command, shell=True)
    return 0

def extract_total_pages(Id):
    """Extract and return total pages in the book"""
    r = requests.get("http://babel.hathitrust.org/cgi/pt?id=%s" % Id)
    source = r.text
    soup = BeautifulSoup(source)
    last = soup.findAll('a', attrs={'id': "action-go-last"})
    if last != []:
        last_page_url = last[0]['href']
        last_page_no = re.search("seq=(\d+)", last_page_url)
        return int(last_page_no.group(1))

def get_record_key_from_id(Id):
    """Extract and return record key associated with the book-id"""
    r = requests.get(
        "http://catalog.hathitrust.org/api/volumes/brief/htid/%s.json" % Id)
    if r.status_code != 200:
        return ""
    info = r.json()
    record_key = info['records'].keys()
    if record_key:    # keys() gives a list; treat an empty list as "no record"
        return record_key[0]
    else:
        return ""

def download_book(Id, id_for_key):
    """Download the book PDF from the library URL stored in redis"""
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" % (redis_key3, 'usp', Id)
    library_url_key = book_key + ":library_url"
    url = redis_py.get(library_url_key, True)
    url = get_absolute_url_of_book(url)
    r = requests.get(url)
    source = r.text
    soup = BeautifulSoup(source)
    tld = extract_base_domain(url)
    if tld[-1:] == '/':
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    if pdf_url in ("", None):
        return 1
    pdf = requests.get(pdf_url, stream=True)
    output_file = "/data/scratch/BUB_downloads/bub_usp_%s.pdf" % Id    ###
    store_output_file_name(id_for_key, output_file)
    with open(output_file, 'wb') as f:
        for chunk in pdf.iter_content(1024):
            f.write(chunk)
    return 0

def download_image_to_file(image_url, output_file):
    """Download image from url"""
    image_url = reformat_content(image_url)
    r = requests.get(image_url, stream=True, verify=False)
    if r.status_code == 200:
        image_type = r.headers['content-type']
        if image_type == 'image/jpeg':
            image_ext = 'jpeg'
        elif image_type == 'image/png':
            image_ext = 'png'
        else:
            image_ext = ''    # unknown content-type: save without an extension
        output_file += image_ext
        with open(output_file, 'wb') as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)

def download_image_to_file(image_url, output_file):
    """Download image from url"""
    r = requests.get(image_url, stream=True)
    if r.status_code == 200:
        image_type = r.headers['content-type']
        if image_type == 'image/jpeg':
            image_ext = 'jpeg'
        elif image_type == 'image/png':
            image_ext = 'png'
        else:
            image_ext = ''    # unknown content-type: save without an extension
        output_file += image_ext
        with open(output_file, 'wb') as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
        verify_image(output_file)

def metadata(Id):
    """Return book information and meta-data"""
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" % (redis_key3, 'usp', Id)
    library_url_key = book_key + ":library_url"
    url = redis_py.get(library_url_key, True)
    url = get_absolute_url_of_book(url)
    try:
        r = requests.get(url)
    except:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    else:
        source = r.text
        if "Página não encontrada".decode('utf-8') in source:
            return 1
        soup = BeautifulSoup(source)
        #public_domain = OAI_metadata_content("DC.relation", soup)
        #if public_domain != "Domínio público".decode('utf-8') and public_domain != "":
        #    return 2
        thumbnail_url = extract_thumbnail_url(soup, url)
        return dict(
            image_url=thumbnail_url,
            thumbnail_url=thumbnail_url,
            printType="BOOK",
            title=OAI_metadata_content("DC.title", soup),
            subtitle="",
            author=OAI_metadata_content("DC.creator", soup),
            publisher=OAI_metadata_content("DC.publisher", soup),
            publishedDate=OAI_metadata_content("DCTERMS.issued", soup),
            description=OAI_metadata_content("DC.description", soup),
            infoLink=url,
            publicDomain=True,
            language=normalize_to_ascii(OAI_metadata_content("DC.language", soup)),
            scanner=extract_base_domain(url),
            sponser=extract_base_domain(url))

def download_book(Id, id_for_key):
    """Download book images from HathiTrust and tar them to one file"""
    s = requests.Session()
    Id_key = get_id_from_record_key(Id)
    r = requests.get("http://babel.hathitrust.org/cgi/pt?id=%s" % Id_key)
    total_pages = extract_total_pages(Id_key)
    for page_no in range(1, total_pages + 1):
        image_url = "https://babel.hathitrust.org/cgi/htd/volume/pageimage/%s/%s" % (
            Id_key, page_no)
        #output_file = "./downloads/ht_%s_%s." %(Id, page_no)
        output_file = add_serial_number_to_name(
            "/data/scratch/BUB_downloads/ht_%s_" % Id, page_no)
        status = download_image_to_file(image_url, output_file)
        if status == 1:
            return 1
    final_output_file = "./downloads/bub_ht_%s_images.tar" % Id
    command = "tar -cf %s --directory=/data/scratch/BUB_downloads/ $(ls /data/scratch/BUB_downloads/ht_%s_*| xargs -n1 basename)" % (
        final_output_file, Id)
    status = subprocess.check_call(command, shell=True)
    store_output_file_name(id_for_key, final_output_file)
    if status == 0:
        command = "rm /data/scratch/BUB_downloads/ht_%s_*" % (Id)
        status = subprocess.check_call(command, shell=True)
    return 0

def verify_id(Id_string):
    """Verify the Id and accessViewStatus(public-domain) for the book"""
    Id = get_id_from_string(Id_string)
    if Id == None:
        return 1
    try:
        r = requests.get(
            'https://www.googleapis.com/books/v1/volumes/%s?fields=accessInfo%%2Fviewability&key=%s'
            % (Id, key), headers={'referer': "tools.wmflabs.org/bub"})
    except:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code == 403:    # when GB Daily Quota(1000 requests) finished
        return 7
    if r.status_code != 200:
        return 10
    else:
        book_info = r.json()
        if book_info['accessInfo']['viewability'] != "ALL_PAGES":
            return 2
        else:
            return 0

def metadata(Id):
    """Return book information and meta-data"""
    Id = get_id_from_string(Id)
    unique_id = re.search('(.+)_(\d+)', Id)
    collection, identifier = unique_id.group(1), unique_id.group(2)
    url = "http://mdc.cbuc.cat/cdm/compoundobject/collection/%s/id/%s/rec/" % (
        collection, identifier)
    try:
        r = requests.get(url)
    except:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    else:
        source = r.text
        soup = BeautifulSoup(source)
        if 'Text' not in get_meta('metadata_object_type', soup):
            return 10
        return dict(
            image_url="http://mdc.cbuc.cat/utils/getthumbnail/collection/%s/id/%s" % (
                collection, identifier),
            thumbnail_url="http://mdc.cbuc.cat/utils/getthumbnail/collection/%s/id/%s" % (
                collection, identifier),
            printType="BOOK",
            title=get_meta('metadata_object_title', soup),
            subtitle="",
            author=get_meta('metadata_object_creato', soup),
            publisher=get_meta('metadata_object_publis', soup),
            publishedDate=get_meta('metadata_object_date', soup),
            description="",
            infoLink=url,
            publicDomain=True,
            language=get_meta('metadata_object_langua', soup),
            scanner="Digital Memory of Catalonia",
            sponser="Digital Memory of Catalonia")

def metadata(Id):
    """Return book information and meta-data"""
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" % (redis_key3, 'gal', Id)    # 'gal' prefix, matching verify_id
    Id_raw = get_id_from_string(Id, action='desanitize')
    Id_raw = Id_raw[:-1] if Id_raw[-1] == '/' else Id_raw
    library_url_key = book_key + ":library_url"
    url = "http://gallica.bnf.fr/%s" % (Id_raw)
    try:
        r = requests.get(url)
    except:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    else:
        source = r.text
        soup = BeautifulSoup(source)
        thumbnail_url = 'http://gallica.bnf.fr/%s.thumbnail' % Id_raw
        return dict(
            image_url=thumbnail_url,
            thumbnail_url=thumbnail_url,
            printType="BOOK",
            title=OAI_metadata_content("DC.title", soup),
            subtitle="",
            author=OAI_metadata_content("DC.creator", soup),
            publisher=OAI_metadata_content("DC.publisher", soup),
            publishedDate=OAI_metadata_content("DC.date", soup),
            description=OAI_metadata_content("DC.description", soup),
            infoLink=url,
            publicDomain=True,
            language=normalize_to_ascii(get_lang(source)),
            scanner="Gallica",
            sponser="Gallica")

def verify_id(Id_string):
    """Verify the Id and public-domain status for the book"""
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id == None:
        return 1
    try:
        r = requests.get("http://www.brasiliana.usp.br/bbd/handle/%s" % Id)
    except:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    else:
        source = r.text
        if "Página não encontrada".decode('utf-8') in source:
            return 1
        soup = BeautifulSoup(source)
        if OAI_metadata_content("DC.relation", soup) != "Domínio público".decode('utf-8'):
            return 2
        else:
            return 0

def verify_id(Id_string):
    """Verify the Id and public-domain status for the book"""
    Id = get_id_from_string(Id_string)
    if Id == None:
        return 1
    try:
        r = requests.get(
            "http://catalog.hathitrust.org/api/volumes/full/recordnumber/%s.json" % Id)
    except:
        return 10
    if r.status_code != 200:
        return 10
    else:
        book_info = r.json()
        items = book_info["items"]
        if items == []:
            return 10
        if items[0]["usRightsString"] != "Full view":
            return 2
        if """<subfield code="s">google</subfield>""" in str(book_info):
            return 9    # MARC record says the scan source is Google
        else:
            return 0

def metadata(Id):
    """Return book information and meta-data"""
    Id = get_id_from_string(Id, 'desanitize')
    url = "http://www.brasiliana.usp.br/bbd/handle/%s" % Id
    try:
        r = requests.get(url)
    except:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    else:
        source = r.text
        if "Página não encontrada".decode('utf-8') in source:
            return 1
        soup = BeautifulSoup(source)
        if OAI_metadata_content("DC.relation", soup) != "Domínio público".decode('utf-8'):
            return 2
        png_url = "%s?url_ver=Z39.88-2004&rft_id=%s&svc_id=info:lanl-repo/svc/getRegion&svc_val_fmt=info:ofi/fmt:kev:mtx:pdf&svc.format=image/png&svc.clayer=0&svc.level=" % (
            extract_serverURL(source), extract_item_number(source))
        return dict(
            image_url=png_url + "1",
            thumbnail_url=png_url + "0",
            printType="BOOK",
            title=OAI_metadata_content("DC.title", soup),
            subtitle="",
            author=OAI_metadata_content("DC.creator", soup),
            publisher=OAI_metadata_content("DC.publisher", soup),
            publishedDate=OAI_metadata_content("DCTERMS.issued", soup),
            description=OAI_metadata_content("DC.description", soup),
            infoLink=url,
            publicDomain=True,
            language=normalize_to_ascii(OAI_metadata_content("DC.language", soup)),
            scanner="brasiliana.usp.br",
            sponser="brasiliana.usp.br")

def download_book(url, id_for_key):
    """Download book images (or a PDF) from a user-supplied link and tar them to one file"""
    (link_type, link) = get_link_and_type(url)
    Id = hashlib.md5(url).hexdigest()
    if link_type == 'wildcard':
        no_of_pages = int(link[2]) + 1 - int(link[1])
        for page_no in range(0, no_of_pages):
            image_url = re.sub('\(\*\)',
                str(int(link[1]) + page_no).zfill(len(link[1])), link[0])
            output_file = add_serial_number_to_name(
                "/data/scratch/BUB_downloads/man_%s_" % Id, page_no + 1)
            download_image_to_file(image_url, output_file)
        final_output_file = "/data/scratch/BUB_downloads/bub_man_%s_images.tar" % Id
        command = "tar -cf %s --directory=/data/scratch/BUB_downloads/ $(ls /data/scratch/BUB_downloads/man_%s_*| xargs -n1 basename)" % (
            final_output_file, Id)
        status = subprocess.check_call(command, shell=True)
        if status == 0:
            command = "rm /data/scratch/BUB_downloads/man_%s_*" % (Id)
            status = subprocess.check_call(command, shell=True)
    elif link_type == 'pdf':
        pdf = requests.get(link, stream=True)
        final_output_file = "/data/scratch/BUB_downloads/bub_man_%s.pdf" % Id
        with open(final_output_file, 'wb') as f:
            for chunk in pdf.iter_content(1024):
                f.write(chunk)
    store_output_file_name(Id, final_output_file)
    return 0

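# A worked example of the wildcard expansion above, under the assumption that
# get_link_and_type() returns ('wildcard', (url_template, first_page, last_page)).
# A hypothetical template "http://example.org/p(*).jpg" with range "001".."012"
# expands to p001.jpg, p002.jpg, ..., p012.jpg (zero-padded to the width of the
# first page number):
link = ("http://example.org/p(*).jpg", "001", "012")    # hypothetical input
for page_no in range(int(link[2]) + 1 - int(link[1])):
    print re.sub('\(\*\)', str(int(link[1]) + page_no).zfill(len(link[1])), link[0])
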
def check_in_IA(self, library, Id):
    """Check if book present in IA. Return False if not present else Return Identifier(s)"""
    url = """http://archive.org/advancedsearch.php?q=title%%3A(%s)+AND+mediatype%%3A(texts)&fl[]=creator&fl[]=source&fl[]=date&fl[]=identifier&fl[]=language&fl[]=publisher&fl[]=title&sort[]=&sort[]=&sort[]=&rows=20&page=1&output=json""" % quote_plus(
        re.sub(r"""[!#\n|^\\\"~()\[\]:\-/]""", '', self.title)[:330])
    r = requests.get(url)
    ia_info = r.json()
    numFound = int(ia_info['response']['numFound'])
    if numFound > 20:
        numFound = 20
    if numFound == 0:
        ia_response_key = self.book_key + ":ia_response"
        redis_py.set(ia_response_key, 0, True)
        return False
    match_list = []
    year_present = 0
    self.magazine = 0
    for i in range(numFound):
        match_score = 0
        creator_present = 0
        doc = ia_info['response']['docs'][i]    # local alias for the i-th result
        #print "index: %s\n" %i
        if 'source' in doc.keys() and self.Id not in (None, ""):
            source = doc['source'].encode("utf-8")
            #print "source: %s" %source
            if self.Id in source:
                match_score += 20
        if 'title' in doc.keys() and self.title not in (None, ""):
            title = doc['title'].encode("utf-8")
            title_similarity = difflib.SequenceMatcher(
                None, self.title.lower(), title.lower()).ratio()
            match_score += 50 * title_similarity
        if 'date' in doc.keys():
            if parser.parse(doc['date']).year == self.year:
                if self.printType != 'MAGAZINE':
                    match_score += 25
                    year_present = 1
                else:
                    self.magazine = 1
                    if parser.parse(doc['date']).month == self.month:
                        if parser.parse(doc['date']).day == self.day:
                            match_score += 25
        if 'creator' in doc.keys() and self.author not in (None, ""):
            creator = doc['creator'][0].encode("utf-8")
            creator_similarity = difflib.SequenceMatcher(
                None, self.author.lower(), creator.lower()).ratio()
            match_score += 12 * creator_similarity
            creator_present = 1
        if 'publisher' in doc.keys() and self.publisher not in (None, ""):
            publisher = doc['publisher'][0].encode("utf-8")
            publisher_similarity = difflib.SequenceMatcher(
                None, self.publisher.lower(), publisher.lower()).ratio()
            match_score += 6 * publisher_similarity
        if 'language' in doc.keys() and self.language not in (None, ""):
            l = doc['language'][0].encode("utf-8")
            if len(l) < 5:
                try:
                    language = lang_code(l)
                except:
                    language = l
            else:
                language = l
            lang_similarity = difflib.SequenceMatcher(
                None, self.language.lower(), language.lower()).ratio()
            match_score += 3 * lang_similarity
        if self.magazine == 0:
            threshold_score = (0.7) * 80 + (25) * year_present + (1 - year_present) * (
                (0.5) * 12 * creator_present + (0.7) * 6 * (1 - creator_present))
        else:
            threshold_score = (0.7) * 80 + 25
        if match_score >= threshold_score:    # accept only results that clear the threshold
            match_list.append(doc['identifier'])
    if match_list != []:
        ia_response_key = self.book_key + ":ia_response"
        redis_py.set(ia_response_key, 1, True)
        return match_list
    ia_response_key = self.book_key + ":ia_response"
    redis_py.set(ia_response_key, 0, True)
    return False

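# The scoring above weights title similarity at 50, a source-id match at 20,
# publication date at 25, creator at 12, publisher at 6 and language at 3.
# A standalone sketch of the acceptance threshold (a hypothetical helper; the
# flags are passed explicitly here rather than read from the loop state):
def match_threshold(is_magazine, year_present, creator_present):
    # Magazines must match the full date; otherwise the creator terms trade
    # off against a missing year, exactly as in the expression above.
    if is_magazine:
        return 0.7 * 80 + 25
    return (0.7 * 80 + 25 * year_present
            + (1 - year_present) * (0.5 * 12 * creator_present
                                    + 0.7 * 6 * (1 - creator_present)))
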
def get_ia_metadata(ia_identifier):
    """Return the Internet Archive metadata record for an identifier"""
    r = requests.get('http://archive.org/metadata/%s' % (ia_identifier)).json()
    return r

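# Each library module above exposes the same three entry points: verify_id()
# returns a status code (0 = usable, 1 = bad id, 2 = not public domain or not
# fully viewable, 7 = GB quota exhausted, 10 = server error, plus the
# module-specific codes 8 and 9 above), metadata() returns a dict or one of the
# same codes, and download_book() fetches the scan. A minimal driver sketch of
# how they chain (hypothetical glue; the real dispatcher lives elsewhere in BUB):
def process_request(module, Id_string, id_for_key):
    status = module.verify_id(Id_string)
    if status != 0:
        return status    # propagate the error code
    info = module.metadata(Id_string)
    if not isinstance(info, dict):
        return info      # metadata() returned an error code
    return module.download_book(Id_string, id_for_key)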