Beispiel #1
0
Datei: br.py Projekt: JaonLin/BUB
def download_book(Id, id_for_key):
    """Fetch the Brasiliana page for Id, locate its PDF and save it to disk."""
    raw_id = sanitize_id_to_url(Id, action = 'desanitize')
    page = requests.get("http://www.brasiliana.usp.br/bbd/handle/%s" %raw_id)
    pdf_link = extract_downloadURL(page.text)
    response = requests.get(pdf_link, stream=True)
    destination = "/data/scratch/BUB_downloads/bub_br_%s.pdf" %Id ###
    store_output_file_name(id_for_key, destination)
    # Stream the PDF to disk in 1 KiB chunks to keep memory flat.
    with open(destination, 'wb') as handle:
        for block in response.iter_content(1024):
            handle.write(block)
    return 0
Beispiel #2
0
Datei: gb.py Projekt: JaonLin/BUB
def metadata(Id):
    """Return book information and meta-data.

    Returns an int error code (1 = not found, 7 = API quota exhausted,
    10 = other HTTP error, 2 = not fully viewable) or a dict of
    normalized metadata fields.
    """
    Id = get_id_from_string(Id)
    url = 'https://www.googleapis.com/books/v1/volumes/%s?key=%s' %(Id, key)
    r = requests.get(url, headers={'referer': "tools.wmflabs.org/bub"} )
    if r.status_code == 404:
        return 1
    if r.status_code == 403:
        return 7
    if r.status_code != 200:
        return 10
    book_info = r.json()
    if book_info['accessInfo']['viewability'] != "ALL_PAGES":
        return 2
    volume = book_info['volumeInfo']
    access = book_info['accessInfo']
    # FIX: the original indexed volume['imageLinks'] directly, raising
    # KeyError when the API omits it; .get() makes every field optional.
    image_links = volume.get('imageLinks', {})
    # FIX: an 'authors' key holding an empty list crashed the original
    # with IndexError on authors[0].
    authors = volume.get('authors')
    return dict(
        image_url = image_links.get('small', ""),
        thumbnail_url = image_links.get('thumbnail', ""),
        printType = volume.get('printType', ""),
        title = volume.get('title', ""),
        subtitle = volume.get('subtitle', ""),
        author = authors[0] if authors else "",
        publisher = volume.get('publisher', ""),
        publishedDate = volume.get('publishedDate', ""),
        # Strip HTML tags from the description before returning it.
        description = re.sub('<[^<]+?>', '', volume['description']) if 'description' in volume else "",
        infoLink = volume.get('infoLink', ""),
        publicDomain = access.get('publicDomain', ""),
        language = volume.get('language', ""),
        scanner = "google",
        sponser = "Google"
    )
Beispiel #3
0
def verify_id(Id_string):
    """Verify the Id and public-domain status for the book"""
    Id = get_id_from_string(Id_string)
    if Id is None:
        return 1
    match = re.search('(.+)_(\d+)', Id)
    if match is None:
        return 1
    collection = match.group(1)
    identifier = match.group(2)
    url = ("http://mdc.cbuc.cat/cdm/compoundobject/collection/%s/id/%s/rec/"
           % (collection, identifier))
    try:
        response = requests.get(url)
    except:
        return 1
    if response.status_code == 404:
        return 1
    if response.status_code != 200:
        return 10
    soup = BeautifulSoup(response.text)
    # Only objects typed 'Text' are accepted as books.
    if 'Text' in get_meta('metadata_object_type', soup):
        return 0
    return 10
Beispiel #4
0
def verify_id(Id_string): 
    """Verify the Id and public-domain status for the book.

    Returns 0 when Gallica lists a public-domain rights statement,
    1 for a bad/unknown Id, 2 when not public domain, 10 on errors.
    """
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id == None:
        return 1
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" %(redis_key3, 'gal', Id_string)
    library_url_key = book_key + ":library_url"
    url = "http://gallica.bnf.fr/%s" %(Id)
    try:
        r = requests.get(url)
    except:
        return 10
    if r.status_code == 404:
        # FIX: this line was tab-indented in the original.
        return 1
    if r.status_code != 200:
        return 10
    soup = BeautifulSoup(r.text)
    rights = OAI_metadata_content("DC.rights", soup, burst=True)
    for entry in rights:
        # FIX: the original tested `if not rights` inside this loop,
        # which can never be true once the loop runs; the intent was
        # to skip empty *entries*.
        if not entry:
            continue
        if entry.strip().lower().encode('utf-8') in ('domaine public', 'public domain'):
            return 0
    return 2
Beispiel #5
0
Datei: ht.py Projekt: JaonLin/BUB
def metadata(Id):
    """Return book information and meta-data.

    Returns 10 on HTTP/empty-result errors, 2 when the book is not
    fully viewable, otherwise a dict of normalized metadata fields.
    """
    Id = get_id_from_string(Id)
    r = requests.get("http://catalog.hathitrust.org/api/volumes/full/recordnumber/%s.json" %Id)
    if r.status_code != 200:
        return 10
    book_info = r.json()
    items = book_info["items"]
    # FIX: check items *before* touching records — the original indexed
    # records.keys()[0] first and crashed on an empty result set.
    if items == []:
        return 10
    if items[0]["usRightsString"] != "Full view":
        return 2
    record_keys = list(book_info["records"].keys())
    if not record_keys:
        return 10
    records = book_info["records"][record_keys[0]]
    xml = records["marc-xml"]
    soup = BeautifulSoup(xml)
    htid = items[0]["htid"]
    return dict(
        image_url = "http://babel.hathitrust.org/cgi/imgsrv/image?id=%s;seq=1;width=300" %htid,
        thumbnail_url = "http://babel.hathitrust.org/cgi/imgsrv/image?id=%s;seq=1;width=128" %htid,
        printType = "BOOK",
        title = records['titles'][0],
        subtitle = "",
        author = extract_author(soup),
        publisher = extract_publisher(soup),
        publishedDate = records["publishDates"][0] if "publishDates" in records.keys() else "",
        description = records['titles'][0] if "titles" in records.keys() else "",
        infoLink = records["recordURL"] if "recordURL" in records.keys() else "",
        publicDomain = True if items[0]["rightsCode"] in ("pd", "pdus") else "",
        language = extract_language(soup),
        scanner = "Hathitrust",
        sponser = "HathiTrust"
    )
Beispiel #6
0
def verify_id(Id_string): 
    """Verify the Id and availability of a PDF link for the book.

    Returns 0 when a PDF link is found, 1 for a bad Id or 404,
    8 when no PDF link exists, 10 on other errors.
    """
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id == None:
        return 1
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" %(redis_key3, 'usp', Id_string)
    library_url_key = book_key + ":library_url"
    # The library URL was stored earlier under this Redis key.
    url = redis_py.get(library_url_key, True)
    url = get_absolute_url_of_book(url)
    try:
        r = requests.get(url)
    except:
        return 10
    if r.status_code == 404:
        # FIX: this line was tab-indented in the original.
        return 1
    if r.status_code != 200:
        return 10
    soup = BeautifulSoup(r.text)
    tld = extract_base_domain(url)
    if tld[-1:] == '/':
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    if pdf_url == False:
        return 8
    return 0
Beispiel #7
0
def extract_total_pages(Id_raw):
    """Scrape the page count ("Nombre total de vues") from a Gallica page."""
    response = requests.get('http://gallica.bnf.fr/%s' %Id_raw)
    matches = re.findall('Nombre total de vues : ([\d]+)',  response.text)
    if not matches:
        return None
    return int(matches[0])
Beispiel #8
0
def get_id_from_record_key(Id):
    """Extract and return htid associated with the book"""
    url = ("http://catalog.hathitrust.org/api/volumes/brief/recordnumber/%s.json"
           % Id)
    book_info = requests.get(url).json()
    # The first item's HathiTrust volume id identifies the scan.
    return book_info["items"][0]["htid"]
Beispiel #9
0
def download_book(Id, id_for_key):
    """Download book images from HathiTrust and tar them to one file"""
    # NOTE(review): despite the docstring, this downloads from mdc.cbuc.cat
    # (Digital Memory of Catalonia); the docstring looks copied from ht.py.
    Id = get_id_from_string(Id)
    # Id has the form "<collection>_<numeric identifier>".
    unique_id = re.search('(.+)_(\d+)', Id)
    collection, identifier = unique_id.group(1), unique_id.group(2)
    url = "http://mdc.cbuc.cat/cdm/compoundobject/collection/%s/id/%s/rec/" % (
        collection, identifier)
    r = requests.get(url)
    source = r.text
    soup = BeautifulSoup(source)
    # The object page declares whether the book is served as a single PDF
    # or as individual page images.
    file_type = 'pdf' if 'pdf' in get_meta('metadata_tipus', soup) else 'image'
    if file_type == 'pdf':
        tld = extract_base_domain(url)
        if tld[-1:] == '/':
            tld = tld[:-1]
        pdf_url = "http://mdc.cbuc.cat/utils/getfile/collection/%s/id/%s/filename/1.pdf" % (
            collection, identifier)
        # Stream the PDF to disk in 1 KiB chunks.
        pdf = requests.get(pdf_url, stream=True)
        output_file = "/data/scratch/BUB_downloads/bub_mdc_%s.pdf" % Id  ###
        store_output_file_name(Id, output_file)
        with open(output_file, 'wb') as f:
            for chunk in pdf.iter_content(1024):
                f.write(chunk)
        return 0

    # Image case: page image ids are numbered backwards from the compound
    # object's identifier, so the first page is identifier - total_pages.
    total_pages = extract_total_pages(soup)
    start_page_no = int(identifier) - total_pages
    s = requests.Session()
    for page_no in range(0, total_pages):
        image_url = "http://mdc.cbuc.cat/utils/ajaxhelper/?CISOROOT=%s&CISOPTR=%s"\
        "&action=2&DMSCALE=100&DMWIDTH=5000&DMHEIGHT=5000&DMX=0&DMY=0&DMTEXT=&DMROTATE=0" %(collection, start_page_no + page_no)
        output_file = add_serial_number_to_name(
            "/data/scratch/BUB_downloads/mdc_%s_" % Id, page_no)
        status = download_image_to_file(image_url, output_file)
        print "Downloaded %s," % output_file
        if status == 1:
            return 1
    # Bundle all downloaded page images into one tar, then clean up.
    final_output_file = "./downloads/bub_mdc_%s_images.tar" % Id
    command = "tar -cf %s --directory=/data/scratch/BUB_downloads/ $(ls /data/scratch/BUB_downloads/mdc_%s_*| xargs -n1 basename)" % (
        final_output_file, Id)
    status = subprocess.check_call(command, shell=True)
    store_output_file_name(id_for_key, final_output_file)
    if status == 0:
        command = "rm /data/scratch/BUB_downloads/mdc_%s_*" % (Id)
        status = subprocess.check_call(command, shell=True)
    return 0
Beispiel #10
0
def extract_total_pages(Id):
    """Extract and return total pages in the book"""
    page = requests.get("http://babel.hathitrust.org/cgi/pt?id=%s" % Id)
    soup = BeautifulSoup(page.text)
    anchors = soup.findAll('a', attrs={'id': "action-go-last"})
    if not anchors:
        return None
    # The "go to last page" link carries the final sequence number.
    match = re.search("seq=(\d+)", anchors[0]['href'])
    return int(match.group(1))
Beispiel #11
0
Datei: ht.py Projekt: JaonLin/BUB
def extract_total_pages(Id):
    """Extract and return total pages in the book"""
    html = requests.get("http://babel.hathitrust.org/cgi/pt?id=%s" %Id).text
    document = BeautifulSoup(html)
    links = document.findAll('a', attrs={'id':"action-go-last"})
    if links != []:
        # The "go to last page" anchor encodes the final sequence number.
        seq = re.search("seq=(\d+)", links[0]['href'])
        return int(seq.group(1))
Beispiel #12
0
def download_book(Id, id_for_key):  
    """Download book images from HathiTrust and tar them to one file"""   
    Id = get_id_from_string(Id)
    unique_id = re.search('(.+)_(\d+)', Id)
    collection, identifier = unique_id.group(1), unique_id.group(2)
    url = "http://mdc.cbuc.cat/cdm/compoundobject/collection/%s/id/%s/rec/" %(collection, identifier)  
    r = requests.get(url)
    source = r.text
    soup = BeautifulSoup(source)  
    file_type = 'pdf' if 'pdf' in get_meta('metadata_tipus', soup) else 'image'
    if file_type == 'pdf':
        tld = extract_base_domain(url)
        if tld[-1:] == '/':
            tld = tld[:-1]
        pdf_url = "http://mdc.cbuc.cat/utils/getfile/collection/%s/id/%s/filename/1.pdf" %(collection, identifier)
        pdf = requests.get(pdf_url, stream=True)
        output_file = "/data/scratch/BUB_downloads/bub_mdc_%s.pdf" %Id ###
        store_output_file_name(Id, output_file) 
        with open(output_file, 'wb') as f:
            for chunk in pdf.iter_content(1024):  
                f.write(chunk)  
        return 0    
    
    total_pages = extract_total_pages(soup)
    start_page_no = int(identifier)-total_pages
    s = requests.Session()
    for page_no in range(0, total_pages):
        image_url = "http://mdc.cbuc.cat/utils/ajaxhelper/?CISOROOT=%s&CISOPTR=%s"\
        "&action=2&DMSCALE=100&DMWIDTH=5000&DMHEIGHT=5000&DMX=0&DMY=0&DMTEXT=&DMROTATE=0" %(collection, start_page_no + page_no)
        output_file =  add_serial_number_to_name("/data/scratch/BUB_downloads/mdc_%s_" %Id, page_no)
        status = download_image_to_file(image_url, output_file)
        print "Downloaded %s,"%output_file
	if status == 1:
	    return 1
    final_output_file = "./downloads/bub_mdc_%s_images.tar" %Id
    command = "tar -cf %s --directory=/data/scratch/BUB_downloads/ $(ls /data/scratch/BUB_downloads/mdc_%s_*| xargs -n1 basename)" %(final_output_file, Id)
    status = subprocess.check_call(command, shell=True)
    store_output_file_name(id_for_key, final_output_file)
    if status == 0:
        command = "rm /data/scratch/BUB_downloads/mdc_%s_*" %(Id)
        status = subprocess.check_call(command, shell=True)
    return 0
Beispiel #13
0
Datei: ht.py Projekt: JaonLin/BUB
def get_record_key_from_id(Id):
    """Extract and return record key associated with the book-id.

    Returns "" when the lookup fails or yields no records.
    """
    r = requests.get("http://catalog.hathitrust.org/api/volumes/brief/htid/%s.json" %Id)
    if r.status_code != 200:
        return ""
    info = r.json()
    # FIX: the original compared the keys *list* against (None, ""),
    # which is always true, so an empty 'records' dict crashed with
    # IndexError on record_key[0]. Test emptiness directly instead.
    record_keys = list(info['records'].keys())
    if record_keys:
        return record_keys[0]
    return ""
Beispiel #14
0
def get_record_key_from_id(Id):
    """Extract and return record key associated with the book-id.

    Returns "" when the lookup fails or yields no records.
    """
    r = requests.get(
        "http://catalog.hathitrust.org/api/volumes/brief/htid/%s.json" % Id)
    if r.status_code != 200:
        return ""
    info = r.json()
    # FIX: the original compared the keys *list* against (None, ""),
    # which is always true, so an empty 'records' dict crashed with
    # IndexError. Test for emptiness directly.
    record_keys = list(info['records'].keys())
    return record_keys[0] if record_keys else ""
Beispiel #15
0
def download_book(Id, id_for_key): 
    """Download the USP book PDF pointed to by the stored library URL."""
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" %(redis_key3, 'usp', Id)
    library_url_key = book_key + ":library_url"
    url = get_absolute_url_of_book(redis_py.get(library_url_key, True))
    page = requests.get(url)
    soup = BeautifulSoup(page.text)
    tld = extract_base_domain(url)
    if tld[-1:] == '/':
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    if pdf_url in ("", None):
        return 1
    pdf = requests.get(pdf_url, stream=True)
    output_file = "/data/scratch/BUB_downloads/bub_usp_%s.pdf" %Id ###
    store_output_file_name(id_for_key, output_file)
    # Stream the PDF to disk in 1 KiB chunks.
    with open(output_file, 'wb') as handle:
        for chunk in pdf.iter_content(1024):
            handle.write(chunk)
    return 0
Beispiel #16
0
Datei: gb.py Projekt: nemobis/BUB
def download_image_to_file(image_url, output_file):
    """Download image from url, appending an extension from its content-type.

    FIX: in the original, any content-type other than image/jpeg or
    image/png left image_ext unassigned -> UnboundLocalError.
    """
    image_url = reformat_content(image_url)
    r = requests.get(image_url, stream=True, verify=False)
    if r.status_code == 200:
        image_type = r.headers['content-type']
        if image_type == 'image/jpeg':
            image_ext = 'jpeg'
        elif image_type == 'image/png':
            image_ext = 'png'
        else:
            # Fall back to the MIME subtype (e.g. 'gif' from 'image/gif').
            image_ext = image_type.split('/')[-1]
        output_file += image_ext
        with open(output_file, 'wb') as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
Beispiel #17
0
def download_image_to_file(image_url, output_file):
    """Download image from url, appending an extension from its content-type.

    FIX: in the original, any content-type other than image/jpeg or
    image/png left image_ext unassigned -> UnboundLocalError.
    """
    r = requests.get(image_url, stream=True)
    if r.status_code == 200:
        image_type = r.headers['content-type']
        if image_type == 'image/jpeg':
            image_ext = 'jpeg'
        elif image_type == 'image/png':
            image_ext = 'png'
        else:
            # Fall back to the MIME subtype (e.g. 'gif' from 'image/gif').
            image_ext = image_type.split('/')[-1]
        output_file += image_ext
        with open(output_file, 'wb') as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
        verify_image(output_file)
Beispiel #18
0
Datei: gb.py Projekt: nemobis/BUB
def metadata(Id):
    """Return book information and meta-data.

    Returns an int error code (1 = not found, 7 = API quota exhausted,
    10 = other HTTP error, 3 = not fully viewable) or a dict of
    normalized metadata fields.
    """
    Id = get_id_from_string(Id)
    url = 'https://www.googleapis.com/books/v1/volumes/%s?key=%s' % (Id, key)
    r = requests.get(url, headers={'referer': "tools.wmflabs.org/bub"})
    if r.status_code == 404:
        return 1
    if r.status_code == 403:
        return 7
    if r.status_code != 200:
        return 10
    book_info = r.json()
    if book_info['accessInfo']['viewability'] != "ALL_PAGES":
        return 3
    volume = book_info['volumeInfo']
    access = book_info['accessInfo']
    # FIX: the original indexed volume['imageLinks'] directly, raising
    # KeyError when the API omits it; .get() makes every field optional.
    image_links = volume.get('imageLinks', {})
    # FIX: an 'authors' key holding an empty list crashed the original
    # with IndexError on authors[0].
    authors = volume.get('authors')
    return dict(
        image_url=image_links.get('small', ""),
        thumbnail_url=image_links.get('thumbnail', ""),
        printType=volume.get('printType', ""),
        title=volume.get('title', ""),
        subtitle=volume.get('subtitle', ""),
        author=authors[0] if authors else "",
        publisher=volume.get('publisher', ""),
        publishedDate=volume.get('publishedDate', ""),
        # Strip HTML tags from the description before returning it.
        description=re.sub('<[^<]+?>', '', volume['description'])
        if 'description' in volume else "",
        infoLink=volume.get('infoLink', ""),
        publicDomain=access.get('publicDomain', ""),
        language=volume.get('language', ""),
        scanner="google",
        sponser="Google")
Beispiel #19
0
def metadata(Id):
    """Return book information and meta-data for a USP (Brasiliana) book.

    Returns 1 for a bad Id/404, 10 on other HTTP errors, otherwise a
    dict of normalized metadata fields.
    """
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" %(redis_key3, 'usp', Id)
    library_url_key = book_key + ":library_url"
    # The library URL was stored earlier under this Redis key.
    url = redis_py.get(library_url_key, True)
    url = get_absolute_url_of_book(url)
    try:
        r = requests.get(url)
    except:
        return 1
    if r.status_code == 404:
        # FIX: this line was tab-indented in the original.
        return 1
    if r.status_code != 200:
        return 10
    source = r.text
    # The site answers 200 with a Portuguese "page not found" body.
    if "Página não encontrada".decode('utf-8') in source:
        return 1
    soup = BeautifulSoup(source)
    thumbnail_url = extract_thumbnail_url(soup, url)
    return dict(
        image_url = thumbnail_url,
        thumbnail_url = thumbnail_url,
        printType = "BOOK",
        title = OAI_metadata_content("DC.title", soup),
        subtitle = "",
        author = OAI_metadata_content("DC.creator", soup),
        publisher = OAI_metadata_content("DC.publisher", soup),
        publishedDate = OAI_metadata_content("DCTERMS.issued", soup),
        description = OAI_metadata_content("DC.description", soup),
        infoLink = url,
        publicDomain = True,
        language = normalize_to_ascii(OAI_metadata_content("DC.language", soup)),
        scanner = extract_base_domain(url),
        sponser = extract_base_domain(url)
    )
Beispiel #20
0
Datei: ht.py Projekt: JaonLin/BUB
def download_book(Id, id_for_key):  
    """Download book images from HathiTrust and tar them to one file.

    Returns 0 on success, 1 when a page-image download fails.
    """
    Id_key = get_id_from_record_key(Id)
    # NOTE(review): this response is unused; possibly intended to warm up
    # the server — kept to preserve behavior. TODO confirm and drop.
    r = requests.get("http://babel.hathitrust.org/cgi/pt?id=%s" %Id_key)
    total_pages = extract_total_pages(Id_key)
    for page_no in range(1, total_pages+1):
        image_url = "https://babel.hathitrust.org/cgi/htd/volume/pageimage/%s/%s"%(Id_key, page_no)
        output_file = add_serial_number_to_name("/data/scratch/BUB_downloads/ht_%s_" %Id, page_no)
        status = download_image_to_file(image_url, output_file)
        # FIX: these two lines were tab-indented in the original,
        # inconsistent with the 8-space body (fatal under Python 3).
        if status == 1:
            return 1
    # Bundle all page images into one tar, then clean up the originals.
    final_output_file = "./downloads/bub_ht_%s_images.tar" %Id
    command = "tar -cf %s --directory=/data/scratch/BUB_downloads/ $(ls /data/scratch/BUB_downloads/ht_%s_*| xargs -n1 basename)" %(final_output_file, Id)
    status = subprocess.check_call(command, shell=True)
    store_output_file_name(id_for_key, final_output_file)
    if status == 0:
        command = "rm /data/scratch/BUB_downloads/ht_%s_*" %(Id)
        status = subprocess.check_call(command, shell=True)
    return 0
Beispiel #21
0
Datei: gb.py Projekt: JaonLin/BUB
def verify_id(Id_string):
    """Verify the Id and accessViewStatus(public-domain) for the book.

    Returns 0 when fully viewable, 1 for a bad Id/404, 7 when the API
    quota is exhausted, 2 when not fully viewable, 10 on other errors.
    """
    Id = get_id_from_string(Id_string)
    if Id == None:
        return 1
    try:
        r = requests.get('https://www.googleapis.com/books/v1/volumes/%s?fields=accessInfo%%2Fviewability&key=%s' %(Id, key), headers={'referer': "tools.wmflabs.org/bub"} )
    except:
        return 1
    if r.status_code == 404:
        # FIX: this line was tab-indented in the original.
        return 1
    if r.status_code == 403:     #when GB Daily Quota(1000 requests) finished
        return 7
    if r.status_code != 200:
        return 10
    book_info = r.json()
    if book_info['accessInfo']['viewability'] != "ALL_PAGES":
        return 2
    return 0
Beispiel #22
0
def metadata(Id):
    """Return book information and meta-data"""
    Id = get_id_from_string(Id)
    # Id has the form "<collection>_<numeric identifier>".
    match = re.search('(.+)_(\d+)', Id)
    collection, identifier = match.group(1), match.group(2)
    url = ("http://mdc.cbuc.cat/cdm/compoundobject/collection/%s/id/%s/rec/"
           % (collection, identifier))
    try:
        response = requests.get(url)
    except:
        return 1
    if response.status_code == 404:
        return 1
    if response.status_code != 200:
        return 10
    soup = BeautifulSoup(response.text)
    # Only objects typed 'Text' are accepted as books.
    if 'Text' not in get_meta('metadata_object_type', soup):
        return 10
    thumb = ("http://mdc.cbuc.cat/utils/getthumbnail/collection/%s/id/%s"
             % (collection, identifier))
    return dict(
        image_url=thumb,
        thumbnail_url=thumb,
        printType="BOOK",
        title=get_meta('metadata_object_title', soup),
        subtitle="",
        author=get_meta('metadata_object_creato', soup),
        publisher=get_meta('metadata_object_publis', soup),
        publishedDate=get_meta('metadata_object_date', soup),
        description="",
        infoLink=url,
        publicDomain=True,
        language=get_meta('metadata_object_langua', soup),
        scanner="Digital Memory of Catalonia",
        sponser="Digital Memory of Catalonia")
Beispiel #23
0
def metadata(Id):
    """Return book information and meta-data for a Gallica book.

    Returns 1 for a bad Id/404, 10 on other HTTP errors, otherwise a
    dict of normalized metadata fields.
    """
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" %(redis_key3, 'usp', Id)
    Id_raw = get_id_from_string(Id, action = 'desanitize')
    # Normalize away a trailing slash before building URLs.
    Id_raw = Id_raw[:-1] if Id_raw[-1] == '/' else Id_raw
    url = "http://gallica.bnf.fr/%s" %(Id_raw)
    try:
        r = requests.get(url)
    except:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    source = r.text
    # FIX: the original parsed the page twice with BeautifulSoup
    # (duplicate source/soup assignments); once is enough.
    soup = BeautifulSoup(source)
    thumbnail_url = 'http://gallica.bnf.fr/%s.thumbnail' %Id_raw
    return dict(
        image_url = thumbnail_url,
        thumbnail_url = thumbnail_url,
        printType = "BOOK",
        title = OAI_metadata_content("DC.title", soup),
        subtitle = "",
        author = OAI_metadata_content("DC.creator", soup),
        publisher = OAI_metadata_content("DC.publisher", soup),
        publishedDate = OAI_metadata_content("DC.date", soup),
        description = OAI_metadata_content("DC.description", soup),
        infoLink = url,
        publicDomain = True,
        language = normalize_to_ascii(get_lang(source)),
        scanner = "Gallica",
        sponser = "Gallica"
    )
Beispiel #24
0
def metadata(Id):
    """Return book information and meta-data.

    Returns 10 on HTTP/empty-result errors, 2 when the book is not
    fully viewable, otherwise a dict of normalized metadata fields.
    """
    Id = get_id_from_string(Id)
    r = requests.get(
        "http://catalog.hathitrust.org/api/volumes/full/recordnumber/%s.json" %
        Id)
    if r.status_code != 200:
        return 10
    book_info = r.json()
    items = book_info["items"]
    # FIX: check items *before* touching records — the original indexed
    # records.keys()[0] first and crashed on an empty result set.
    if items == []:
        return 10
    if items[0]["usRightsString"] != "Full view":
        return 2
    record_keys = list(book_info["records"].keys())
    if not record_keys:
        return 10
    records = book_info["records"][record_keys[0]]
    xml = records["marc-xml"]
    soup = BeautifulSoup(xml)
    htid = items[0]["htid"]
    return dict(
        image_url=
        "http://babel.hathitrust.org/cgi/imgsrv/image?id=%s;seq=1;width=300" %
        htid,
        thumbnail_url=
        "http://babel.hathitrust.org/cgi/imgsrv/image?id=%s;seq=1;width=128" %
        htid,
        printType="BOOK",
        title=records['titles'][0],
        subtitle="",
        author=extract_author(soup),
        publisher=extract_publisher(soup),
        publishedDate=records["publishDates"][0]
        if "publishDates" in records.keys() else "",
        description=records['titles'][0] if "titles" in records.keys() else "",
        infoLink=records["recordURL"] if "recordURL" in records.keys() else "",
        publicDomain=True if items[0]["rightsCode"] in ("pd", "pdus") else "",
        language=extract_language(soup),
        scanner="Hathitrust",
        sponser="HathiTrust")
Beispiel #25
0
Datei: br.py Projekt: JaonLin/BUB
def verify_id(Id_string):
    """Verify the Id and public-domain status for the book.

    Returns 0 when public domain, 1 for a bad Id/404, 2 when not public
    domain, 10 on other errors.
    """
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id == None:
        return 1
    try:
        r = requests.get("http://www.brasiliana.usp.br/bbd/handle/%s" %Id)
    except:
        return 1
    if r.status_code == 404:
        # FIX: this line was tab-indented in the original.
        return 1
    if r.status_code != 200:
        return 10
    source = r.text
    # The site answers 200 with a Portuguese "page not found" body.
    if "Página não encontrada".decode('utf-8') in source:
        return 1
    soup = BeautifulSoup(source)
    if OAI_metadata_content("DC.relation", soup) != "Domínio público".decode('utf-8'):
        return 2
    return 0
Beispiel #26
0
Datei: ht.py Projekt: JaonLin/BUB
def verify_id(Id_string):
    """Verify the Id and public-domain status for the book"""
    Id = get_id_from_string(Id_string)
    if Id is None:
        return 1
    url = "http://catalog.hathitrust.org/api/volumes/full/recordnumber/%s.json" % Id
    try:
        response = requests.get(url)
    except:
        return 10
    if response.status_code != 200:
        return 10
    book_info = response.json()
    items = book_info["items"]
    if items == []:
        return 10
    if items[0]["usRightsString"] != "Full view":
        return 2
    # Books already digitized by Google are rejected with code 9.
    if """<subfield code="s">google</subfield>""" in str(book_info):
        return 9
    return 0
Beispiel #27
0
def metadata(Id):
    """Return book information and meta-data.

    Returns 1 for a bad Id/404, 10 on other errors or non-Text objects,
    otherwise a dict of normalized metadata fields.
    """
    Id = get_id_from_string(Id)
    # Id has the form "<collection>_<numeric identifier>".
    unique_id = re.search('(.+)_(\d+)', Id)
    collection, identifier = unique_id.group(1), unique_id.group(2)
    url = "http://mdc.cbuc.cat/cdm/compoundobject/collection/%s/id/%s/rec/" %(collection, identifier)
    try:
        r = requests.get(url)
    except:
        return 1
    if r.status_code == 404:
        # FIX: this line was tab-indented in the original.
        return 1
    if r.status_code != 200:
        return 10
    soup = BeautifulSoup(r.text)
    # Only objects typed 'Text' are accepted as books.
    if 'Text' not in get_meta('metadata_object_type', soup):
        return 10
    return dict(
        image_url = "http://mdc.cbuc.cat/utils/getthumbnail/collection/%s/id/%s" %(collection, identifier),
        thumbnail_url = "http://mdc.cbuc.cat/utils/getthumbnail/collection/%s/id/%s" %(collection, identifier),
        printType = "BOOK",
        title = get_meta('metadata_object_title', soup),
        subtitle = "",
        author = get_meta('metadata_object_creato', soup),
        publisher = get_meta('metadata_object_publis', soup),
        publishedDate = get_meta('metadata_object_date', soup),
        description = "",
        infoLink = url,
        publicDomain = True,
        language = get_meta('metadata_object_langua', soup),
        scanner = "Digital Memory of Catalonia",
        sponser = "Digital Memory of Catalonia"
    )
Beispiel #28
0
def verify_id(Id_string):
    """Verify the Id and public-domain status for the book.

    Returns 0 for a valid Text object, 1 for a bad Id/404, 10 otherwise.
    """
    Id = get_id_from_string(Id_string)
    if Id == None:
        return 1
    unique_id = re.search('(.+)_(\d+)', Id)
    if not unique_id:
        return 1
    collection, identifier = unique_id.group(1), unique_id.group(2)
    try:
        r = requests.get("http://mdc.cbuc.cat/cdm/compoundobject/collection/%s/id/%s/rec/" %(collection, identifier))
    except:
        return 1
    if r.status_code == 404:
        # FIX: this line was tab-indented in the original.
        return 1
    if r.status_code != 200:
        return 10
    soup = BeautifulSoup(r.text)
    # Only objects typed 'Text' are accepted as books.
    if 'Text' not in get_meta('metadata_object_type', soup):
        return 10
    return 0
Beispiel #29
0
def download_book(Id, id_for_key):
    """Download book images from HathiTrust and tar them to one file"""
    s = requests.Session()
    Id_key = get_id_from_record_key(Id)
    r = requests.get("http://babel.hathitrust.org/cgi/pt?id=%s" % Id_key)
    total_pages = extract_total_pages(Id_key)
    page_no = 1
    while page_no <= total_pages:
        image_url = ("https://babel.hathitrust.org/cgi/htd/volume/pageimage/%s/%s"
                     % (Id_key, page_no))
        target = add_serial_number_to_name(
            "/data/scratch/BUB_downloads/ht_%s_" % Id, page_no)
        if download_image_to_file(image_url, target) == 1:
            return 1
        page_no += 1
    # Bundle all page images into one tar, then clean up the originals.
    final_output_file = "./downloads/bub_ht_%s_images.tar" % Id
    command = "tar -cf %s --directory=/data/scratch/BUB_downloads/ $(ls /data/scratch/BUB_downloads/ht_%s_*| xargs -n1 basename)" % (
        final_output_file, Id)
    status = subprocess.check_call(command, shell=True)
    store_output_file_name(id_for_key, final_output_file)
    if status == 0:
        status = subprocess.check_call(
            "rm /data/scratch/BUB_downloads/ht_%s_*" % (Id), shell=True)
    return 0
Beispiel #30
0
def verify_id(Id_string):
    """Verify the Id and public-domain status for the book"""
    Id = get_id_from_string(Id_string)
    if Id is None:
        return 1
    api = ("http://catalog.hathitrust.org/api/volumes/full/recordnumber/%s.json"
           % Id)
    try:
        resp = requests.get(api)
    except:
        return 10
    if resp.status_code != 200:
        return 10
    info = resp.json()
    item_list = info["items"]
    if item_list == []:
        return 10
    if item_list[0]["usRightsString"] != "Full view":
        return 2
    # Books already digitized by Google are rejected with code 9.
    if """<subfield code="s">google</subfield>""" in str(info):
        return 9
    return 0
Beispiel #31
0
Datei: br.py Projekt: JaonLin/BUB
def metadata(Id):
    """Return book information and meta-data for a Brasiliana (USP) book.

    Returns 1 for a bad Id/404, 2 when not public domain, 10 on other
    errors, otherwise a dict of normalized metadata fields.
    """
    Id = get_id_from_string(Id, 'desanitize')
    url = "http://www.brasiliana.usp.br/bbd/handle/%s" %Id
    try:
        r = requests.get(url)
    except:
        return 1
    if r.status_code == 404:
        # FIX: this line was tab-indented in the original.
        return 1
    if r.status_code != 200:
        return 10
    source = r.text
    # The site answers 200 with a Portuguese "page not found" body.
    if "Página não encontrada".decode('utf-8') in source:
        return 1
    soup = BeautifulSoup(source)
    if OAI_metadata_content("DC.relation", soup) != "Domínio público".decode('utf-8'):
        return 2
    # NOTE(review): image-server URL; svc.level appears to select the
    # resolution (0 = thumbnail, 1 = larger) — TODO confirm.
    png_url = "%s?url_ver=Z39.88-2004&rft_id=%s&svc_id=info:lanl-repo/svc/getRegion&svc_val_fmt=info:ofi/fmt:kev:mtx:pdf&svc.format=image/png&svc.clayer=0&svc.level=" %(extract_serverURL(source), extract_item_number(source) )
    return dict(
        image_url = png_url + "1",
        thumbnail_url = png_url + "0",
        printType = "BOOK",
        title = OAI_metadata_content("DC.title", soup),
        subtitle = "",
        author = OAI_metadata_content("DC.creator", soup),
        publisher = OAI_metadata_content("DC.publisher", soup),
        publishedDate = OAI_metadata_content("DCTERMS.issued", soup),
        description = OAI_metadata_content("DC.description", soup),
        infoLink = url,
        publicDomain = True,
        language = normalize_to_ascii(OAI_metadata_content("DC.language", soup)),
        scanner = "brasiliana.usp.br",
        sponser = "brasiliana.usp.br"
    )
Beispiel #32
0
Datei: gb.py Projekt: nemobis/BUB
def verify_id(Id_string):
    """Verify the Id and accessViewStatus(public-domain) for the book.

    Returns 0 when the book is fully viewable, otherwise an error code:
    1 (bad id / request failed / not found), 3 (not fully viewable),
    7 (API quota exhausted), 10 (unexpected HTTP status).
    """
    book_id = get_id_from_string(Id_string)
    if book_id is None:
        return 1
    api_url = (
        'https://www.googleapis.com/books/v1/volumes/%s?fields=accessInfo%%2Fviewability&key=%s'
        % (book_id, key))
    try:
        response = requests.get(
            api_url, headers={'referer': "tools.wmflabs.org/bub"})
    except:
        # Any request failure is reported the same way as "not found".
        return 1
    if response.status_code == 404:
        return 1
    if response.status_code == 403:
        # Google Books daily quota (1000 requests) exhausted.
        return 7
    if response.status_code != 200:
        return 10
    viewability = response.json()['accessInfo']['viewability']
    # Only fully viewable (public-domain) books may be downloaded.
    return 0 if viewability == "ALL_PAGES" else 3
Beispiel #33
0
def download_book(url, id_for_key):
    """Download book images from GB and tar them to one file.

    ``url`` is either a wildcard image-URL template (pages are fetched one
    by one and tar'd together) or a direct PDF link (streamed to disk).
    The resulting file path is recorded via store_output_file_name() under
    an md5-derived key.  Returns 0 on success.

    Raises:
        ValueError: if get_link_and_type() reports an unsupported type.
    """
    (link_type, link) = get_link_and_type(url)
    # Stable, filesystem-safe key derived from the source URL.
    Id = hashlib.md5(url).hexdigest()
    if link_type == 'wildcard':
        # link = (template_with_(*), first_page, last_page); range inclusive.
        no_of_pages = int(link[2]) + 1 - int(link[1])
        for page_no in range(no_of_pages):
            # Substitute the (*) placeholder with the zero-padded page number.
            image_url = re.sub(r'\(\*\)',
                               str(int(link[1]) + page_no).zfill(len(link[1])),
                               link[0])
            output_file = add_serial_number_to_name(
                "/data/scratch/BUB_downloads/man_%s_" % Id, page_no + 1)
            download_image_to_file(image_url, output_file)
        final_output_file = "/data/scratch/BUB_downloads/bub_man_%s_images.tar" % Id
        # Id is an md5 hexdigest, so interpolating it into the shell
        # command cannot inject shell metacharacters.
        command = "tar -cf %s --directory=/data/scratch/BUB_downloads/ $(ls /data/scratch/BUB_downloads/man_%s_*| xargs -n1 basename)" % (final_output_file, Id)
        status = subprocess.check_call(command, shell=True)
        if status == 0:
            # Remove the per-page images once they are archived.
            command = "rm /data/scratch/BUB_downloads/man_%s_*" % Id
            status = subprocess.check_call(command, shell=True)
    elif link_type == 'pdf':
        pdf = requests.get(link, stream=True)
        final_output_file = "/data/scratch/BUB_downloads/bub_man_%s.pdf" % Id
        with open(final_output_file, 'wb') as f:
            for chunk in pdf.iter_content(1024):
                f.write(chunk)
    else:
        # Bug fix: previously an unknown link_type fell through to the
        # store_output_file_name() call with final_output_file unbound,
        # raising an opaque NameError.  Fail with a clear message instead.
        raise ValueError("Unsupported link type: %r" % link_type)
    store_output_file_name(Id, final_output_file)
    return 0
Beispiel #34
0
    def check_in_IA(self, library, Id):
        """Check if book present in IA.
        Return False if not present else Return Identifier(s)

        Queries the archive.org advanced-search API by title (capped at the
        first 20 hits) and computes a similarity score for each hit from
        title, date, creator, publisher and language.  Also records a 0/1
        "found in IA" flag in redis under ``<book_key>:ia_response``.

        NOTE(review): ``match_score`` and ``threshold_score`` are computed
        but never compared -- every hit's identifier is appended to the
        result unconditionally.  The append was presumably meant to be
        gated on ``match_score >= threshold_score``; confirm against
        callers before relying on the scores.  The ``library`` and ``Id``
        parameters are not used by this body.
        """
        
        # Strip characters that break the IA query syntax from the title and
        # truncate to 330 chars before URL-encoding it.
        url="""http://archive.org/advancedsearch.php?q=title%%3A(%s)+AND+mediatype%%3A(texts)&fl[]=creator&fl[]=source&fl[]=date&fl[]=identifier&fl[]=language&fl[]=publisher&fl[]=title&sort[]=&sort[]=&sort[]=&rows=20&page=1&output=json""" % quote_plus(re.sub(r"""[!#\n|^\\\"~()\[\]:\-/]""", '', self.title)[:330])  
        r = requests.get(url)
        ia_info = r.json()
        numFound = int(ia_info['response']['numFound'])
        if numFound > 20:
            numFound = 20
        if numFound == 0:
	    ia_response_key = self.book_key + ":ia_response"
	    redis_py.set(ia_response_key, 0, True)
            return False
        match_list = []
        year_present = 0
        self.magazine = 0
        for i in range(numFound):
            match_score = 0
            creator_present = 0 
	    #print "index: %s\n" %i
            # A matching source id is the strongest signal (+20).
            if 'source' in ia_info['response']['docs'][i].keys() and self.Id not in (None, ""):
                source = ia_info['response']['docs'][i]['source'].encode("utf-8")
		#print "source: %s" %source
                if self.Id in source:
                    match_score += 20 
            # Fuzzy title similarity contributes up to 50 points.
            if 'title' in ia_info['response']['docs'][i].keys() and self.title not in (None, ""):
                title = ia_info['response']['docs'][i]['title'].encode("utf-8")
                title_similarity = difflib.SequenceMatcher(None, self.title.lower(), title.lower()).ratio() 
                match_score += 50*title_similarity                 
            # Date: books need only the year; magazines must also match
            # month and day.
            if 'date' in ia_info['response']['docs'][i].keys():
                if parser.parse( ia_info['response']['docs'][i]['date'] ).year == self.year:
                    if self.printType != 'MAGAZINE':
                        match_score += 25
                        year_present = 1
                    else:
                        self.magazine = 1
                        if parser.parse( ia_info['response']['docs'][i]['date'] ).month == self.month:
                            if parser.parse( ia_info['response']['docs'][i]['date'] ).day == self.day:
                                match_score += 25
            # Creator similarity: up to 12 points.
            if 'creator' in ia_info['response']['docs'][i].keys() and self.author not in (None, ""):
                creator = ia_info['response']['docs'][i]['creator'][0].encode("utf-8")
                creator_similarity = difflib.SequenceMatcher(None, self.author.lower(), creator.lower()).ratio()  
                match_score += 12*creator_similarity   
                creator_present = 1           
            # Publisher similarity: up to 6 points.
            if 'publisher' in ia_info['response']['docs'][i].keys() and self.publisher not in (None, ""):
                publisher = ia_info['response']['docs'][i]['publisher'][0].encode("utf-8")
                publisher_similarity = difflib.SequenceMatcher(None, self.publisher.lower(), publisher.lower()).ratio()
                match_score += 6*publisher_similarity                               
            # Language: short values are assumed to be codes and expanded
            # via lang_code() before comparing; up to 3 points.
            if 'language' in ia_info['response']['docs'][i].keys() and self.language not in (None, ""):
                l = ia_info['response']['docs'][i]['language'][0].encode("utf-8")
                if len(l) < 5:
                    try:
                        language = lang_code(l)
                    except:
                        language = l
                else:
                    language = l
                lang_similarity = difflib.SequenceMatcher(None, self.language.lower(), language.lower()).ratio()
                match_score += 3*lang_similarity  
            # Threshold depends on which fields were available (see NOTE in
            # the docstring: this value is currently never used).
            if self.magazine == 0:
                threshold_score = (0.7)*80 + (25)*year_present + (1 - year_present)*((0.5)*12*creator_present + (0.7)*6*(1-creator_present))
            else:
                threshold_score = (0.7)*80 + 25             
            match_list.append(ia_info['response']['docs'][i]['identifier'])                     
        if match_list != []:
            ia_response_key = self.book_key + ":ia_response"
            redis_py.set(ia_response_key, 1, True)
            return match_list
        ia_response_key = self.book_key + ":ia_response"
        redis_py.set(ia_response_key, 0, True) 
        return False    
Beispiel #35
0
def get_ia_metadata(ia_identifier):
    """Fetch and return the Internet Archive metadata record for an item."""
    endpoint = 'http://archive.org/metadata/%s' % ia_identifier
    return requests.get(endpoint).json()
Beispiel #36
0
def get_ia_metadata(ia_identifier):
    """Return the parsed JSON metadata for an Internet Archive identifier."""
    response = requests.get('http://archive.org/metadata/' + ia_identifier)
    return response.json()
Beispiel #37
0
    def check_in_IA(self, library, Id):
        """Check if book present in IA.
        Return False if not present else Return Identifier(s)

        Searches archive.org (advanced-search API, first 20 hits at most)
        for this book's title and scores every hit by similarity of title,
        date, creator, publisher and language.  A 0/1 found-flag is cached
        in redis under ``<book_key>:ia_response``.

        NOTE(review): the computed ``match_score``/``threshold_score`` pair
        is never consulted -- each hit's identifier is appended to
        ``match_list`` regardless of its score.  Likely the append should
        be conditional on the score exceeding the threshold; verify with
        the callers.  ``library`` and ``Id`` are unused in this body.
        """

        # Characters that would break the IA query syntax are stripped from
        # the title, which is truncated to 330 chars before URL-encoding.
        url = """http://archive.org/advancedsearch.php?q=title%%3A(%s)+AND+mediatype%%3A(texts)&fl[]=creator&fl[]=source&fl[]=date&fl[]=identifier&fl[]=language&fl[]=publisher&fl[]=title&sort[]=&sort[]=&sort[]=&rows=20&page=1&output=json""" % quote_plus(
            re.sub(r"""[!#\n|^\\\"~()\[\]:\-/]""", '', self.title)[:330])
        r = requests.get(url)
        ia_info = r.json()
        numFound = int(ia_info['response']['numFound'])
        if numFound > 20:
            numFound = 20
        if numFound == 0:
            ia_response_key = self.book_key + ":ia_response"
            redis_py.set(ia_response_key, 0, True)
            return False
        match_list = []
        year_present = 0
        self.magazine = 0
        for i in range(numFound):
            match_score = 0
            creator_present = 0
            # Exact source-id containment is the strongest signal (+20).
            if 'source' in ia_info['response']['docs'][i].keys(
            ) and self.Id not in (None, ""):
                source = ia_info['response']['docs'][i]['source'].encode(
                    "utf-8")
                if self.Id in source:
                    match_score += 20
            # Fuzzy title similarity: up to 50 points.
            if 'title' in ia_info['response']['docs'][i].keys(
            ) and self.title not in (None, ""):
                title = ia_info['response']['docs'][i]['title'].encode("utf-8")
                title_similarity = difflib.SequenceMatcher(
                    None, self.title.lower(), title.lower()).ratio()
                match_score += 50 * title_similarity
            # Date: a year match suffices for books; magazines must also
            # match month and day.
            if 'date' in ia_info['response']['docs'][i].keys():
                if parser.parse(ia_info['response']['docs'][i]
                                ['date']).year == self.year:
                    if self.printType != 'MAGAZINE':
                        match_score += 25
                        year_present = 1
                    else:
                        self.magazine = 1
                        if parser.parse(ia_info['response']['docs'][i]
                                        ['date']).month == self.month:
                            if parser.parse(ia_info['response']['docs'][i]
                                            ['date']).day == self.day:
                                match_score += 25
            # Creator similarity: up to 12 points.
            if 'creator' in ia_info['response']['docs'][i].keys(
            ) and self.author not in (None, ""):
                creator = ia_info['response']['docs'][i]['creator'][0].encode(
                    "utf-8")
                creator_similarity = difflib.SequenceMatcher(
                    None, self.author.lower(), creator.lower()).ratio()
                match_score += 12 * creator_similarity
                creator_present = 1
            # Publisher similarity: up to 6 points.
            if 'publisher' in ia_info['response']['docs'][i].keys(
            ) and self.publisher not in (None, ""):
                publisher = ia_info['response']['docs'][i]['publisher'][
                    0].encode("utf-8")
                publisher_similarity = difflib.SequenceMatcher(
                    None, self.publisher.lower(), publisher.lower()).ratio()
                match_score += 6 * publisher_similarity
            # Language: short values are treated as codes and expanded via
            # lang_code() before comparison; up to 3 points.
            if 'language' in ia_info['response']['docs'][i].keys(
            ) and self.language not in (None, ""):
                l = ia_info['response']['docs'][i]['language'][0].encode(
                    "utf-8")
                if len(l) < 5:
                    try:
                        language = lang_code(l)
                    except:
                        language = l
                else:
                    language = l
                lang_similarity = difflib.SequenceMatcher(
                    None, self.language.lower(), language.lower()).ratio()
                match_score += 3 * lang_similarity
            # Threshold adapts to which fields were present (currently
            # unused -- see the NOTE in the docstring).
            if self.magazine == 0:
                threshold_score = (0.7) * 80 + (25) * year_present + (
                    1 - year_present) * ((0.5) * 12 * creator_present +
                                         (0.7) * 6 * (1 - creator_present))
            else:
                threshold_score = (0.7) * 80 + 25
            match_list.append(ia_info['response']['docs'][i]['identifier'])
        if match_list != []:
            ia_response_key = self.book_key + ":ia_response"
            redis_py.set(ia_response_key, 1, True)
            return match_list
        ia_response_key = self.book_key + ":ia_response"
        redis_py.set(ia_response_key, 0, True)
        return False
Beispiel #38
0
Datei: ht.py Projekt: JaonLin/BUB
def get_id_from_record_key(Id):
    """Extract and return htid associated with the book"""
    record_url = (
        "http://catalog.hathitrust.org/api/volumes/brief/recordnumber/%s.json"
        % Id)
    record = requests.get(record_url).json()
    # The first listed item's HathiTrust volume id identifies the book.
    return record["items"][0]["htid"]