Exemple #1
0
def wait_and_add_to_queue(q_bulk_order):
    """Move bulk orders from the web-facing queue onto a mass-worker queue.

    Loops forever.  Every entry popped from ``q_bulk_order`` is JSON-decoded
    and read positionally as ``[ids, email, language]``.  ``ids`` is split
    into individual tokens; for each token the requested language is stored
    in redis and the normalised book id is appended to the queue returned by
    ``get_shortest_queue()``.  A final ``(email, count)`` JSON marker is
    appended after the ids, and the order is recorded in ``bulk_order_log``.
    """
    log.write("%s  Started wait_and_add_to_queue\n" % datetime.now())
    log.flush()
    while True:
        order = json.loads(q_bulk_order.pop_and_remove())
        raw_ids, email, language = order[0], order[1], order[2]
        # Tokens are runs of characters that are not commas, parentheses
        # or whitespace.
        id_tokens = re.findall(r'[^,(\r\n)\s]+', raw_ids)
        total = len(id_tokens)
        worker_queue = get_shortest_queue()
        key_prefix = keys.redis_key3
        redis = redis_py.Redis()
        for token in id_tokens:
            book_id = gb.get_id_from_string(token)
            language_key = "%s:%s:%s" % (key_prefix, 'gb', book_id) + ":language"
            redis_py.set(language_key, language, True)
            worker_queue.add(book_id)
        # Trailing marker: who ordered, and how many ids were queued.
        worker_queue.add(json.dumps((email, total)))
        bulk_order_log.write("%s  Received %s entries from %s\n" %
                             (datetime.now(), total, email))
        bulk_order_log.flush()
Exemple #2
0
def verify_id(Id_string):
    """Verify the Id and check that a PDF link exists for the book.

    The book page URL is read from redis (key
    ``<redis_key3>:usp:<Id_string>:library_url``), fetched, and searched
    for a PDF link.

    NOTE: the public-domain check that used to live here is disabled, so
    this function never returns the "not public domain" code.

    Returns an integer status code:
        0  -- page fetched and a PDF link was found
        1  -- the Id could not be parsed, or the page returned 404
        8  -- no PDF link could be found on the page
        10 -- the request failed or returned a non-200 status
    """
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id is None:
        return 1
    # NOTE(review): this handle is not used below; it is kept in case
    # redis_py.Redis() sets up state that redis_py.get() relies on --
    # confirm and drop if not.
    redis = redis_py.Redis()
    book_key = "%s:%s:%s" % (keys.redis_key3, 'usp', Id_string)
    url = redis_py.get(book_key + ":library_url", True)
    url = get_absolute_url_of_book(url)

    try:
        r = requests.get(url)
    except Exception:
        # Best effort: any request failure is reported as code 10, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 10
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10

    soup = BeautifulSoup(r.text)
    tld = extract_base_domain(url)
    if tld[-1:] == '/':
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    # Treat every falsy result (False, None, "") as "no link", matching
    # the failure values download_book() checks for.
    if not pdf_url:
        return 8
    return 0
Exemple #3
0
def store_output_file_name(Id, output_file):
    """Record the output file name for book ``Id`` in redis.

    The value is written under ``<redis_key3>:man:<Id>:output_file``.
    """
    redis = redis_py.Redis()
    key_prefix = keys.redis_key3
    key_suffix = ":man:%s" % Id
    redis_py.set(key_prefix + key_suffix + ":output_file", output_file, True)
Exemple #4
0
def verify_id(Id_string):
    """Verify the Id and public-domain status for the book.

    Fetches ``http://gallica.bnf.fr/<Id>`` and inspects the ``DC.rights``
    metadata entries of the page.

    Returns an integer status code:
        0  -- a rights entry reads "domaine public" / "public domain"
        1  -- the Id could not be parsed, or the page returned 404
        2  -- no rights entry marks the book as public domain
        10 -- the request failed or returned a non-200 status
    """
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id is None:
        return 1
    url = "http://gallica.bnf.fr/%s" % Id
    try:
        r = requests.get(url)
    except Exception:
        # Best effort: any request failure is reported as code 10, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 10
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10

    soup = BeautifulSoup(r.text)
    rights = OAI_metadata_content("DC.rights", soup, burst=True)
    # `rights or ()` keeps a missing result from raising; it then falls
    # through to "not public domain".
    for entry in rights or ():
        # Skip empty entries.  The original tested `rights` here, which is
        # always truthy inside the loop, so the guard never fired.
        if not entry:
            continue
        if entry.strip().lower().encode('utf-8') in ('domaine public',
                                                     'public domain'):
            return 0
    return 2
Exemple #5
0
def _drain_book_requests(redis, lock, queue, book_key):
    """Clear the pending requests of ``book_key`` and dequeue the book.

    The request set is read and deleted while holding ``lock``; the lock is
    released even if one of the clean-up calls raises.  Returns the value
    read from ``<book_key>:requests`` (``None`` when there was nothing to
    notify).
    """
    lock.acquire(timeout=60 * 2)
    try:
        users_request = redis_py.smembers(book_key + ":requests", True)
        if users_request is not None:
            redis.delete(book_key + ":requests")
            remove_request_db(users_request, book_key)
            remove_from_db(users_request)
    finally:
        lock.release()
    queue.remove(book_key)
    return users_request


def check_if_upload_ready():
    """Poll the upload queue and notify users once a book is available.

    Runs forever, sleeping two seconds between polls.  For every book key
    popped from the queue the stored ``ia_identifier`` is inspected:

    * if it decodes to a list, the requesters are e-mailed immediately;
    * otherwise the Internet Archive metadata is fetched, and the
      requesters are e-mailed once a DjVu text/XML derivative is present
      or the item is flagged as not OCRable.

    In both cases the pending requests are cleared under the lock, the
    book is removed from the queues and its ``email_progress`` flag is set
    (plus ``OCR_progress`` for the second case).
    """
    redis = redis_py.Redis()
    redis_key2 = keys.redis_key2
    lock_key1 = keys.lock_key1
    q = redis_py.Queue(redis_key2)
    lock = redis_py.Lock(lock_key1)
    while True:
        book_keys = q.pop(-1)
        if book_keys is False:
            time.sleep(2)
            continue
        for book_key in book_keys:
            ia_identifier = json.loads(redis.get(book_key + ":ia_identifier"))
            if isinstance(ia_identifier, list):
                users_request = _drain_book_requests(redis, lock, q, book_key)
                if users_request is not None:
                    send_email(users_request, ia_identifier, book_key=book_key)
                redis_py.set(book_key + ":email_progress", 1, True)
                delete_from_global_queue(book_key)
                continue

            r = get_ia_metadata(ia_identifier)
            uploaded = 0
            if 'metadata' in r and 'ocr' in r['metadata']:
                if r['metadata']['ocr'] == 'language not currently OCRable':
                    uploaded = 2
            dumped = str(r)
            if 'DjVuTXT' in dumped or 'Djvu XML' in dumped:
                uploaded = 1
            if uploaded == 0:
                # Derivatives not ready yet; look at this book again on a
                # later poll.
                continue

            users_request = _drain_book_requests(redis, lock, q, book_key)
            if users_request is not None:
                send_email(users_request, str(ia_identifier))
            redis_py.set(book_key + ":email_progress", 1, True)
            delete_from_global_queue(book_key)
            redis_py.set(book_key + ":OCR_progress", 1, True)
        time.sleep(2)
Exemple #6
0
 def __init__(self, value):
     """Set up identifiers and redis keys for a Google-Books volume.

     ``value`` is the book id; its UTF-8 encoded form is kept as
     ``self.Id``.
     """
     key_prefix = keys.redis_key3
     library = 'gb'
     self.library = library
     self.library_name = 'Google-Books'
     self.Id = value.encode('utf-8')
     self.ia_identifier = "bub_" + library + "_" + value
     # The book key uses the raw value; the output-file key uses the
     # encoded id.
     self.book_key = "%s:%s:%s" % (key_prefix, library, value)
     self.redis = redis_py.Redis()
     self.redis_output_file_key = "%s:%s:%s:output_file" % (
         key_prefix, library, self.Id)
Exemple #7
0
def metadata(Id):
    """Return book information and meta-data.

    The book page URL is read from redis (key
    ``<redis_key3>:usp:<Id>:library_url``), fetched, and its OAI/Dublin
    Core metadata extracted.

    Returns a dict of metadata fields on success, or an integer code:
        1  -- the request failed, returned 404, or the page says
              "Página não encontrada"
        10 -- the page returned any other non-200 status
    """
    # NOTE(review): this handle is not used below; it is kept in case
    # redis_py.Redis() sets up state that redis_py.get() relies on --
    # confirm and drop if not.
    redis = redis_py.Redis()
    book_key = "%s:%s:%s" % (keys.redis_key3, 'usp', Id)
    url = redis_py.get(book_key + ":library_url", True)
    url = get_absolute_url_of_book(url)
    try:
        r = requests.get(url)
    except Exception:
        # Best effort: any request failure is reported as code 1, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10

    source = r.text
    # Some "not found" pages are served with status 200.
    if "Página não encontrada".decode('utf-8') in source:
        return 1
    soup = BeautifulSoup(source)
    thumbnail_url = extract_thumbnail_url(soup, url)
    base_domain = extract_base_domain(url)
    return dict(
        image_url = thumbnail_url,
        thumbnail_url = thumbnail_url,
        printType = "BOOK",
        title = OAI_metadata_content("DC.title", soup),
        subtitle = "",
        author = OAI_metadata_content("DC.creator", soup),
        publisher = OAI_metadata_content("DC.publisher", soup),
        publishedDate = OAI_metadata_content("DCTERMS.issued", soup),
        description = OAI_metadata_content("DC.description", soup),
        infoLink = url,
        publicDomain = True,
        language = normalize_to_ascii(OAI_metadata_content("DC.language", soup)),
        scanner = base_domain,
        sponser = base_domain
    )
Exemple #8
0
def metadata(Id):
    """Return book information and meta-data.

    Fetches ``http://gallica.bnf.fr/<id>`` (the de-sanitised id with one
    trailing slash removed) and extracts its Dublin Core metadata.

    Returns a dict of metadata fields on success, or an integer code:
        1  -- the id could not be parsed, the request failed, or the
              page returned 404
        10 -- the page returned any other non-200 status
    """
    Id_raw = get_id_from_string(Id, action = 'desanitize')
    if Id_raw is None:
        # verify_id() treats None as a possible result of this call;
        # report it as "not found" instead of crashing on Id_raw[-1].
        return 1
    # endswith() is also safe for an empty id, unlike indexing [-1].
    if Id_raw.endswith('/'):
        Id_raw = Id_raw[:-1]
    url = "http://gallica.bnf.fr/%s" % Id_raw
    try:
        r = requests.get(url)
    except Exception:
        # Best effort: any request failure is reported as code 1, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10

    source = r.text
    soup = BeautifulSoup(source)  # parse once (the original parsed twice)
    thumbnail_url = 'http://gallica.bnf.fr/%s.thumbnail' % Id_raw
    return dict(
        image_url = thumbnail_url,
        thumbnail_url = thumbnail_url,
        printType = "BOOK",
        title = OAI_metadata_content("DC.title", soup),
        subtitle = "",
        author = OAI_metadata_content("DC.creator", soup),
        publisher = OAI_metadata_content("DC.publisher", soup),
        publishedDate = OAI_metadata_content("DC.date", soup),
        description = OAI_metadata_content("DC.description", soup),
        infoLink = url,
        publicDomain = True,
        language = normalize_to_ascii(get_lang(source)),
        scanner = "Gallica",
        sponser = "Gallica"
    )
Exemple #9
0
def download_book(Id, id_for_key):
    """Download the PDF of book ``Id`` into the scratch directory.

    The book page URL is read from redis (key
    ``<redis_key3>:usp:<Id>:library_url``); the page is fetched and
    searched for a PDF link, and the PDF is streamed to
    ``/data/scratch/BUB_downloads/bub_usp_<Id>.pdf``.  The output path is
    recorded via ``store_output_file_name(id_for_key, ...)`` before the
    file is written.

    Returns 0 on success, or 1 when no PDF link is found or the PDF
    request does not return status 200.  Network errors raised by
    ``requests`` propagate to the caller.
    """
    # NOTE(review): this handle is not used below; it is kept in case
    # redis_py.Redis() sets up state that redis_py.get() relies on --
    # confirm and drop if not.
    redis = redis_py.Redis()
    book_key = "%s:%s:%s" % (keys.redis_key3, 'usp', Id)
    url = redis_py.get(book_key + ":library_url", True)
    url = get_absolute_url_of_book(url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    tld = extract_base_domain(url)
    if tld[-1:] == '/':
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    # Treat every falsy result ("", None, False) as "no link" so this
    # agrees with the check verify_id() makes on the same helper.
    if not pdf_url:
        return 1

    pdf = requests.get(pdf_url, stream=True)
    try:
        # Do not save an error page as if it were the PDF.
        if pdf.status_code != 200:
            return 1
        output_file = "/data/scratch/BUB_downloads/bub_usp_%s.pdf" % Id
        store_output_file_name(id_for_key, output_file)
        with open(output_file, 'wb') as f:
            for chunk in pdf.iter_content(1024):
                if chunk:  # skip empty chunks
                    f.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        pdf.close()
    return 0
Exemple #10
0
 def __init__(self, value):
     """Identify the book described by ``value`` and load its cached metadata.

     ``value`` is handled in one of two ways:

     * a number -- treated as the ``sno`` of a row in the ``request``
       table, from which ``library`` and ``book_id`` are read;
     * anything else -- indexed as a mapping with the keys ``'library'``,
       ``'Id'`` and ``'ia_identifier_suffix'``.

     In both cases the id of the job is written to the redis key
     ``<redis_key3>:ongoing_job_identifier``.  The book's metadata is then
     read as JSON from ``<book_key>:meta_data`` and copied onto the
     instance (title, author, publisher, description, dates, language,
     scanner, sponser, ...).
     """
     redis_key3 = keys.redis_key3
     self.redis_key3 = redis_key3
     self.redis = redis_py.Redis()
     if isinstance(value, (int, long, float, complex)):
         # Numeric value: look the request up in the database.
         db = mysql_py.Db()
         values = db.execute(
             'select library, book_id from request where sno = %s;',
             value)[0]
         db.close()
         self.library = values[0]
         self.Id = values[1].encode('utf-8')
         self.book_key = "%s:%s:%s" % (redis_key3, self.library, self.Id)
         self.redis.set(redis_key3 + ":ongoing_job_identifier", self.Id)
         # No IA identifier is derived in this branch.
         self.ia_identifier = None
         self.id_for_key = self.Id
     else:
         # Mapping value: keys are built from 'ia_identifier_suffix'
         # rather than from 'Id'.
         self.library = value['library']
         self.Id = value['Id']
         self.ia_identifier = "bub_" + self.library + "_" + value[
             'ia_identifier_suffix']
         self.book_key = "%s:%s:%s" % (redis_key3, self.library,
                                       value['ia_identifier_suffix'])
         self.redis.set(redis_key3 + ":ongoing_job_identifier",
                        value['ia_identifier_suffix'])
         self.id_for_key = value['ia_identifier_suffix']
     # Ids containing '/' are replaced by their MD5 hex digest in the
     # output-file key.
     if '/' not in self.id_for_key:
         self.redis_output_file_key = "%s:%s:%s:output_file" % (
             redis_key3, self.library, self.id_for_key)
     else:
         self.redis_output_file_key = "%s:%s:%s:output_file" % (
             redis_key3, self.library, hashlib.md5(
                 self.id_for_key).hexdigest())
     self.library_name = bridge.lib_module(self.library)[1]
     # Load the cached metadata (a JSON document) for this book.
     metadata_key = self.book_key + ":meta_data"
     metadata = redis_py.get(metadata_key, True)
     info = json.loads(metadata)
     # Title is "<title> <subtitle>"; the fallback does not encode the
     # subtitle.
     try:
         self.title = info['title'].encode(
             "utf-8") + " " + info['subtitle'].encode("utf-8")
     except:
         self.title = str(info['title'].encode("utf-8")) + " " + str(
             info['subtitle'])
     self.author = info['author'].encode("utf-8")
     self.publisher = info['publisher'].encode("utf-8")
     # Newlines are stripped from the description.
     self.description = info['description'].replace("\n",
                                                    "").encode("utf-8")
     self.printType = info['printType'].encode("utf-8")
     # Keep only digits and the separators '/', '.' and '-' in the date.
     self.publishedDate = re.sub("[^0123456789/.-]", "",
                                 info['publishedDate'].encode("utf-8"))
     self.infoLink = info['infoLink']
     self.publicDomain = info['publicDomain']
     language_code = info['language'].encode("utf-8")
     if self.publishedDate not in (None, ""):
         try:
             # NOTE(review): 'x' and '?' were already removed by the
             # substitution above, so this replacement looks like a no-op.
             self.publishedDate = re.sub('[x?]', '0', self.publishedDate)
             # `parser` is presumably dateutil.parser -- confirm against
             # the file's imports.
             self.year = parser.parse(self.publishedDate).year
             self.month = parser.parse(self.publishedDate).month
             self.day = parser.parse(self.publishedDate).day
         except:
             # Unparseable date: leave all three parts empty.
             self.year = ""
             self.month = ""
             self.day = ""
     else:
         self.year = ""
         self.month = ""
         self.day = ""
     # Any failure in lang_code() leaves the language empty.
     try:
         self.language = lang_code(language_code)
     except:
         self.language = ""
     self.pdf_path = "/data/scratch/BUB_downloads/bub_%s_%s.pdf" % (
         self.library, self.Id)
     self.scanner = info['scanner']
     self.sponser = info['sponser']