def wait_and_add_to_queue(q_bulk_order):
    """Parse Id's from bulk-order queue(accepts requests from web) entry and
    add to mass-worker queue.

    Runs forever.  Each entry popped from *q_bulk_order* is a JSON triple
    ``(ids, email, language)`` where *ids* is a comma/whitespace separated
    list of Google-Books ids.  Every id is pushed to the shortest
    mass-worker queue, followed by one ``(email, count)`` sentinel so the
    worker knows when the batch is complete and whom to notify.
    """
    log.write("%s Started wait_and_add_to_queue\n" % datetime.now())
    log.flush()
    # Loop-invariant values hoisted out of the service loop.
    # (The old code also created an unused redis_py.Redis() per iteration.)
    redis_key3 = keys.redis_key3
    library_id = 'gb'  # bulk orders are Google-Books only
    while True:
        info = json.loads(q_bulk_order.pop_and_remove())
        ids = info[0]
        email = info[1]
        language = info[2]
        # Split on commas, CR/LF and any whitespace.
        ids = re.findall(r'[^,(\r\n)\s]+', ids)
        no = len(ids)
        q_mass_worker = get_shortest_queue()
        for book_id in ids:
            book_id = gb.get_id_from_string(book_id)
            book_key = "%s:%s:%s" % (redis_key3, library_id, book_id)
            book_language_key = book_key + ":language"
            redis_py.set(book_language_key, language, True)
            q_mass_worker.add(book_id)
        # Sentinel: tells the worker the batch size and notification address.
        q_mass_worker.add(json.dumps((email, no)))
        bulk_order_log.write("%s Received %s entries from %s\n" % (datetime.now(), no, email))
        bulk_order_log.flush()
def verify_id(Id_string):
    """Verify the Id and public-domain status for the book.

    Status codes: 0 = OK, 1 = unknown/invalid id, 8 = no PDF link found,
    10 = network or server error.
    """
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id == None:
        return 1
    redis = redis_py.Redis()
    book_key = "%s:%s:%s" % (keys.redis_key3, 'usp', Id_string)
    stored_url = redis_py.get(book_key + ":library_url", True)
    url = get_absolute_url_of_book(stored_url)
    try:
        response = requests.get(url)
    except:
        return 10
    if response.status_code == 404:
        return 1
    if response.status_code != 200:
        return 10
    page = BeautifulSoup(response.text)
    base_domain = extract_base_domain(url)
    if base_domain[-1:] == '/':
        base_domain = base_domain[:-1]
    pdf_link = get_pdf_link(base_domain, page)
    if pdf_link == False:
        return 8
    return 0
def store_output_file_name(Id, output_file):
    """Save output file name to redis-memory under the manual-upload key."""
    redis = redis_py.Redis()
    output_file_key = "%s:man:%s:output_file" % (keys.redis_key3, Id)
    redis_py.set(output_file_key, output_file, True)
def verify_id(Id_string):
    """Verify the Id and public-domain status for a Gallica book.

    Status codes: 0 = public domain, 1 = unknown/invalid id,
    2 = rights do not permit reuse, 10 = network or server error.
    """
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id == None:
        return 1
    url = "http://gallica.bnf.fr/%s" % (Id)
    try:
        r = requests.get(url)
    except:
        return 10
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    soup = BeautifulSoup(r.text)
    rights = OAI_metadata_content("DC.rights", soup, burst=True)
    for i in rights:
        # BUG FIX: the old code tested `if not rights:` (the whole list)
        # inside this loop, which can never be true once the loop runs;
        # the intent was to skip blank entries.
        if not i:
            continue
        if i.strip().lower().encode('utf-8') in ('domaine public', 'public domain'):
            return 0
    return 2
def check_if_upload_ready():
    """Poll the upload queue and notify requesters once a book is done.

    Runs forever.  Each book_key on the queue carries an ia_identifier in
    redis; a JSON *list* value marks a special/already-finished case that is
    finalised immediately, otherwise archive.org metadata is polled until
    OCR has finished (or is declared impossible) before emailing requesters.
    """
    redis = redis_py.Redis()
    redis_key2 = keys.redis_key2
    lock_key1 = keys.lock_key1
    q = redis_py.Queue(redis_key2)
    Lock = redis_py.Lock(lock_key1)
    while True:
        # pop(-1) presumably returns all pending keys, or False when empty
        # -- TODO confirm against redis_py.Queue.
        book_keys = q.pop(-1)
        if book_keys is False:
            time.sleep(2)
            continue
        for book_key in book_keys:
            uploaded = 0
            ia_identifier = redis.get(book_key + ":ia_identifier")
            ia_identifier = json.loads(ia_identifier)
            if isinstance(ia_identifier, list):
                # List value: finalise right away -- consume the pending
                # requests under the lock so two workers never double-send.
                Lock.acquire(timeout=60 * 2)
                users_request = redis_py.smembers(book_key + ":requests", True)
                if users_request != None:
                    redis.delete(book_key + ":requests")
                    remove_request_db(users_request, book_key)
                    remove_from_db(users_request)
                Lock.release()
                q.remove(book_key)
                if users_request != None:
                    send_email(users_request, ia_identifier, book_key=book_key)
                email_progress_key = book_key + ":email_progress"
                redis_py.set(email_progress_key, 1, True)
                delete_from_global_queue(book_key)
                continue
            else:
                r = get_ia_metadata(ia_identifier)
                # uploaded: 0 = not ready, 1 = OCR text present,
                # 2 = archive.org says the language cannot be OCRed.
                if 'metadata' in r.keys():
                    if 'ocr' in r['metadata'].keys():
                        if r['metadata'][
                                'ocr'] == 'language not currently OCRable':
                            uploaded = 2
                # Crude readiness probe: look for OCR artifacts anywhere in
                # the stringified metadata.
                if 'DjVuTXT' in str(r) or 'Djvu XML' in str(r):
                    uploaded = 1
                if uploaded != 0:
                    # Same consume-under-lock dance as the list branch.
                    Lock.acquire(timeout=60 * 2)
                    users_request = redis_py.smembers(book_key + ":requests", True)
                    if users_request != None:
                        redis.delete(book_key + ":requests")
                        remove_request_db(users_request, book_key)
                        remove_from_db(users_request)
                    Lock.release()
                    q.remove(book_key)
                    if users_request != None:
                        send_email(users_request, str(ia_identifier))
                    email_progress_key = book_key + ":email_progress"
                    redis_py.set(email_progress_key, 1, True)
                    delete_from_global_queue(book_key)
                    OCR_progress_key = book_key + ":OCR_progress"
                    redis_py.set(OCR_progress_key, 1, True)
                else:
                    # Not ready yet; leave book_key on the queue for the
                    # next polling round.
                    continue
        time.sleep(2)
def __init__(self, value):
    """Initialise a Google-Books job for volume id *value*."""
    key_prefix = keys.redis_key3
    self.redis = redis_py.Redis()
    self.library = 'gb'
    self.library_name = 'Google-Books'
    self.Id = value.encode('utf-8')
    self.ia_identifier = "bub_" + self.library + "_" + value
    # Redis keys for this book's state and its generated output file.
    self.book_key = "%s:%s:%s" % (key_prefix, self.library, value)
    self.redis_output_file_key = "%s:%s:%s:output_file" % (
        key_prefix, self.library, self.Id)
def metadata(Id):
    """Return book information and meta-data.

    Returns a dict on success, 1 for an unknown book, 10 on server error.
    """
    redis = redis_py.Redis()
    book_key = "%s:%s:%s" % (keys.redis_key3, 'usp', Id)
    stored_url = redis_py.get(book_key + ":library_url", True)
    url = get_absolute_url_of_book(stored_url)
    try:
        response = requests.get(url)
    except:
        return 1
    if response.status_code == 404:
        return 1
    if response.status_code != 200:
        return 10
    html = response.text
    # The portal can answer 200 with a "page not found" body for bad ids.
    if "Página não encontrada".decode('utf-8') in html:
        return 1
    page = BeautifulSoup(html)
    thumbnail_url = extract_thumbnail_url(page, url)
    return dict(
        image_url=thumbnail_url,
        thumbnail_url=thumbnail_url,
        printType="BOOK",
        title=OAI_metadata_content("DC.title", page),
        subtitle="",
        author=OAI_metadata_content("DC.creator", page),
        publisher=OAI_metadata_content("DC.publisher", page),
        publishedDate=OAI_metadata_content("DCTERMS.issued", page),
        description=OAI_metadata_content("DC.description", page),
        infoLink=url,
        publicDomain=True,
        language=normalize_to_ascii(OAI_metadata_content("DC.language", page)),
        scanner=extract_base_domain(url),
        sponser=extract_base_domain(url)
    )
def metadata(Id):
    """Return book information and meta-data for a Gallica record.

    Returns a dict on success, 1 for an unknown book, 10 on server error.
    (Removed dead locals: an unused Redis handle and a book_key that was
    mislabeled 'usp' in this Gallica routine; also parse the page once --
    the old code built BeautifulSoup twice over the same source.)
    """
    Id_raw = get_id_from_string(Id, action='desanitize')
    Id_raw = Id_raw[:-1] if Id_raw[-1] == '/' else Id_raw
    url = "http://gallica.bnf.fr/%s" % (Id_raw)
    try:
        r = requests.get(url)
    except:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    source = r.text
    soup = BeautifulSoup(source)
    thumbnail_url = 'http://gallica.bnf.fr/%s.thumbnail' % Id_raw
    return dict(
        image_url=thumbnail_url,
        thumbnail_url=thumbnail_url,
        printType="BOOK",
        title=OAI_metadata_content("DC.title", soup),
        subtitle="",
        author=OAI_metadata_content("DC.creator", soup),
        publisher=OAI_metadata_content("DC.publisher", soup),
        publishedDate=OAI_metadata_content("DC.date", soup),
        description=OAI_metadata_content("DC.description", soup),
        infoLink=url,
        publicDomain=True,
        language=normalize_to_ascii(get_lang(source)),
        scanner="Gallica",
        sponser="Gallica"
    )
def download_book(Id, id_for_key):
    """Download the PDF for USP book *Id* into the scratch directory.

    Returns 0 on success, 1 when no PDF link can be found.
    """
    book_key = "%s:%s:%s" % (keys.redis_key3, 'usp', Id)
    library_url_key = book_key + ":library_url"
    url = redis_py.get(library_url_key, True)
    url = get_absolute_url_of_book(url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    tld = extract_base_domain(url)
    if tld[-1:] == '/':
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    # BUG FIX: get_pdf_link returns False when no link is found (see
    # verify_id), but the old test `pdf_url in ("", None)` let False
    # through and then crashed in requests.get(False).  Treat every falsy
    # value as "no PDF available".
    if not pdf_url:
        return 1
    pdf = requests.get(pdf_url, stream=True)
    output_file = "/data/scratch/BUB_downloads/bub_usp_%s.pdf" % Id
    store_output_file_name(id_for_key, output_file)
    # Stream to disk in chunks so large PDFs never sit fully in memory.
    with open(output_file, 'wb') as f:
        for chunk in pdf.iter_content(1024):
            f.write(chunk)
    return 0
def __init__(self, value):
    """Assign variable, and get metadata from cache.

    *value* is either a numeric request serial number (looked up in the
    MySQL `request` table) or a dict with 'library', 'Id' and
    'ia_identifier_suffix' keys.  Cached metadata for the book is then
    loaded from redis and unpacked onto the instance.
    """
    redis_key3 = keys.redis_key3
    self.redis_key3 = redis_key3
    self.redis = redis_py.Redis()
    if isinstance(value, (int, long, float, complex)):
        # Numeric: resolve library/book id from the pending-request table.
        db = mysql_py.Db()
        values = db.execute(
            'select library, book_id from request where sno = %s;', value)[0]
        db.close()
        self.library = values[0]
        self.Id = values[1].encode('utf-8')
        self.book_key = "%s:%s:%s" % (redis_key3, self.library, self.Id)
        self.redis.set(redis_key3 + ":ongoing_job_identifier", self.Id)
        self.ia_identifier = None
        self.id_for_key = self.Id
    else:
        # Dict: caller supplies everything, including the archive.org
        # identifier suffix used for redis keys.
        self.library = value['library']
        self.Id = value['Id']
        self.ia_identifier = "bub_" + self.library + "_" + value[
            'ia_identifier_suffix']
        self.book_key = "%s:%s:%s" % (redis_key3, self.library,
                                      value['ia_identifier_suffix'])
        self.redis.set(redis_key3 + ":ongoing_job_identifier",
                       value['ia_identifier_suffix'])
        self.id_for_key = value['ia_identifier_suffix']
    # Ids containing '/' cannot be embedded in a redis key directly,
    # so hash them.
    if '/' not in self.id_for_key:
        self.redis_output_file_key = "%s:%s:%s:output_file" % (
            redis_key3, self.library, self.id_for_key)
    else:
        self.redis_output_file_key = "%s:%s:%s:output_file" % (
            redis_key3, self.library, hashlib.md5(
                self.id_for_key).hexdigest())
    self.library_name = bridge.lib_module(self.library)[1]
    # Unpack the cached JSON metadata blob onto the instance.
    metadata_key = self.book_key + ":meta_data"
    metadata = redis_py.get(metadata_key, True)
    info = json.loads(metadata)
    try:
        self.title = info['title'].encode(
            "utf-8") + " " + info['subtitle'].encode("utf-8")
    except:
        # Fallback when subtitle is not an encodable string.
        self.title = str(info['title'].encode("utf-8")) + " " + str(
            info['subtitle'])
    self.author = info['author'].encode("utf-8")
    self.publisher = info['publisher'].encode("utf-8")
    self.description = info['description'].replace("\n", "").encode("utf-8")
    self.printType = info['printType'].encode("utf-8")
    # Keep only characters a date parser can use.
    self.publishedDate = re.sub("[^0123456789/.-]", "",
                                info['publishedDate'].encode("utf-8"))
    self.infoLink = info['infoLink']
    self.publicDomain = info['publicDomain']
    language_code = info['language'].encode("utf-8")
    if self.publishedDate not in (None, ""):
        try:
            # 'x' / '?' are placeholder digits in some catalogue dates.
            self.publishedDate = re.sub('[x?]', '0', self.publishedDate)
            self.year = parser.parse(self.publishedDate).year
            self.month = parser.parse(self.publishedDate).month
            self.day = parser.parse(self.publishedDate).day
        except:
            self.year = ""
            self.month = ""
            self.day = ""
    else:
        self.year = ""
        self.month = ""
        self.day = ""
    try:
        self.language = lang_code(language_code)
    except:
        self.language = ""
    self.pdf_path = "/data/scratch/BUB_downloads/bub_%s_%s.pdf" % (
        self.library, self.Id)
    self.scanner = info['scanner']
    self.sponser = info['sponser']