def store_output_file_name(Id, output_file):
    """Save output file name to redis-memory."""
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = redis_key3 + ":mdc:%s" % Id
    output_file_key = book_key + ":output_file"
    redis_py.set(output_file_key, output_file, True)

def wait_and_add_to_queue(q_bulk_order):
    """Parse Ids from a bulk-order queue entry (requests arriving from the
    web) and add them to the mass-worker queue."""
    log.write("%s Started wait_and_add_to_queue\n" % datetime.now())
    log.flush()
    while True:
        info = json.loads(q_bulk_order.pop_and_remove())
        ids = info[0]
        email = info[1]
        language = info[2]
        ids = re.findall(r'[^,(\r\n)\s]+', ids)
        no = len(ids)
        q_mass_worker = get_shortest_queue()
        library_id = 'gb'
        redis_key3 = keys.redis_key3
        redis = redis_py.Redis()
        for book_id in ids:
            book_id = gb.get_id_from_string(book_id)
            book_key = "%s:%s:%s" % (redis_key3, library_id, book_id)
            book_language_key = book_key + ":language"
            redis_py.set(book_language_key, language, True)
            q_mass_worker.add(book_id)
        # A trailing (email, count) entry tells the mass worker whom to
        # notify once the batch is done.
        q_mass_worker.add(json.dumps((email, no)))
        bulk_order_log.write("%s Received %s entries from %s\n" %
                             (datetime.now(), no, email))
        bulk_order_log.flush()

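# Illustrative sketch (not part of the worker): the shape of a bulk-order
# entry, as implied by the unpacking in wait_and_add_to_queue — a
# JSON-encoded (ids, email, language) triple — and the regex the worker
# uses to split the free-form id string. Sample values are hypothetical.
def _example_parse_bulk_entry():
    sample_entry = json.dumps(("abc123, def456\nghi789",
                               "user@example.org", "en"))
    ids, email, language = json.loads(sample_entry)
    # Ids may be separated by commas, newlines, or whitespace.
    return re.findall(r'[^,(\r\n)\s]+', ids)  # ['abc123', 'def456', 'ghi789']
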
def store_output_file_name(Id, output_file):
    """Save output file name to redis-memory (variant of the helper above,
    keyed under the "man" library tag instead of "mdc")."""
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = redis_key3 + ":man:%s" % Id
    output_file_key = book_key + ":output_file"
    redis_py.set(output_file_key, output_file, True)

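# Illustrative sketch: the redis key layout shared by these helpers is
# <redis_key3>:<library-tag>:<Id>:<field>. The "bub" prefix and the Id
# below are hypothetical; the real prefix comes from keys.redis_key3.
def _example_key_layout():
    redis_key3 = "bub"
    book_key = redis_key3 + ":man:%s" % "On6cRnm1cbsC"
    return book_key + ":output_file"  # "bub:man:On6cRnm1cbsC:output_file"
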
def set_metadata(self):
    """Get metadata and save it to memory.
    Return 0 on success, or the error status of the library module."""
    metadata_key = self.book_key + ":metadata"
    metadata = gb.metadata(self.Id)
    if isinstance(metadata, (int, long, float, complex)):
        error_status = metadata
        return error_status
    info = metadata
    metadata = json.dumps(metadata)
    redis_py.set(metadata_key, metadata, True)
    try:
        self.title = info['title'].encode("utf-8") + " " + \
            info['subtitle'].encode("utf-8")
    except Exception:
        self.title = str(info['title']) + " " + str(info['subtitle'])
    self.author = info['author'].encode("utf-8")
    self.publisher = info['publisher'].encode("utf-8")
    self.description = info['description'].replace("\n", "").encode("utf-8")
    self.printType = info['printType'].encode("utf-8")
    # Normalise placeholder digits ("19xx", "19??") to zeros before
    # stripping, since the strip below would otherwise remove them; then
    # keep only digits and /.- separators.
    self.publishedDate = re.sub('[x?]', '0',
                                info['publishedDate'].encode("utf-8"))
    self.publishedDate = re.sub("[^0123456789/.-]", "", self.publishedDate)
    self.infoLink = info['infoLink']
    self.publicDomain = info['publicDomain']
    language_code = info['language'].encode("utf-8")
    if self.publishedDate not in (None, ""):
        self.year = parser.parse(self.publishedDate).year
        self.month = parser.parse(self.publishedDate).month
        self.day = parser.parse(self.publishedDate).day
    else:
        self.year = ""
        self.month = ""
        self.day = ""
    try:
        self.language = lang_code(language_code)
    except Exception:
        self.language = ""
    self.pdf_path = "/data/scratch/BUB_downloads/bub_%s_%s.pdf" % (
        self.library, self.Id)
    self.scanner = info['scanner']
    self.sponser = info['sponser']
    return 0

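# Illustrative sketch of the publishedDate normalisation in set_metadata:
# placeholder digits become zeros, everything except digits and /.- is
# stripped, then dateutil's parser extracts the parts. The sample dates
# are hypothetical.
def _example_normalise_date():
    for raw in ("(c) 1923-05-04", "19xx"):
        cleaned = re.sub('[x?]', '0', raw)
        cleaned = re.sub("[^0123456789/.-]", "", cleaned)
        print(parser.parse(cleaned).year)  # 1923, then 1900
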
def manager(q):
    while True:
        value = q.pop_and_remove()
        ia_w = IaWorker(value)
        if isinstance(value, (int, long, float, complex)):
            ia_identifier_found = ia_w.check_in_IA(ia_w.library, ia_w.Id)
            if ia_identifier_found is not False:
                ia_w.submit_OCR_wait_job(ia_identifier_found)
                continue
            else:
                ia_response_key = ia_w.book_key + ":ia_response"
                redis_py.set(ia_response_key, 3, True)
        if not os.path.isfile(ia_w.pdf_path):
            download_status = bridge.download_book(ia_w.library, ia_w.Id,
                                                   ia_w.id_for_key)
            if download_status != 0:
                log.write("%s Download Error, library:%s, ID:%s\n" %
                          (datetime.now(), ia_w.library, ia_w.Id))
                log.flush()
                continue
        download_progress_key = ia_w.book_key + ":download_progress"
        redis_py.set(download_progress_key, 1, True)
        upload_status = ia_w.upload_to_IA(ia_w.library, ia_w.Id)
        if str(upload_status) == "[<Response [200]>]":
            upload_progress_key = ia_w.book_key + ":upload_progress"
            redis_py.set(upload_progress_key, 1, True)
            ia_w.submit_OCR_wait_job(ia_w.ia_identifier)
        ia_w.redis.delete(ia_w.redis_key3 + ":ongoing_job_identifier")

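# Hedged alternative to the str() comparison above: upload_to_IA appears to
# return a list of requests.Response objects, so the outcome can be checked
# directly. Illustrative only; the worker keeps the string form.
def _upload_succeeded(responses):
    return bool(responses) and all(r.status_code == 200 for r in responses)
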
def check_if_upload_ready():
    redis = redis_py.Redis()
    redis_key2 = keys.redis_key2
    lock_key1 = keys.lock_key1
    q = redis_py.Queue(redis_key2)
    Lock = redis_py.Lock(lock_key1)
    while True:
        book_keys = q.pop(-1)
        if book_keys is False:
            time.sleep(2)
            continue
        for book_key in book_keys:
            uploaded = 0
            ia_identifier = redis.get(book_key + ":ia_identifier")
            ia_identifier = json.loads(ia_identifier)
            if isinstance(ia_identifier, list):
                Lock.acquire(timeout=60 * 2)
                users_request = redis_py.smembers(book_key + ":requests", True)
                if users_request is not None:
                    redis.delete(book_key + ":requests")
                    remove_request_db(users_request, book_key)
                    remove_from_db(users_request)
                Lock.release()
                q.remove(book_key)
                if users_request is not None:
                    send_email(users_request, ia_identifier, book_key=book_key)
                    email_progress_key = book_key + ":email_progress"
                    redis_py.set(email_progress_key, 1, True)
                delete_from_global_queue(book_key)
                continue
            else:
                r = get_ia_metadata(ia_identifier)
                if 'metadata' in r.keys():
                    if 'ocr' in r['metadata'].keys():
                        if r['metadata']['ocr'] == 'language not currently OCRable':
                            uploaded = 2
                if 'DjVuTXT' in str(r) or 'Djvu XML' in str(r):
                    uploaded = 1
                if uploaded != 0:
                    Lock.acquire(timeout=60 * 2)
                    users_request = redis_py.smembers(book_key + ":requests",
                                                      True)
                    if users_request is not None:
                        redis.delete(book_key + ":requests")
                        remove_request_db(users_request, book_key)
                        remove_from_db(users_request)
                    Lock.release()
                    q.remove(book_key)
                    if users_request is not None:
                        send_email(users_request, str(ia_identifier))
                        email_progress_key = book_key + ":email_progress"
                        redis_py.set(email_progress_key, 1, True)
                    delete_from_global_queue(book_key)
                    OCR_progress_key = book_key + ":OCR_progress"
                    redis_py.set(OCR_progress_key, 1, True)
                else:
                    continue
        time.sleep(2)

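# Illustrative sketch of the readiness test above, hitting the public IA
# metadata endpoint directly. get_ia_metadata is assumed to wrap a call
# like this; the identifier is hypothetical.
def _example_ocr_state(identifier="someidentifier"):
    import requests  # already a dependency of this project
    r = requests.get("https://archive.org/metadata/%s" % identifier).json()
    blocked = r.get('metadata', {}).get('ocr') == 'language not currently OCRable'
    done = 'DjVuTXT' in str(r) or 'Djvu XML' in str(r)
    return blocked, done
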
def manager(q_mass_worker):
    db = mysql_py.Db()
    while True:
        book_id = False
        while book_id == False:
            book_id = q_mass_worker.pop_and_remove(wait_type='nonblocking')
            if book_id == False:
                book_id = get_id_from_another_worker(mass_worker_key)
                if book_id == False:
                    time.sleep(1)
        try:
            book_id = json.loads(book_id)
        except ValueError:
            pass
        if isinstance(book_id, list):
            # A list entry is the (email, count) marker queued after a batch.
            email = book_id[0]
            no_of_uploads = book_id[1]
            if email not in (None, ""):
                send_email(email, no_of_uploads)
            continue
        ia_w = IaWorker(book_id)
        stored_identifier = ia_w.stored_copy_check()
        if stored_identifier is not None:
            continue
        db = ping_db(db)
        md5_book = hashlib.md5(ia_w.Id + ia_w.library).hexdigest()
        redundancy_book = db.execute(
            "select count(*) from request where md5_book=%s and confirmed=1"
            " and job_submitted=1;", md5_book)
        if redundancy_book[0][0] != 0:
            continue
        metadata_status = ia_w.set_metadata()
        if isinstance(metadata_status, (int, long, float, complex)):
            if metadata_status == 7:
                log.write('%s %s API limit exceeded. Sleeping with book_id:%s\n'
                          % (datetime.now(), __worker_name, book_id))
                log.flush()
                time.sleep(seconds_until_google_quota_refresh())
                q_mass_worker.add(book_id)
                continue
            elif metadata_status == 3:
                log.write("%s %s Book download restricted\n" %
                          (datetime.now(), book_id))
                log.flush()
                continue
            elif metadata_status == 2:
                log.write("%s %s Not Public Domain\n" %
                          (datetime.now(), book_id))
                log.flush()
                continue
            elif metadata_status == 0:
                pass
            else:
                log.write("%s Metadata Error, library:%s, ID:%s, status:%s\n" %
                          (datetime.now(), 'gb', book_id, metadata_status))
                log.flush()
                continue
        # ia_identifier_found = ia_w.check_in_IA(ia_w.library, ia_w.Id)
        # if ia_identifier_found is not False:
        #     ia_w.save_ia_identifier(ia_identifier_found)
        #     continue
        if not os.path.isfile(ia_w.pdf_path):
            download_status = gb.download_book(ia_w.Id)
            if download_status != 0:
                log.write("%s Download Error, library:%s, ID:%s\n" %
                          (datetime.now(), 'gb', book_id))
                log.flush()
                continue
        download_progress_key = ia_w.book_key + ":download_progress"
        redis_py.set(download_progress_key, 1, True)
        try:
            upload_status = ia_w.upload_to_IA(ia_w.library, ia_w.Id)
            if str(upload_status) == "[<Response [200]>]":
                upload_progress_key = ia_w.book_key + ":upload_progress"
                redis_py.set(upload_progress_key, 1, True)
                ia_w.save_ia_identifier(ia_w.ia_identifier)
        except Exception:
            filename = ia_w.filename
            command = "rm %s" % filename
            try:
                subprocess.check_call(command, shell=True)
            except Exception:
                log.write("%s Command rm %s failed\n" %
                          (datetime.now(), filename))
                log.flush()

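# Illustrative sketch of the redundancy key computed in manager(): the md5
# of the book Id concatenated with the library tag, matched against
# request.md5_book in MySQL. The Id below is hypothetical.
def _example_md5_book():
    return hashlib.md5("On6cRnm1cbsC" + "gb").hexdigest()
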
def save_ia_identifier(self, value):
    """Save the IA identifier for caching purposes."""
    redis_key3 = keys.redis_key3
    key_ia_identifier = self.book_key + ":ia_identifier"
    value = json.dumps(value)
    redis_py.set(key_ia_identifier, value, True)

def check_in_IA(self, library, Id):
    """Check whether the book is already present in IA.
    Return False if not present, else return the matching identifier(s)."""
    url = ("http://archive.org/advancedsearch.php?q=title%%3A(%s)"
           "+AND+mediatype%%3A(texts)&fl[]=creator&fl[]=source&fl[]=date"
           "&fl[]=identifier&fl[]=language&fl[]=publisher&fl[]=title"
           "&sort[]=&sort[]=&sort[]=&rows=20&page=1&output=json"
           ) % quote_plus(re.sub(r"""[!#\n|^\\\"~()\[\]:\-/]""", '',
                                 self.title)[:330])
    r = requests.get(url)
    ia_info = r.json()
    numFound = int(ia_info['response']['numFound'])
    if numFound > 20:
        numFound = 20
    if numFound == 0:
        ia_response_key = self.book_key + ":ia_response"
        redis_py.set(ia_response_key, 0, True)
        return False
    match_list = []
    year_present = 0
    self.magazine = 0
    for i in range(numFound):
        match_score = 0
        creator_present = 0
        doc = ia_info['response']['docs'][i]
        if 'source' in doc.keys() and self.Id not in (None, ""):
            source = doc['source'].encode("utf-8")
            if self.Id in source:
                match_score += 20
        if 'title' in doc.keys() and self.title not in (None, ""):
            title = doc['title'].encode("utf-8")
            title_similarity = difflib.SequenceMatcher(
                None, self.title.lower(), title.lower()).ratio()
            match_score += 50 * title_similarity
        if 'date' in doc.keys():
            if parser.parse(doc['date']).year == self.year:
                if self.printType != 'MAGAZINE':
                    match_score += 25
                    year_present = 1
                else:
                    # Magazines need the full date, not just the year.
                    self.magazine = 1
                    if parser.parse(doc['date']).month == self.month:
                        if parser.parse(doc['date']).day == self.day:
                            match_score += 25
        if 'creator' in doc.keys() and self.author not in (None, ""):
            creator = doc['creator'][0].encode("utf-8")
            creator_similarity = difflib.SequenceMatcher(
                None, self.author.lower(), creator.lower()).ratio()
            match_score += 12 * creator_similarity
            creator_present = 1
        if 'publisher' in doc.keys() and self.publisher not in (None, ""):
            publisher = doc['publisher'][0].encode("utf-8")
            publisher_similarity = difflib.SequenceMatcher(
                None, self.publisher.lower(), publisher.lower()).ratio()
            match_score += 6 * publisher_similarity
        if 'language' in doc.keys() and self.language not in (None, ""):
            l = doc['language'][0].encode("utf-8")
            if len(l) < 5:
                try:
                    language = lang_code(l)
                except Exception:
                    language = l
            else:
                language = l
            lang_similarity = difflib.SequenceMatcher(
                None, self.language.lower(), language.lower()).ratio()
            match_score += 3 * lang_similarity
        if self.magazine == 0:
            threshold_score = (0.7 * 80 + 25 * year_present +
                               (1 - year_present) *
                               (0.5 * 12 * creator_present +
                                0.7 * 6 * (1 - creator_present)))
        else:
            threshold_score = 0.7 * 80 + 25
        # Accept the candidate only if its score clears the threshold.
        if match_score >= threshold_score:
            match_list.append(doc['identifier'])
    if match_list != []:
        ia_response_key = self.book_key + ":ia_response"
        redis_py.set(ia_response_key, 1, True)
        return match_list
    ia_response_key = self.book_key + ":ia_response"
    redis_py.set(ia_response_key, 0, True)
    return False

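# Illustrative sketch of the per-field scoring in check_in_IA: difflib's
# SequenceMatcher ratio scaled by the field weight (50 title, 25 date,
# 20 source, 12 creator, 6 publisher, 3 language). Sample strings are
# hypothetical.
def _example_title_score():
    ratio = difflib.SequenceMatcher(None, "a tale of two cities",
                                    "the tale of two cities").ratio()
    return 50 * ratio  # the title's contribution to match_score
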