def upload_to_IA(self, library, Id):
    """Upload book to IA with appropriate metadata.

    Builds the Internet Archive metadata dict from instance attributes,
    uploads the cached output file, and retries once with a fallback
    identifier if the first upload raises.  The local file is removed
    afterwards (best effort).  Returns the status from item.upload().
    """
    if self.ia_identifier is None:
        item = self.get_valid_identifier()
        self.ia_identifier = item.identifier
    else:
        item = ia.get_item(self.ia_identifier)
    language_from_input = redis_py.get(self.book_key + ":language", True)
    # Characters stripped from free-text metadata fields; defined once
    # instead of repeating the pattern for title and description.
    strip_pattern = r"""[!#\n|^\\\"~()\[\]:\-/]"""
    metadata = dict(
        mediatype="text",
        creator=self.author,
        title=re.sub(strip_pattern, '', self.title)[:330],
        publisher=self.publisher,
        description=re.sub(strip_pattern, '', self.description),
        source=self.infoLink,
        language=self.language if language_from_input in (None, "")
        else language_from_input,
        year=self.year,
        date=self.publishedDate,
        subject="bub_upload",
        licenseurl="http://creativecommons.org/publicdomain/mark/1.0/"
        if self.publicDomain == True else "",
        scanner=self.scanner,
        sponsor=self.sponser,
        uploader="bub")
    metadata['google-id'] = self.Id if self.library == 'gb' else ""
    filename = redis_py.get(self.redis_output_file_key, True)
    self.filename = filename
    S3_access_key = keys.S3_access_key
    S3_secret_key = keys.S3_secret_key
    try:
        status = item.upload(filename, access_key=S3_access_key,
                             secret_key=S3_secret_key, metadata=metadata)
    except Exception:
        # The chosen identifier may be unusable; retry once with a
        # non-primary fallback identifier.
        item = self.get_valid_identifier(primary=False)
        self.ia_identifier = item.identifier
        status = item.upload(filename, access_key=S3_access_key,
                             secret_key=S3_secret_key, metadata=metadata)
    try:
        # Argument list instead of a shell-interpolated string: avoids
        # spawning a shell and injection via characters in `filename`.
        subprocess.check_call(["rm", filename])
    except (subprocess.CalledProcessError, OSError):
        log.write("%s Command rm %s failed" % (datetime.now(), filename))
        log.flush()
    return status
def verify_id(Id_string):
    """Verify the Id and public-domain status for the book.

    Return codes: 0 = ok, 1 = unknown id / 404, 8 = no PDF link found,
    10 = network failure or unexpected HTTP status.
    """
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id is None:
        return 1
    # NOTE(review): `redis` is not used below; kept in case Redis()
    # has connection side effects — confirm before removing.
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" % (redis_key3, 'usp', Id_string)
    library_url_key = book_key + ":library_url"
    url = redis_py.get(library_url_key, True)
    url = get_absolute_url_of_book(url)
    try:
        r = requests.get(url)
    except requests.exceptions.RequestException:
        return 10
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    source = r.text
    soup = BeautifulSoup(source)
    # A DC.relation public-domain check existed here but was disabled
    # in the original code.
    tld = extract_base_domain(url)
    if tld.endswith('/'):
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    # get_pdf_link can also yield "" or None (see download_book); any
    # falsy result means there is no usable PDF link.
    if not pdf_url:
        return 8
    return 0
def upload_to_IA(self, library, Id):
    """Upload book to IA with appropriate metadata.

    Assembles the Internet Archive metadata from instance attributes,
    uploads the cached output file (retrying once with a fallback
    identifier on failure), then removes the local file best-effort.
    Returns the status from item.upload().
    """
    if self.ia_identifier is None:
        item = self.get_valid_identifier()
        self.ia_identifier = item.identifier
    else:
        item = ia.get_item(self.ia_identifier)
    language_from_input = redis_py.get(self.book_key + ":language", True)
    # Single definition of the characters stripped from free-text
    # metadata (title and description share the same pattern).
    strip_pattern = r"""[!#\n|^\\\"~()\[\]:\-/]"""
    metadata = dict(
        mediatype="text",
        creator=self.author,
        title=re.sub(strip_pattern, '', self.title)[:330],
        publisher=self.publisher,
        description=re.sub(strip_pattern, '', self.description),
        source=self.infoLink,
        language=self.language if language_from_input in (None, "")
        else language_from_input,
        year=self.year,
        date=self.publishedDate,
        subject="bub_upload",
        licenseurl="http://creativecommons.org/publicdomain/mark/1.0/"
        if self.publicDomain == True else "",
        scanner=self.scanner,
        sponsor=self.sponser,
        uploader="bub")
    metadata['google-id'] = self.Id if self.library == 'gb' else ""
    filename = redis_py.get(self.redis_output_file_key, True)
    self.filename = filename
    S3_access_key = keys.S3_access_key
    S3_secret_key = keys.S3_secret_key
    try:
        status = item.upload(filename, access_key=S3_access_key,
                             secret_key=S3_secret_key, metadata=metadata)
    except Exception:
        # Retry with a non-primary identifier if the first upload fails.
        item = self.get_valid_identifier(primary=False)
        self.ia_identifier = item.identifier
        status = item.upload(filename, access_key=S3_access_key,
                             secret_key=S3_secret_key, metadata=metadata)
    try:
        # Argument-list form: no shell, no injection via `filename`.
        subprocess.check_call(["rm", filename])
    except (subprocess.CalledProcessError, OSError):
        log.write("%s Command rm %s failed" % (datetime.now(), filename))
        log.flush()
    return status
def metadata(Id):
    """Return book information and meta-data.

    Scrapes OAI Dublin-Core meta tags from the library page.  Returns a
    metadata dict on success, 1 when the page is missing or the request
    fails, and 10 for any other non-200 HTTP status.
    """
    # NOTE(review): `redis` is not used below; kept in case Redis()
    # has connection side effects — confirm before removing.
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" % (redis_key3, 'usp', Id)
    library_url_key = book_key + ":library_url"
    url = redis_py.get(library_url_key, True)
    url = get_absolute_url_of_book(url)
    try:
        r = requests.get(url)
    except requests.exceptions.RequestException:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10
    source = r.text
    # The site serves a Portuguese "page not found" body with HTTP 200.
    if "Página não encontrada".decode('utf-8') in source:
        return 1
    soup = BeautifulSoup(source)
    # A DC.relation public-domain check existed here but was disabled
    # in the original code.
    thumbnail_url = extract_thumbnail_url(soup, url)
    return dict(
        image_url=thumbnail_url,
        thumbnail_url=thumbnail_url,
        printType="BOOK",
        title=OAI_metadata_content("DC.title", soup),
        subtitle="",
        author=OAI_metadata_content("DC.creator", soup),
        publisher=OAI_metadata_content("DC.publisher", soup),
        publishedDate=OAI_metadata_content("DCTERMS.issued", soup),
        description=OAI_metadata_content("DC.description", soup),
        infoLink=url,
        publicDomain=True,
        language=normalize_to_ascii(OAI_metadata_content("DC.language", soup)),
        scanner=extract_base_domain(url),
        sponser=extract_base_domain(url)
    )
def download_book(Id, id_for_key):
    """Download the book's PDF into the shared scratch directory.

    Returns 0 on success, 1 when the library page cannot be fetched or
    no PDF link can be resolved.
    """
    # NOTE(review): `redis` is not used below; kept in case Redis()
    # has connection side effects — confirm before removing.
    redis = redis_py.Redis()
    redis_key3 = keys.redis_key3
    book_key = "%s:%s:%s" % (redis_key3, 'usp', Id)
    library_url_key = book_key + ":library_url"
    url = redis_py.get(library_url_key, True)
    url = get_absolute_url_of_book(url)
    try:
        r = requests.get(url)
    except requests.exceptions.RequestException:
        return 1
    source = r.text
    soup = BeautifulSoup(source)
    tld = extract_base_domain(url)
    if tld.endswith('/'):
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    # Bug fix: get_pdf_link can return False (see verify_id), which the
    # old `in ("", None)` test let through to requests.get(False).
    if not pdf_url:
        return 1
    pdf = requests.get(pdf_url, stream=True)
    output_file = "/data/scratch/BUB_downloads/bub_usp_%s.pdf" % Id
    # store_output_file_name(id_for_key, output_file)  # disabled in original
    with open(output_file, 'wb') as f:
        for chunk in pdf.iter_content(1024):
            f.write(chunk)
    return 0
def stored_copy_check(self):
    """Check if book already uploaded by the tool.

    Returns True when the cached upload-progress flag is '1',
    otherwise None.
    """
    progress_flag = redis_py.get(self.book_key + ":upload_progress", True)
    return True if progress_flag == '1' else None
def __init__(self, value):
    """Assign variables and get metadata from cache.

    ``value`` is either a numeric request serial number (the book is
    looked up in the MySQL ``request`` table) or a dict carrying
    'library', 'Id' and 'ia_identifier_suffix' keys.
    """
    redis_key3 = keys.redis_key3
    self.redis_key3 = redis_key3
    self.redis = redis_py.Redis()
    if isinstance(value, (int, long, float, complex)):
        # Numeric value: resolve library/book id from the request table.
        db = mysql_py.Db()
        values = db.execute(
            'select library, book_id from request where sno = %s;',
            value)[0]
        db.close()
        self.library = values[0]
        self.Id = values[1].encode('utf-8')
        self.book_key = "%s:%s:%s" % (redis_key3, self.library, self.Id)
        self.redis.set(redis_key3 + ":ongoing_job_identifier", self.Id)
        self.ia_identifier = None
        self.id_for_key = self.Id
    else:
        suffix = value['ia_identifier_suffix']
        self.library = value['library']
        self.Id = value['Id']
        self.ia_identifier = "bub_" + self.library + "_" + suffix
        self.book_key = "%s:%s:%s" % (redis_key3, self.library, suffix)
        self.redis.set(redis_key3 + ":ongoing_job_identifier", suffix)
        self.id_for_key = suffix
    if '/' not in self.id_for_key:
        file_key_id = self.id_for_key
    else:
        # Ids containing '/' are hashed — presumably to keep the redis
        # key free of path-like segments; confirm before changing.
        file_key_id = hashlib.md5(self.id_for_key).hexdigest()
    self.redis_output_file_key = "%s:%s:%s:output_file" % (
        redis_key3, self.library, file_key_id)
    self.library_name = bridge.lib_module(self.library)[1]
    metadata_key = self.book_key + ":meta_data"
    metadata = redis_py.get(metadata_key, True)
    info = json.loads(metadata)
    try:
        self.title = (info['title'].encode("utf-8") + " " +
                      info['subtitle'].encode("utf-8"))
    except Exception:
        # Subtitle may be missing or non-encodable; fall back to str().
        self.title = str(info['title'].encode("utf-8")) + " " + str(
            info['subtitle'])
    self.author = info['author'].encode("utf-8")
    self.publisher = info['publisher'].encode("utf-8")
    self.description = info['description'].replace("\n", "").encode("utf-8")
    self.printType = info['printType'].encode("utf-8")
    # Keep only date-like characters (digits and / . -).
    self.publishedDate = re.sub("[^0123456789/.-]", "",
                                info['publishedDate'].encode("utf-8"))
    self.infoLink = info['infoLink']
    self.publicDomain = info['publicDomain']
    language_code = info['language'].encode("utf-8")
    self.year = ""
    self.month = ""
    self.day = ""
    if self.publishedDate not in (None, ""):
        try:
            # NOTE(review): 'x' and '?' were already stripped by the
            # re.sub above, so this substitution is a kept-over no-op.
            self.publishedDate = re.sub('[x?]', '0', self.publishedDate)
            # Parse once instead of three times.
            parsed = parser.parse(self.publishedDate)
            self.year = parsed.year
            self.month = parsed.month
            self.day = parsed.day
        except Exception:
            self.year = ""
            self.month = ""
            self.day = ""
    try:
        self.language = lang_code(language_code)
    except Exception:
        self.language = ""
    self.pdf_path = "/data/scratch/BUB_downloads/bub_%s_%s.pdf" % (
        self.library, self.Id)
    self.scanner = info['scanner']
    self.sponser = info['sponser']
def __init__(self, value):
    """Assign variables and get metadata from cache.

    ``value`` is either a numeric request serial number (the book is
    looked up in the MySQL ``request`` table) or a dict carrying
    'library', 'Id' and 'ia_identifier_suffix' keys.
    """
    redis_key3 = keys.redis_key3
    self.redis_key3 = redis_key3
    self.redis = redis_py.Redis()
    if isinstance(value, (int, long, float, complex)):
        # Numeric value: resolve library/book id from the request table.
        db = mysql_py.Db()
        values = db.execute(
            'select library, book_id from request where sno = %s;',
            value)[0]
        db.close()
        self.library = values[0]
        self.Id = values[1].encode('utf-8')
        self.book_key = "%s:%s:%s" % (redis_key3, self.library, self.Id)
        self.redis.set(redis_key3 + ":ongoing_job_identifier", self.Id)
        self.ia_identifier = None
        self.id_for_key = self.Id
    else:
        suffix = value['ia_identifier_suffix']
        self.library = value['library']
        self.Id = value['Id']
        self.ia_identifier = "bub_" + self.library + "_" + suffix
        self.book_key = "%s:%s:%s" % (redis_key3, self.library, suffix)
        self.redis.set(redis_key3 + ":ongoing_job_identifier", suffix)
        self.id_for_key = suffix
    if '/' not in self.id_for_key:
        file_key_id = self.id_for_key
    else:
        # Ids containing '/' are hashed — presumably to keep the redis
        # key free of path-like segments; confirm before changing.
        file_key_id = hashlib.md5(self.id_for_key).hexdigest()
    self.redis_output_file_key = "%s:%s:%s:output_file" % (
        redis_key3, self.library, file_key_id)
    self.library_name = bridge.lib_module(self.library)[1]
    metadata_key = self.book_key + ":meta_data"
    metadata = redis_py.get(metadata_key, True)
    info = json.loads(metadata)
    try:
        self.title = (info['title'].encode("utf-8") + " " +
                      info['subtitle'].encode("utf-8"))
    except Exception:
        # Subtitle may be missing or non-encodable; fall back to str().
        self.title = str(info['title'].encode("utf-8")) + " " + str(
            info['subtitle'])
    self.author = info['author'].encode("utf-8")
    self.publisher = info['publisher'].encode("utf-8")
    self.description = info['description'].replace("\n", "").encode("utf-8")
    self.printType = info['printType'].encode("utf-8")
    # Keep only date-like characters (digits and / . -).
    self.publishedDate = re.sub("[^0123456789/.-]", "",
                                info['publishedDate'].encode("utf-8"))
    self.infoLink = info['infoLink']
    self.publicDomain = info['publicDomain']
    language_code = info['language'].encode("utf-8")
    self.year = ""
    self.month = ""
    self.day = ""
    if self.publishedDate not in (None, ""):
        try:
            # NOTE(review): 'x' and '?' were already stripped by the
            # re.sub above, so this substitution is a kept-over no-op.
            self.publishedDate = re.sub('[x?]', '0', self.publishedDate)
            # Parse once instead of three times.
            parsed = parser.parse(self.publishedDate)
            self.year = parsed.year
            self.month = parsed.month
            self.day = parsed.day
        except Exception:
            self.year = ""
            self.month = ""
            self.day = ""
    try:
        self.language = lang_code(language_code)
    except Exception:
        self.language = ""
    self.pdf_path = "/data/scratch/BUB_downloads/bub_%s_%s.pdf" % (
        self.library, self.Id)
    self.scanner = info['scanner']
    self.sponser = info['sponser']