async def check(self, entry):
    length = self._cfg.get('isbn_length', entry, 13)
    if not length:
        return []
    isbn = entry.data.get('isbn')
    if not isbn:
        return []
    clean_isbn = clean(isbn)
    if not clean_isbn or notisbn(clean_isbn):
        return []
    if length not in (10, 13):
        raise ConfigurationError(
            "The option 'isbn_length' must be either 10 or 13.")
    if length == 10:
        if not is_isbn10(clean_isbn):
            return [(type(self).NAME,
                     "ISBN '{}' is not of length 10.".format(isbn),
                     "ISBN-10 would be '{}'".format(to_isbn10(clean_isbn)))]
    elif length == 13:
        if not is_isbn13(clean_isbn):
            return [(type(self).NAME,
                     "ISBN '{}' is not of length 13.".format(isbn),
                     "ISBN-13 would be '{}'".format(to_isbn13(clean_isbn)))]
    return []
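A quick, self-contained illustration of the isbnlib conversions this check leans on; the sample numbers are an arbitrary valid ISBN-13/ISBN-10 pair for one book, not taken from the checker.

# Sketch only: arbitrary sample ISBNs, not from the checker above.
import isbnlib

isbn13 = '9780306406157'
assert isbnlib.is_isbn13(isbn13)
isbn10 = isbnlib.to_isbn10(isbn13)   # '0306406152'
assert isbnlib.is_isbn10(isbn10)
# For 978-prefixed ISBNs the conversion round-trips losslessly.
assert isbnlib.to_isbn13(isbn10) == isbn13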
def isbn(self, isbn):
    """Adds a book identified by ISBN to the Google spreadsheet."""
    # Check that the ISBN is valid.
    clean_isbn = isbnlib.clean(isbn)
    if isbnlib.notisbn(clean_isbn):
        return "not valid isbn"
    # TODO: should check whether it has been collected before.
    # First check Trove.
    canonical = self.trove.extract(clean_isbn)
    if not canonical:
        # Try the alternative ISBN form (ISBN-10 <-> ISBN-13).
        print("trying alternative form")
        if isbnlib.is_isbn13(clean_isbn):
            alt_isbn = isbnlib.to_isbn10(clean_isbn)
        else:
            alt_isbn = isbnlib.to_isbn13(clean_isbn)
        canonical = self.trove.extract(alt_isbn)
        if canonical:
            clean_isbn = alt_isbn
    if not canonical:
        canonical = self.__reduce_metadata(clean_isbn, ['merge', 'isbndb', 'openl'])
        if not canonical:
            return "no metadata found for isbn: " + clean_isbn
        canonical['source'] = 'isbnlib'
    canonical["Authors"] = ', '.join(canonical["Authors"])
    canonical['link'] = None
    row_data = ['isbn:' + clean_isbn, canonical["Title"], canonical["Authors"],
                canonical["Year"], canonical["Publisher"], canonical['link']]
    return self.__add_and_render(row_data)
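The alternative-form fallback above generalizes to any lookup function; a minimal sketch, where `lookup` is a hypothetical stand-in for self.trove.extract:

import isbnlib

def find_with_fallback(isbn, lookup):
    # lookup is a stand-in for self.trove.extract (assumption, not the real API).
    isbn = isbnlib.clean(isbn)
    if isbnlib.notisbn(isbn):
        return None, None
    if isbnlib.is_isbn13(isbn):
        alt = isbnlib.to_isbn10(isbn)
    else:
        alt = isbnlib.to_isbn13(isbn)
    for candidate in (isbn, alt):
        if not candidate:  # conversion can fail (e.g. 979-prefixed ISBN-13)
            continue
        record = lookup(candidate)
        if record:
            return candidate, record
    return isbn, None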
def main():
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-apikey', '--isbndbkey', help="Insert ISBNDB apikey")
    args = parser.parse_args()
    if args.isbndbkey:
        print(args.isbndbkey)
    else:
        print('NO MANUAL API KEY')
    for x in range(len(GCISPAR)):
        try:
            # Extracts book identifier from GCIS
            IDEN = GCISPAR[x]["identifier"]
            match = re.search(r'.*/(.*?)\..*?$', GCIS)
            if match:
                FILETYPE = match.groups()[0]
                # HREF = url that leads to book.json in GCIS-DEV
                HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
                    FILETYPE, IDEN)
                HREFPAR = parse(HREF)
                # Extracts book title and isbn from GCIS-DEV
                d = dict(HREFPAR)
                TITLE = d['title']
                ISBNS = d['isbn']
                # Cleans ISBNS to only contain valid characters
                CISBN = clean(ISBNS)
                # V13 = validated canonical ISBN-13
                V13 = EAN13(CISBN)
                if V13 is None:
                    V13 = canonical(CISBN)
                M = parse(HREF)
                apikey = args.isbndbkey
                if args.isbndbkey is None:
                    apikey = 'XOATAY1G'
                data = 'http://isbndb.com/api/v2/json/{}/book/{}'.format(
                    apikey, V13)
                v = parse(data)
                GCISDATA = "GCIS-DEV\n\n\t{}\n\n\tisbn_original:{}\n\n\tisbn_mod:{}\n\n".format(
                    M, ISBNS, V13)
                APIDATA = "ISBNDB\n\n\t{}\n\n------------\n\n".format(v)
                print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS,
                      '\n\n\t', "isbn_mod:", V13, "\n\n")
                print("ISBNDB\n\n\t", v, '\n\n')
                if v['error']:
                    file.write(v['error'] + "\n")
                # Writing metadata onto file2
                file2.write(GCISDATA)
                file2.write(APIDATA)
        except Exception:
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(
                TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
async def check(self, entry):
    fmt = self._cfg.get('isbn_format', entry)
    if not fmt:
        return []
    isbn = entry.data.get('isbn')
    if not isbn:
        return []
    clean_isbn = clean(isbn)
    if not clean_isbn or notisbn(clean_isbn):
        return []
    if fmt not in ('canonical', 'masked'):
        raise ConfigurationError(
            "The option 'isbn_format' must be either 'canonical' or 'masked'.")
    if fmt == 'canonical':
        cisbn = canonical(clean_isbn)
        if cisbn != isbn:
            return [(type(self).NAME,
                     "ISBN '{}' is not in canonical format.".format(isbn),
                     "Canonical format would be '{}'".format(cisbn))]
    elif fmt == 'masked':
        misbn = mask(clean_isbn)
        if misbn != isbn:
            return [(type(self).NAME,
                     "ISBN '{}' is not in masked format.".format(isbn),
                     "Masked format would be '{}'".format(misbn))]
    return []
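For reference, the two formats the check distinguishes, shown on an arbitrary valid ISBN:

import isbnlib

# Sketch only: arbitrary sample ISBN.
isbn = isbnlib.canonical('978 0 306 40615 7')
print(isbn)                # '9780306406157'  (canonical: digits only)
print(isbnlib.mask(isbn))  # '978-0-306-40615-7'  (masked: hyphenated)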
def validate_book_data(book_data):
    """Checks whether the given book data is valid.

    Args:
        book_data (dict): The book data to validate. Required keys are
            'isbn', 'title', and 'authors'.

    Raises:
        InvalidRequest: If 'isbn', 'title', and 'authors' do not appear as
            keys in book_data.
        InvalidRequest: If the value for key 'authors' is not a list of
            strings.
        InvalidRequest: If the isbn is not valid. See
            https://en.wikipedia.org/wiki/International_Standard_Book_Number#Check_digits
        ResourceExists: If a book with the provided isbn already exists.

    Returns:
        dict: The validated book data.
    """
    isbn = book_data.get("isbn")
    title = book_data.get("title")
    authors = book_data.get("authors")
    # Ensure request is valid format
    if not (title and isbn and authors):
        raise InvalidRequest(
            "Request should be of the form "
            "{isbn: 'isbn', title: 'title', authors: [author1, author2,]}")
    # Check if isbn is valid
    if not (isbn := isbnlib.to_isbn13(isbnlib.clean(isbn))):
        raise InvalidRequest(
            "The isbn provided is not valid or could not be converted into isbn-13 format")
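The walrus line does double duty, validation and normalization, because to_isbn13 returns a falsy value for anything that is not a real ISBN; a sketch with arbitrary inputs:

import isbnlib

print(isbnlib.to_isbn13(isbnlib.clean('0-306-40615-2')))       # '9780306406157'
print(bool(isbnlib.to_isbn13(isbnlib.clean('no isbn here'))))  # False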
def get_title(isbn_numbers):
    """Return the title for the first valid ISBN in isbn_numbers, if any."""
    for isbn in isbn_numbers:
        cleaned_isbn = isbnlib.clean(isbn)
        if isbnlib.is_isbn13(cleaned_isbn) or isbnlib.is_isbn10(cleaned_isbn):
            book = isbnlib.meta(cleaned_isbn)
            return book['Title']
    return None
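For context, a successful isbnlib.meta() call (a network lookup) returns a plain dict with these standard keys, which is why the ['Title'] access above is safe on any hit; the values here are placeholders, not fetched data:

record = {
    'ISBN-13': '9780306406157',  # placeholder values, not fetched
    'Title': '...',
    'Authors': ['...'],
    'Publisher': '...',
    'Year': '...',
    'Language': '...',
}
print(record['Title'])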
def isISBN(input_str):
    '''
    Checks if the given string is an ISBN number.
    '''
    if isbnlib.is_isbn10(input_str):
        return True
    if isbnlib.is_isbn13(input_str):
        return True
    input_str_clean = isbnlib.clean(input_str)
    if isbnlib.is_isbn10(input_str_clean):
        return True
    if isbnlib.is_isbn13(input_str_clean):
        return True
    return False
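Spot checks for isISBN, using arbitrary known-valid numbers; the hyphenated form also passes via the clean() branch:

assert isISBN('0306406152')         # valid ISBN-10
assert isISBN('9780306406157')      # valid ISBN-13
assert isISBN('978-0-306-40615-7')  # valid once clean() strips the hyphens
assert not isISBN('1234567890')     # bad check digit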
def update_catalog():
    file = 'updatedlibrarycatalog.csv'
    catalog = pd.read_csv(file)
    regFormat = bibformatters['default']
    provider = 'loc'
    prevMillis = int(round(time.time() * 1000))
    print(catalog.shape)
    print()
    n = 0  # successful lookups
    t = 0  # total rows processed
    f = 0  # failed lookups
    for isbnNumber in catalog['ISBN']:
        try:
            finalISBN = str(int(isbnNumber))
            cleanISBN = isbn.clean(finalISBN)
            FINAL_ISBN = cleanISBN
            try:
                # Try the ISBN-13 against each metadata service in turn.
                try:
                    isbnInfo = meta(cleanISBN, service=provider)
                    regFormat(isbnInfo)
                except Exception:
                    print('Library of Congress Service Does Not Work')
                    try:
                        provider = 'goob'
                        isbnInfo = meta(cleanISBN, service=provider)
                        regFormat(isbnInfo)
                    except Exception:
                        print('Google Service Does Not Work')
                        try:
                            provider = 'openl'
                            isbnInfo = meta(cleanISBN, service=provider)
                            regFormat(isbnInfo)
                        except Exception:
                            print('OpenL Service Does Not Work')
                            raise ValueError()
            except Exception:
                # Fall back to the ISBN_10 column for this row.
                print("ISBN13 Num Does Not Work")
                tempValue = catalog.loc[catalog['ISBN'] == isbnNumber]['ISBN_10'].values
                isbn10Num = tempValue[0]
                finalISBN10 = str(int(float(isbn10Num)))
                cleanISBN10 = isbn.clean(finalISBN10)
                FINAL_ISBN = cleanISBN10
                try:
                    provider = 'loc'
                    isbnInfo = meta(cleanISBN10, service=provider)
                    regFormat(isbnInfo)
                except Exception:
                    print('Library of Congress Service Does Not Work')
                    try:
                        provider = 'goob'
                        isbnInfo = meta(cleanISBN10, service=provider)
                        regFormat(isbnInfo)
                    except Exception:
                        print('Google Service Does Not Work')
                        try:
                            isbnInfo = meta(cleanISBN10, service='openl')
                            regFormat(isbnInfo)
                        except Exception:
                            print('OpenL Service Does Not Work')
                            # Last resort: dump the search page for manual inspection.
                            websiteURL = "https://isbnsearch.org/search?s=" + cleanISBN
                            website = urllib.request.urlopen(websiteURL)
                            mybytes = website.read()
                            dataFromWebsite = mybytes.decode("utf8")
                            website.close()
                            print(dataFromWebsite)
                            raise ValueError()
            print(regFormat(isbnInfo))
            n = n + 1
            catalog.loc[t, 'ISBN'] = isbnNumber
            bookTitle = isbnInfo['Title']
            catalog.loc[t, 'Title'] = bookTitle
            bookLanguage = isbnInfo['Language']
            catalog.loc[t, 'Language'] = bookLanguage
            bookPublisher = isbnInfo['Publisher']
            catalog.loc[t, 'Publisher'] = bookPublisher
            bookYear = isbnInfo['Year']
            catalog.loc[t, 'Year'] = bookYear
            bookAuthor = isbnInfo['Authors'][0]
            catalog.loc[t, 'Authors'] = bookAuthor
            try:
                bookDesc = isbn.desc(FINAL_ISBN)
                catalog.loc[t, 'Description'] = bookDesc
            except Exception:
                print('Could not extract book description')
            try:
                bookCover = isbn.cover(FINAL_ISBN)
                catalog.loc[t, 'Cover'] = bookCover['thumbnail']
            except Exception:
                print('Could not extract book cover link')
        except Exception:
            print("This ISBN Number is NOT Valid")
            f = f + 1
        t = t + 1
        # Periodically persist progress (at most once a minute).
        currentMillis = int(round(time.time() * 1000))
        if currentMillis - prevMillis >= 60000:
            print("Saving File to Local and Remote Server")
            catalog.to_csv('updatedlibrarycatalog.csv')
            ftp.push_file_to_server()
            prevMillis = int(round(time.time() * 1000))
            print("File has Successfully Saved")
            print("\n")
    print('Total Number of Books: ' + str(t))
    print('Successful Books: ' + str(n))
    print('Unsuccessful Books: ' + str(f))
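The nested try/except ladder above can be expressed as one loop over services; a sketch assuming the same isbnlib meta() service identifiers the original uses (availability varies by isbnlib version):

from isbnlib import meta

def fetch_meta(isbn13, services=('loc', 'goob', 'openl')):
    # Returns the first record any service yields, else None.
    for service in services:
        try:
            return meta(isbn13, service=service)
        except Exception:
            print('%s service did not work' % service)
    return None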
def main():
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-path',
        '--GCIS',
        help="Insert url path to GCIS book in JSON format "
             "[ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1']")
    args = parser.parse_args()
    GCIS = args.GCIS
    if GCIS is None:
        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
        print('NO MANUAL GCIS PATH\n ALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT')
    GCISPAR = parse(GCIS)
    for x in range(len(GCISPAR)):
        try:
            # Extracts book identifier from GCIS
            IDEN = GCISPAR[x]["identifier"]
            match = re.search(r'.*/(.*?)\..*?$', GCIS)
            if match:
                FILETYPE = match.groups()[0]
                # HREF = url that leads to book.json in GCIS-DEV
                HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
                    FILETYPE, IDEN)
                HREFPAR = parse(HREF)
                # Extracts book title and isbn from GCIS-DEV
                d = dict(HREFPAR)
                TITLE = d['title']
                ISBNS = d['isbn']
                # Cleans ISBNS to only contain valid characters
                CISBN = clean(ISBNS)
                # V13 = validated canonical ISBN-13
                V13 = EAN13(CISBN)
                if V13 is None:
                    V13 = canonical(CISBN)
                M = parse(HREF)
                print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS,
                      '\n\n\t', "isbn_mod:", V13, "\n\n")
                # DBpedia ISBN formats
                a = ISBNS
                b = canonical(CISBN)
                c = to_isbn10(CISBN)
                d = hyphenate(to_isbn10(CISBN))
                e = to_isbn13(CISBN)
                f = hyphenate(to_isbn13(CISBN))
                g = V13
                h = "ISBN {}".format(CISBN)
                i = "ISBN {}".format(canonical(CISBN))
                j = "ISBN {}".format(hyphenate(to_isbn13(CISBN)))
                k = "ISBN {}".format(V13)
                l = "ISBN {}".format(to_isbn10(CISBN))
                m = "ISBN {}".format(hyphenate(to_isbn10(CISBN)))
                tests = [a, b, c, d, e, f, g, h, i, j, k, l, m]
                for indie in tests:
                    r = QUERY % indie
                    result = RQUERY(r)
                    if len(result) != 0:
                        print(result)
                        break
        except Exception:
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(
                TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
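The thirteen single-letter variables above are all permutations of one ISBN; a table-driven equivalent (a sketch, assuming the same clean/canonical/to_isbn10/to_isbn13 helpers and the original's hyphenate are in scope), after which the loop is just `for indie in isbn_variants(CISBN, V13, ISBNS):`

def isbn_variants(cisbn, v13, original):
    # Same thirteen candidates as a..m above, in the same order.
    bare = [original, canonical(cisbn), to_isbn10(cisbn),
            hyphenate(to_isbn10(cisbn)), to_isbn13(cisbn),
            hyphenate(to_isbn13(cisbn)), v13]
    prefixed = ['ISBN {}'.format(v) for v in
                (cisbn, canonical(cisbn), hyphenate(to_isbn13(cisbn)),
                 v13, to_isbn10(cisbn), hyphenate(to_isbn10(cisbn)))]
    return bare + prefixed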
def main():
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-log', '--login',
                        help="Route path to Gcis.conf YAML file")
    parser.add_argument(
        '-url',
        '--gcis',
        help='INSERT EITHER: https://data.globalchange.gov OR '
             'https://gcis-search-stage.jpl.net:3000')
    parser.add_argument('-name', '--username', help="Insert GCIS username")
    parser.add_argument('-pw', '--apikey',
                        help="Insert GCIS username's api key")
    args = parser.parse_args()
    gcis = 'https://data.globalchange.gov'
    gcisdev = 'https://gcis-search-stage.jpl.net:3000'
    # Extracts login info from Gcis.conf
    diction = {'url': None, 'userinfo': None, 'key': None}
    if args.login:
        with open(args.login, "r") as a:
            diction = yaml.safe_load(a)[0]
        path = diction['url']
        user = diction['userinfo']
        key = diction['key']
        print(path + '\n' + user + '\n' + key)
    if args.gcis == gcis:
        print(args.gcis)
    elif args.gcis == gcisdev:
        print(args.gcis)
    else:
        print('NO MANUAL ENDPOINT (Ignore if using Config file)')
    if args.username:
        print(args.username)
    else:
        print('NO MANUAL USERNAME (Ignore if using Config file)')
    if args.apikey:
        print(args.apikey)
    else:
        print('NO MANUAL API KEY (Ignore if using Config file)')
    # Credentials: prefer the config file, fall back to the command line.
    path = diction['url']
    if diction['url'] is None:
        path = args.gcis
    user = diction['userinfo']
    if diction['userinfo'] is None:
        user = args.username
    key = diction['key']
    if diction['key'] is None:
        key = args.apikey

    # Parses url.json
    def parse(url):
        import requests
        r = requests.get(url, verify=False)
        JSONdict = r.json()
        return JSONdict

    GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
    GCISPAR = parse(GCIS)
    for x in range(len(GCISPAR)):
        # Extracts book identifier from GCIS
        IDEN = GCISPAR[x]["identifier"]
        match = re.search(r'.*/(.*?)\..*?$', GCIS)
        if match:
            FILETYPE = match.groups()[0]
            # HREF = url that leads to book.json in GCIS-DEV
            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
                FILETYPE, IDEN)
            # HREF for either GCIS or GCIS-DEV
            #HREF = '{}//{}/{}.json'.format(path, FILETYPE, IDEN)
            # test
            #HREF = 'https://gcis-search-stage.jpl.net:3000/book/305e4144-39d2-4d84-8843-3f502ab890e0.json'
            HREFPAR = parse(HREF)
            print(HREFPAR)
            # Extracts book title and isbn from GCIS-DEV
            d = dict(HREFPAR)
            TITLE = d['title']
            ISBNS = d['isbn']
            # Cleans ISBNS to only contain valid characters
            CISBN = clean(ISBNS)
            # V13 = validated canonical ISBN-13
            V13 = EAN13(CISBN)
            if V13 is None:
                V13 = canonical(CISBN)
            M = parse(HREF)
            # For possible future implementation of adding the original isbn
            # into the JSON dictionary:
            """M["isbn"] = V13
            M["org_isbn"] = ISBNS"""
            print(M, '\n\t', "isbn_original:", ISBNS)
            # Posts the updated JSON dictionary back into GCIS-DEV using
            # credentials from the command line arguments.
            s = requests.Session()
            s.auth = (user, key)
            s.headers.update({'Accept': 'application/json'})
            r = s.post(HREF, data=M, verify=False)
            r.raise_for_status()
            sys.exit()
def main():
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-awsid', '--AWSAccessKeyID',
                        help="Insert AWS Access Key ID")
    parser.add_argument('-astag', '--AssociateTag',
                        help="Insert Amazon Associate Tag")
    parser.add_argument(
        '-path',
        '--GCIS',
        help="Insert url path to GCIS book in JSON format "
             "[ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1']")
    args = parser.parse_args()
    GCIS = args.GCIS
    if args.AWSAccessKeyID:
        print(args.AWSAccessKeyID)
    else:
        print('NO AWS Access Key ID')
    if args.AssociateTag:
        print(args.AssociateTag)
    else:
        print('NO Amazon Associate Tag')
    if GCIS is None:
        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
        print('NO MANUAL GCIS PATH\n ALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT')
    GCISPAR = parse(GCIS)
    for x in range(len(GCISPAR)):
        try:
            # Extracts book identifier from GCIS
            IDEN = GCISPAR[x]["identifier"]
            match = re.search(r'.*/(.*?)\..*?$', GCIS)
            if match:
                FILETYPE = match.groups()[0]
                # HREF = url that leads to book.json in GCIS-DEV
                HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
                    FILETYPE, IDEN)
                HREFPAR = parse(HREF)
                # Extracts book title and isbn from GCIS-DEV
                d = dict(HREFPAR)
                TITLE = d['title']
                ISBNS = d['isbn']
                # Cleans ISBNS to only contain valid characters
                CISBN = clean(ISBNS)
                # V13 = validated canonical ISBN-13
                V13 = EAN13(CISBN)
                if V13 is None:
                    V13 = canonical(CISBN)
                M = parse(HREF)
                #MV13 = M["isbn"] = V13
                #ORGISBN = M["org_isbn"] = ISBNS
                locapi = ('http://lx2.loc.gov:210/lcdb?version=1.1'
                          '&operation=searchRetrieve&query=bath.isbn={}'
                          '&maximumRecords=1&recordSchema=mods').format(V13)
                results = xmlparse(locapi)
                GCISDATA = "GCIS-DEV\n\n\t{}\n\n\tisbn_original:{}\n\n\tisbn_mod:{}\n\n".format(
                    M, ISBNS, V13)
                APIDATA = "AMAZON\n\n\t{}\n\n------------\n\n".format(results)
                print("GCIS-DEV\n\t", M, '\n\n\t', "isbn_original:", ISBNS,
                      '\n\n\t', "isbn_mod:", V13, "\n\n")
                print('AMAZON\n\t', results)
                file2.write(GCISDATA)
                file2.write(APIDATA)
        except Exception:
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(
                TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
# Extracts book identifier from GCIS
IDEN = GCISPAR[x]["identifier"]
match = re.search(r'.*/(.*?)\..*?$', GCIS)
if match:
    FILETYPE = match.groups()[0]
    # HREF = url that leads to book.json in GCIS-DEV
    try:
        HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
            FILETYPE, IDEN)
        #HREF = 'https://gcis-search-stage.jpl.net:3000/book/13b8b4fc-3de1-4bd8-82aa-7d3a6aa54ad5.json'
        HREFPAR = parse(HREF)
        # Extracts book title and isbn from GCIS-DEV
        d = dict(HREFPAR)
        TITLE = d['title']
        ISBNS = d['isbn']
        # Cleans ISBNS to only contain valid characters
        CISBN = clean(ISBNS)
        # V13 = validated canonical ISBN-13
        V13 = EAN13(CISBN)
        if V13 is None:
            V13 = canonical(CISBN)
        M = parse(HREF)
        v = meta(V13, service='wcat', cache='default')
        GCISDATA = "GCIS-DEV\n\n\t{}\n\n\tisbn_original:{}\n\n\tisbn_mod:{}\n\n".format(
            M, ISBNS, V13)
        APIDATA = "WorldCat\n\n\t{}\n\n------------\n\n".format(v)
        print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS,
              '\n\n\t', "isbn_mod:", V13, "\n\n")
        print("WorldCat\n\n\t", v, '\n\n')
        file2.write(GCISDATA)
        file2.write(APIDATA)
    except Exception:
        Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(
            TITLE, ISBNS, IDEN)
def importar_acervo_bibliotecario(args, input_file, col_isbn, cols_fonte,
                                  cols_destino):
    cols_fonte = cols_fonte.split(',')
    cols_destino = cols_destino.split(',')
    if os.path.isfile(JSON_ACERVO_UNIFICADO):
        df_unificado = pd.read_json(JSON_ACERVO_UNIFICADO, dtype=str)
    else:
        df_unificado = pd.DataFrame(columns=COLUNAS_ACERVO_UNIFICADO)
    df = pd.DataFrame()
    extensao_arquivo = os.path.splitext(input_file)[1]  # real extension only
    if extensao_arquivo == '.csv':
        df = pd.read_csv(input_file, dtype=str)
        df = df[cols_fonte]
    elif extensao_arquivo == '.xlsx':
        df = pd.read_excel(input_file, dtype=str)
        df = df[cols_fonte]
    else:
        print("ERROR: the file to be imported must be in CSV or XLSX format.")
        return
    if len(cols_fonte) != len(cols_destino):
        print("ERROR: the source and destination column lists must have the same length.")
        return
    elif len(cols_fonte) == 0:
        print("ERROR: the source and destination column lists must be non-empty.")
        return
    # Check that the destination columns exist in the unified schema.
    for col in cols_destino:
        if col not in COLUNAS_ACERVO_UNIFICADO:
            print("ERROR: %s is not a column of the unified collection schema." % col)
            return
    # Check that the source columns exist in the input schema.
    for col in cols_fonte:
        if col not in df.columns:
            print("ERROR: %s is not a column of the input collection schema." % col)
            return
    if args.v:
        print("Input collection schema: ", cols_fonte)
        print("Projection of the unified collection schema: ", cols_destino)
    #
    # Each column in the input collection corresponds to a column in the
    # unified collection. The snippet below renames the input columns to the
    # corresponding unified column names; this makes updating the unified
    # collection (done later in this function) straightforward.
    #
    mapeamento_colunas = dict()
    for j in range(len(cols_fonte)):
        mapeamento_colunas[cols_fonte[j]] = cols_destino[j]
    df.rename(columns=mapeamento_colunas, inplace=True)
    # Determine which input entries are valid and which are invalid (an entry
    # is valid if and only if it carries a valid ISBN value).
    indices_entradas_invalidas = []
    indices_entradas_validas = []
    for index, row in df.iterrows():
        isbn = str(row['isbn13'])
        isbn = il.clean(isbn)
        isbn = il.to_isbn13(isbn)
        if not isbn:  # to_isbn13 is falsy ('' or None) for invalid input
            indices_entradas_invalidas.append(index)
        else:
            indices_entradas_validas.append(index)
    # Filter the input file. After this operation it contains only valid entries.
    df.drop(df.index[indices_entradas_invalidas], inplace=True)
    #
    # Each entry in the file being imported either already exists in the
    # unified file or it does not. The snippet below determines this for each
    # valid entry of the file being imported.
    #
    isbns_no_acervo_unif = df_unificado['isbn13'].tolist()
    # Sanity check: the unified collection must not contain duplicate entries.
    if len(set(isbns_no_acervo_unif)) != len(isbns_no_acervo_unif):
        print("FATAL ERROR: unified collection contains duplicates!")
        print(len(set(isbns_no_acervo_unif)) - len(isbns_no_acervo_unif))
        print([
            item for item, count in collections.Counter(
                isbns_no_acervo_unif).items() if count > 1
        ])
        return
    isbns_no_arquivo_entrada = []
    temp = df['isbn13'].tolist()
    for t in temp:
        isbn = str(t)
        isbn = il.clean(isbn)
        isbn = il.to_isbn13(isbn)
        isbns_no_arquivo_entrada.append(isbn)
    df['isbn13'] = isbns_no_arquivo_entrada
    df.sort_values('isbn13', inplace=True)
    df.drop_duplicates(subset="isbn13", inplace=True)
    # Sanity check: we do not want to import the same entry more than once.
    isbns_no_arquivo_entrada = df['isbn13'].tolist()
    if len(set(isbns_no_arquivo_entrada)) != len(isbns_no_arquivo_entrada):
        print("FATAL ERROR: input collection contains duplicates!")
        print(len(indices_entradas_validas) - len(set(isbns_no_arquivo_entrada)))
        print([
            item for item, count in collections.Counter(
                isbns_no_arquivo_entrada).items() if count > 1
        ])
        return
    entradas_ja_existentes = []
    entradas_novas = []
    isbns_no_acervo_entrada = df['isbn13'].tolist()
    for index, row in df.iterrows():
        isbn = str(row['isbn13'])
        isbn = il.clean(isbn)
        if il.is_isbn10(isbn):
            isbn13 = il.to_isbn13(isbn)
            if isbn13 in isbns_no_acervo_entrada:
                print("WARNING: duplicate in the input file (registered with "
                      "both ISBN10 and ISBN13): %s" % isbn)
                return
        isbn = il.to_isbn13(isbn)
        if not isbn:
            continue
        try:
            indice = isbns_no_acervo_unif.index(isbn)
        except ValueError:
            indice = -1
        if indice >= 0:
            entradas_ja_existentes.append(index)
        else:
            entradas_novas.append(index)
    entradas_invalidas = 0
    entradas_atualizadas = 0
    entradas_inseridas = 0
    # Create two new dataframes: one for new entries, another for entries
    # that already exist in the unified collection.
    df_inserir = df.loc[entradas_novas].copy()
    df_atualizar = df.loc[entradas_ja_existentes].copy()
    #
    # In the unified collection each entry is identified by its ISBN13, so
    # the snippet below maps the ISBN values coming from the input file
    # (which may be in ISBN10 or ISBN13 format) to ISBN13.
    #
    isbns_novos = df_inserir['isbn13'].tolist()
    isbns_existentes = df_atualizar['isbn13'].tolist()
    isbns_canonicos_novos = [il.to_isbn13(i) for i in isbns_novos]
    df_inserir['isbn13'] = isbns_canonicos_novos
    isbns_canonicos_existentes = [il.to_isbn13(i) for i in isbns_existentes]
    df_atualizar['isbn13'] = isbns_canonicos_existentes
    #
    # Apply the relevant changes to the unified collection:
    # - new entries are inserted (concat)
    # - entries that already exist are updated (merge and update)
    #
    df_inserir.reset_index(drop=True, inplace=True)
    df_atualizar.reset_index(drop=True, inplace=True)
    if args.v:
        print("Current entries: ", len(df_unificado.index))
        print("New entries: ", len(df_inserir.index))
        print("Entries to update: ", len(df_atualizar.index))
    tamanho_acervo = len(df_unificado.index)
    print("Before the import, the unified collection contains %d entry(ies)." % tamanho_acervo)
    df_unificado = pd.concat([df_unificado, df_inserir], axis=0, sort=False)
    df_unificado.reset_index(drop=True, inplace=True)
    result = df_unificado[['isbn13']].merge(df_atualizar, how="left")
    df_unificado.update(result)
    df_unificado.reset_index(drop=True, inplace=True)
    tamanho_acervo = len(df_unificado.index)
    print("After the import, the unified collection contains %d entry(ies)." % tamanho_acervo)
    # df_unificado.set_index('isbn13', inplace=True)
    df_unificado.to_json(JSON_ACERVO_UNIFICADO)
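The repeated str/clean/to_isbn13 chains in the import function above reduce to one normalizer; a sketch, assuming the same pandas/isbnlib aliases (pd, il):

def normalizar_isbn13(value):
    # Canonical ISBN-13 for anything isbnlib can validate, else None
    # (to_isbn13 is falsy for invalid input).
    isbn13 = il.to_isbn13(il.clean(str(value)))
    return isbn13 if isbn13 else None

# Usage sketch: df['isbn13'] = df['isbn13'].map(normalizar_isbn13)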
for (root, dirs, files) in os.walk(book_dir):
    for f in files:
        with open(book_dir + f) as item:
            json_item = json.load(item)
            book_isbn = json_item['isbn']
            if book_isbn is not None:
                if book_isbn == "None":
                    with open("problem_book/" + str(f), 'w') as jsonFile:
                        jsonFile.write(
                            json.dumps(json_item,
                                       sort_keys=True,
                                       indent=4,
                                       separators=(',', ': ')))
                else:
                    book_isbn = clean(book_isbn)
                    if EAN13(book_isbn) is not None:
                        book_isbn = EAN13(book_isbn)
                        json_item['isbn'] = book_isbn
                        print(json_item['isbn'])
                        with open("isbn13_book/" + str(f), 'w') as jsonFile:
                            jsonFile.write(
                                json.dumps(json_item,
                                           sort_keys=True,
                                           indent=4,
                                           separators=(',', ': ')))
                    else:
                        with open("non13_book/" + str(f), 'w') as jsonFile:
                            jsonFile.write(
                                json.dumps(json_item,
                                           sort_keys=True,
                                           indent=4,
                                           separators=(',', ': ')))
for f in files:
    with open(book_dir + f) as item:
        json_item = json.load(item)
        book_isbn = json_item['isbn']
        if book_isbn is not None:
            # No isbn
            if book_isbn == "None":
                with open("%s%s" % (isbn_none_path, str(f)), 'w') as jsonFile:
                    jsonFile.write(json.dumps(json_item, sort_keys=True,
                                              indent=4, separators=(',', ': ')))
                #problem_log.write(json_item['identifier'] + "\n")
                no_isbn.append(json_item['identifier'])
                problem_count = problem_count + 1
            else:
                book_isbn = clean(book_isbn)
                # ISBN-13
                if EAN13(book_isbn) is not None:
                    book_isbn = EAN13(book_isbn)
                    json_item['isbn'] = book_isbn
                    with open("%s%s" % (isbn_13_path, str(f)), 'w') as jsonFile:
                        jsonFile.write(json.dumps(json_item, sort_keys=True,
                                                  indent=4, separators=(',', ': ')))
                    normal_count = normal_count + 1
                # ISBN-OTHER
                else:
                    book_isbn = book_isbn.replace("-", "")
                    json_item['isbn'] = book_isbn
                    with open("%s%s" % (isbn_other_path, str(f)), 'w') as jsonFile:
                        jsonFile.write(json.dumps(json_item, sort_keys=True,
                                                  indent=4, separators=(',', ': ')))
                    other_isbn.append(json_item['identifier'])
                    #problem_log.write(json_item['identifier'] + "\n")
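Both walkers above sort each book into one of three buckets on the same test; the decision itself is just this (a sketch, using the EAN13/clean helpers the originals already import):

def isbn_bucket(book_isbn):
    # Mirrors the branching above: missing ISBN, valid ISBN-13, or other.
    if book_isbn is None or book_isbn == "None":
        return 'no-isbn'
    return 'isbn13' if EAN13(clean(book_isbn)) else 'other'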
import os
import sys
from shutil import copyfile

import isbnlib

print('Bavatar eBook Renamer')
# Checking program invocation
if len(sys.argv) != 3:
    print("Commandline: python rename.py ISBN Filename")
else:
    # Recovering arguments
    isbn = sys.argv[1]
    book_filename = sys.argv[2]
    # Checking arguments
    isbn = isbnlib.to_isbn13(isbnlib.clean(isbn))
    if not isbn:  # to_isbn13 is falsy for invalid input
        print("ISBN is not a valid code. Please check the argument")
    else:
        if not os.path.isfile(book_filename):
            print("Filename is not a valid file. Please check the argument")
        else:
            # Get ISBN info from code
            meta = isbnlib.meta(isbn)
            print(meta)
            # Format new filename
            filename, file_extension = os.path.splitext(book_filename)
            title = meta['Title']
            year = meta['Year']
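Example invocation of the script above (the book file name is hypothetical; the snippet stops before the rename itself). to_isbn13 normalizes the hyphenated argument before the metadata lookup:

# python rename.py 978-0-306-40615-7 mybook.epub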