async def check(self, entry):
    length = self._cfg.get('isbn_length', entry, 13)
    if not length:
        return []
    isbn = entry.data.get('isbn')
    if not isbn:
        return []
    clean_isbn = clean(isbn)
    if not clean_isbn or notisbn(clean_isbn):
        return []
    if length not in (10, 13):
        raise ConfigurationError(
            "The option 'isbn_length' must be either 10 or 13.")
    if length == 10:
        if not is_isbn10(clean_isbn):
            return [(type(self).NAME,
                     "ISBN '{}' is not of length 10.".format(isbn),
                     "ISBN-10 would be '{}'".format(to_isbn10(clean_isbn)))]
    elif length == 13:
        if not is_isbn13(clean_isbn):
            return [(type(self).NAME,
                     "ISBN '{}' is not of length 13.".format(isbn),
                     "ISBN-13 would be '{}'".format(to_isbn13(clean_isbn)))]
    return []
def extractISBN(self):
    isbn = None
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(self.pdf, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True):
        # Get the text from the page
        interpreter.process_page(page)
        text = retstr.getvalue()
        retstr.truncate(0)
        retstr.seek(0)  # reset the buffer; truncate alone leaves the offset
        # Extract ISBN
        isbn = self.searchCodeInPage(text)
        if isbn:
            break
    device.close()
    retstr.close()
    # Convert to ISBN 10 and 13. Guard against no match: isbnlib's
    # validators expect a string, not None.
    if not isbn:
        return
    if isbnlib.is_isbn10(isbn):
        self.isbn10 = isbn
        self.isbn13 = isbnlib.to_isbn13(self.isbn10)
    elif isbnlib.is_isbn13(isbn):
        self.isbn13 = isbn
        self.isbn10 = isbnlib.to_isbn10(self.isbn13)
def format_data(termname):
    file_names = glob.glob('../terms/' + termname + '/course_books/*.json')
    dic = {}
    good_urls = json.loads(open('../good_urls.json', 'r').read())
    bad_urls = json.loads(open('../bad_urls.json', 'r').read())
    for f in file_names:
        with open(f) as input:
            data = json.loads(input.read())
            for d in data:
                ret = []
                for i in range(len(d['isbns'])):
                    number = isbnlib.to_isbn10(d['isbns'][i])
                    name = (','.join(d['names'][i].split(',')[:-1]) +
                            ', Isbn: ' + d['isbns'][i])
                    if number:
                        url = ('http://www.amazon.com/gp/product/' + number +
                               '/ref=as_li_tl?ie=UTF8&camp=1789&creative=9325'
                               '&creativeASIN=' + number +
                               '&linkCode=as2&tag=mocksched-20'
                               '&linkId=EMBDL7BV7IXRB44G')
                        url_for_checking = (
                            'http://www.amazon.com/gp/product/' + number +
                            '/ref=as_li_tl?ie=UTF8&camp=1789&creative=9325'
                            '&creativeASIN=' + number +
                            '&linkCode=as2&linkId=EMBDL7BV7IXRB44G')
                        if url in good_urls:
                            ret.append([d['status'][i], url, name])
                        elif url in bad_urls:
                            ret.append([d['status'][i], '', name])
                        elif check_url(url_for_checking):
                            ret.append([d['status'][i], '', name])
                            bad_urls.append(url)
                        else:
                            ret.append([d['status'][i], url, name])
                            good_urls.append(url)
                    else:
                        ret.append([d['status'][i], '', name])
                dic[d['title']] = ret
    with open('../terms/' + termname + '/books.json', 'w') as output:
        json.dump(dic, output)
    with open('../good_urls.json', 'w') as good:
        json.dump(good_urls, good)
    with open('../bad_urls.json', 'w') as bad:
        json.dump(bad_urls, bad)
def isbn(self, isbn):
    # adds isbn to google spread sheet
    # check if valid
    clean_isbn = isbnlib.clean(isbn)
    if isbnlib.notisbn(clean_isbn):
        return "not valid isbn"
    # should check if it has been collected before
    canonical = None
    # first check trove
    canonical = self.trove.extract(clean_isbn)
    if not canonical:
        # try alternative isbn form
        print "trying alternative form "
        alt_isbn = clean_isbn
        if isbnlib.is_isbn13(clean_isbn):
            alt_isbn = isbnlib.to_isbn10(clean_isbn)
        else:
            alt_isbn = isbnlib.to_isbn13(clean_isbn)
        canonical = self.trove.extract(alt_isbn)
        if canonical:
            clean_isbn = alt_isbn
    if not canonical:
        canonical = self.__reduce_metadata(clean_isbn,
                                           ['merge', 'isbndb', 'openl'])
    if not canonical:
        return "no metadata found for isbn: " + clean_isbn
    canonical['source'] = 'isbnlib'
    canonical["Authors"] = u', '.join(canonical["Authors"])
    canonical['link'] = None
    row_data = ['isbn:' + clean_isbn, canonical["Title"],
                canonical["Authors"], canonical["Year"],
                canonical["Publisher"], canonical['link']]
    return self.__add_and_render(row_data)
def extract_identifiers_from_row(row, isbn_columns):
    cols = [int(x) for x in isbn_columns.split(',')]
    isbns = set()
    for isbn_column in cols:
        raw = row[isbn_column].strip('"=')
        isbns.add(raw)
        # Transform to ISBN 10 or 13.
        if isbnlib.is_isbn13(raw):
            isbns.add(isbnlib.to_isbn10(raw))
        elif isbnlib.is_isbn10(raw):
            isbns.add(isbnlib.to_isbn13(raw))
    return isbns
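A minimal usage sketch for the function above. The row contents and column spec here are hypothetical; the Excel-style `="…"` quoting is what the `strip('"=')` call is built to remove:

import isbnlib

# Hypothetical CSV row: column 2 holds an Excel-quoted ISBN-13.
row = ['Widget Book', 'Example Press', '="9780306406157"']
print(extract_identifiers_from_row(row, '2'))
# -> {'9780306406157', '0306406152'}  (both forms of the same identifier)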
def isbn_to_asin(isbn):
    # returns isbn10 (asin)
    clean = isbnlib.canonical(isbn)
    if len(isbn) == 10:
        if isbnlib.is_isbn10(clean):
            return clean
        else:
            return '0'
    elif len(isbn) == 13:
        if isbnlib.is_isbn13(clean):
            return isbnlib.to_isbn10(clean)
        else:
            return '0'
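For books, Amazon's ASIN coincides with the ISBN-10, which is why the conversion above yields a usable ASIN. A quick sketch with illustrative values:

print(isbn_to_asin('9780306406157'))  # -> '0306406152', usable as an ASIN
print(isbn_to_asin('030640615X'))     # bad check digit -> '0'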
def add_book():
    form = AddBookForm()
    if form.validate_on_submit():
        isbn = request.form['isbn']
        if isbnlib.is_isbn13(isbn):
            isbn = isbnlib.to_isbn10(isbn)
        if not isbnlib.is_isbn10(isbn):
            flash('Enter valid ISBN', 'error')
            return redirect(url_for('admin_page.add_book'))
        book_data = isbnlib.meta(isbn)
        book_cover = isbnlib.cover(isbn)
def converter():
    if request.method == 'POST':
        ISBN_13 = isbnlib.canonical(request.form['ISBN-13'])
        ISBN_10 = isbnlib.canonical(request.form['ISBN-10'])
        # converts ISBNs
        flash(isbnlib.to_isbn13(ISBN_10))
        flash(isbnlib.to_isbn10(ISBN_13))
        return redirect(url_for('book.converter'))
    return render_template('book/converter.html')
def is_isbn_code(search):
    """Check whether the received string is a valid ISBN number."""
    check = ''.join(ch for ch in search if ch.isalnum())
    if is_isbn13(check):
        return to_isbn10(check)
    if is_isbn10(check):
        return check
    return False
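To make the behaviour concrete, a short round trip with the well-known example ISBN. Hyphens and spaces are dropped by the isalnum() filter, so all spellings of the same book normalize to its ISBN-10:

print(is_isbn_code('978-0-306-40615-7'))  # -> '0306406152' (ISBN-13 input)
print(is_isbn_code('0-306-40615-2'))      # -> '0306406152' (ISBN-10 input)
print(is_isbn_code('not an isbn'))        # -> False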
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=argparse.FileType("r"))
    args = parser.parse_args()
    temp = []
    for item in yaml.load(args.input):
        store = OrderedDict([
            ("id", item["id"]),
            ("title", item["title"]["name"]),
            ("links", []),
        ])
        for identifier in item["identifier"]:
            if identifier["domain"] != "isbn":
                continue
            store["links"].append({
                "domain": "amazon.co.jp",
                "url": "https://www.amazon.co.jp/dp/{}".format(
                    isbnlib.to_isbn10(identifier["id"])),
                "id": isbnlib.to_isbn10(identifier["id"]),
            })
        store["links"].append({
            "domain": "kindle.amazon.co.jp",
            "url": "https://www.amazon.co.jp/dp/",
            "id": "",
        })
        temp.append(store)
    print(
        yaml.dump(temp, allow_unicode=True, default_flow_style=False,
                  indent=1))
    return 0
def update_cover(self):
    image = requests.get(
        'http://images.amazon.com/images/P/%s.01._SS500_SCLZZZZZZZ_.jpg'
        % (isbnlib.to_isbn10(str(self.isbn)), ))
    if image.status_code == 200 and len(image.content) > 50:
        img_temp = NamedTemporaryFile(delete=True)
        img_temp.write(image.content)
        img_temp.flush()
        self.cover.save('%s.jpg' % (self.isbn, ), File(img_temp))
    else:
        self.cover.delete()
def tupleData(isbn, data, quan, cond, pallet, lot):
    temp = ()
    tempList = list(temp)
    for value in data.values():
        tempList.append(value)
    tempList.append(isbnlib.to_isbn10(isbn))  # Add ISBN 10
    tempList.append(quan)    # Add the quantity of the book
    tempList.append(cond)    # Add the condition of the book
    tempList.append(lot)     # Add the lot number
    tempList.append(pallet)  # Add the pallet number
    # tempList.append(isbnlib.cover(isbn))  # Add cover of ISBN
    temp = tuple(tempList)
    return temp
def query(isbn):
    """Query the BnF Catalogue Général service for metadata."""
    # Quirk (see issue #1)
    isbn10 = to_isbn10(isbn)
    if isbn10:
        isbn_query = "(bib.isbn%20all%20%22{isbn10}%22%20or%20bib.isbn%20"\
                     "all%20%22{isbn}%22)".format(isbn10=isbn10, isbn=isbn)
    else:
        isbn_query = "bib.isbn%20all%20%22{isbn}%22".format(isbn=isbn)
    data = wquery(SERVICE_URL.format(isbn=isbn_query), user_agent=UA,
                  parser=parser_bnf)
    if not data:  # pragma: no cover
        LOGGER.debug('No data from BnF Catalogue Général for isbn %s', isbn)
        return {}
    return _mapper(isbn, data)
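To make the quirk concrete, this is what the ISBN clause expands to for a convertible (978-prefixed) ISBN; the example value is illustrative only:

from isbnlib import to_isbn10

isbn = '9782070408504'
isbn10 = to_isbn10(isbn)  # -> '2070408507'
# The query ORs both forms, since records may be catalogued under either:
# (bib.isbn%20all%20%222070408507%22%20or%20bib.isbn%20all%20%229782070408504%22)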
def test_lookup_by_isbn10_is_invalid(self):
    # translation table of checkdigits to wrong ones (digit plus 1)
    tr_table = dict(
        list(
            zip(
                ["x", "X"] + list(map(str, list(range(9, -1, -1)))),
                ["0", "0", "x"] + list(map(str, list(range(9, 0, -1)))),
            )
        )
    )
    random_item = random.sample(
        list(Title.select("isbn RLIKE '^[0-9]{13}$'")), 1)[0]
    wrong_isbn = isbnlib.to_isbn10(random_item.isbn)
    wrong_isbn = wrong_isbn[0:9] + tr_table[wrong_isbn[9]]
    with self.assertRaises((isbnlib.NotValidISBNError,
                            isbnlib._exceptions.NotValidISBNError)):
        result = inventory.lookup_by_isbn(wrong_isbn)
def get_isbn10(reference, verbose=False):
    isbn10_array = []
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(
            reference.identifiers, ["isbn10", "isbn-10", "isbn 10"]):
        if iden["identifier"] not in isbn10_array:
            isbn10_array.append(iden["identifier"])
    if isbn10_array:
        return isbn10_array
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(
            reference.identifiers, ["isbn13", "isbn-13", "isbn 13"]):
        try:
            isbn10_temp = isbnlib.to_isbn10(iden["identifier"])
            if isbn10_temp not in isbn10_array:
                isbn10_array.append(isbn10_temp)
                gnomics.objects.reference.Reference.add_identifier(
                    reference, identifier=isbn10_temp,
                    identifier_type="ISBN-10", source="ISBNlib",
                    language=None)
        except:
            if verbose:
                print("No corresponding ISBN-10 found.")
    # NOTE: the original called the OpenLibrary lookups below with an
    # undefined name `ref`; `reference` (the parameter) appears to be
    # what was meant.
    for obj in gnomics.reference.Reference.openlibrary(reference):
        if obj["isbn_10"] not in isbn10_array:
            isbn10_array.append(obj["isbn_10"])
            gnomics.objects.reference.Reference.add_identifier(
                reference, identifier=obj["isbn_10"],
                identifier_type="ISBN-10", source="OpenLibrary",
                language=None)
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(
            reference.identifiers,
            ["openlibrary", "openlibrary id", "openlibrary identifier",
             "olid"]):
        for obj in gnomics.reference.Reference.openlibrary(reference):
            if obj["isbn_10"] not in isbn10_array:
                isbn10_array.append(obj["isbn_10"])
                gnomics.objects.reference.Reference.add_identifier(
                    reference, identifier=obj["isbn_10"],
                    identifier_type="ISBN-10", source="OpenLibrary",
                    language=None)
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(
            reference.identifiers,
            ["lccn", "library of congress control number"]):
        for obj in gnomics.reference.Reference.openlibrary(reference):
            if obj["isbn_10"] not in isbn10_array:
                isbn10_array.append(obj["isbn_10"])
                gnomics.objects.reference.Reference.add_identifier(
                    reference, identifier=obj["isbn_10"],
                    identifier_type="ISBN-10", source="OpenLibrary",
                    language=None)
    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(
            reference.identifiers,
            ["oclc", "oclc number", "oclc control number"]):
        for obj in gnomics.reference.Reference.openlibrary(reference):
            if obj["isbn_10"] not in isbn10_array:
                isbn10_array.append(obj["isbn_10"])
                gnomics.objects.reference.Reference.add_identifier(
                    reference, identifier=obj["isbn_10"],
                    identifier_type="ISBN-10", source="OpenLibrary",
                    language=None)
    return isbn10_array
def imetafrom_isbnlib(isbn10=None, isbn13=None):
    import isbnlib
    # TRY FIRST WITH ISBNLIB
    print("TRY FIRST WITH ISBNLIB")
    if isbn10:
        isbn13 = isbnlib.to_isbn13(isbn10)
    if isbn13:
        isbn10 = isbnlib.to_isbn10(isbn13)
    # if isbnlib.is_isbn10(isbn10) or isbnlib.is_isbn13(isbn13) or \
    #         isbnlib.is_isbn10(isbn13) or isbnlib.is_isbn13(isbn10):
    primar_info10 = isbnlib.meta(isbn10, service='default', cache='default')
    primar_info13 = isbnlib.meta(isbn13, service='default', cache='default')
    if primar_info10:
        print(primar_info10)
    if primar_info13:
        print(primar_info13)
    if primar_info10 and primar_info13:
        return (isbn10, isbn13, primar_info10, primar_info13)
    elif primar_info10:
        return (isbn10, isbn13, primar_info10)
    elif primar_info13:
        return (isbn10, isbn13, primar_info13)
    else:
        return None
def main():
    parser = argparse.ArgumentParser(
        description="Allows querying the unified catalog.")
    parser.add_argument(
        "-isbn", help="specifies the ISBN-13 value to look up", required=True)
    args = parser.parse_args()
    if not il.is_isbn13(args.isbn):
        print("ERROR: argument is not a valid ISBN: %s" % args.isbn)
        return
    if not os.path.isfile(JSON_ACERVO_UNIFICADO):
        print("ERROR: unified catalog not found")
        return
    df_unificado = pd.read_json(JSON_ACERVO_UNIFICADO, dtype=str)
    for index, row in df_unificado.iterrows():
        isbn = str(row['isbn13'])
        if isbn == args.isbn:
            print(index)
            print(il.to_isbn10(isbn))
            print(row)
scope = ['https://spreadsheets.google.com/feeds']
credentials = ServiceAccountCredentials.from_json_keyfile_name(GOOGLE_OAUTH,
                                                               scope)
gc = gspread.authorize(credentials)
link = GOOGLE_SPREADSHEET
trove = Trove()
wks = gc.open_by_url(link).sheet1
for i in xrange(631, 1000):
    row = wks.row_values(i)
    if row[5] == 'None':
        isbn = row[0][5:]
        old_isbn = isbn
        print 'getting ', isbn
        if isbnlib.is_isbn13(isbn):
            isbn = isbnlib.to_isbn10(isbn)
        else:
            isbn = isbnlib.to_isbn13(isbn)
        canonical = trove.extract(isbn)
        if not canonical:
            canonical = trove.extract(old_isbn)
        if canonical:
            print '---------------------------'
            print 'Replacing', i, 'row'
            # print row
            row_data = ['isbn:' + isbn, canonical["Title"],
                        canonical["Authors"], canonical["Year"],
                        canonical["Publisher"], canonical['link']]
            # print row_data
            print "updating "
            for j in range(0, len(row_data)):
                print '-cell', i, j, ':'
                print '\tfrom:', row[j]
def main():
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-path', '--GCIS',
        help="Insert url path to GCIS book in JSON format "
             "[ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1'] ")
    args = parser.parse_args()
    GCIS = args.GCIS
    if GCIS is None:
        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
        print('NO MANUAL GCIS PATH\n'
              ' ALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT')
    GCISPAR = parse(GCIS)
    for x in range(len(GCISPAR)):
        try:
            # Extracts book identifier from GCIS
            IDEN = GCISPAR[x]["identifier"]
            match = re.search(r'.*/(.*?)\..*?$', GCIS)
            if match:
                FILETYPE = match.groups()[0]
            # HREF = url that leads to book.json in GCIS-DEV
            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
                FILETYPE, IDEN)
            HREFPAR = parse(HREF)
            # Extracts book title and isbn from GCIS-DEV
            d = dict(HREFPAR)
            TITLE = d['title']
            ISBNS = d['isbn']
            # Cleans ISBNS to only contain valid characters
            CISBN = clean(ISBNS)
            # V13 = validated canonical ISBN-13
            V13 = EAN13(CISBN)
            if V13 is None:
                V13 = canonical(CISBN)
            M = parse(HREF)
            print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS,
                  '\n\n\t', "isbn_mod:", V13, "\n\n")
            # DBpedia ISBN formats
            a = ISBNS
            b = canonical(CISBN)
            c = to_isbn10(CISBN)
            d = hyphenate(to_isbn10(CISBN))
            e = to_isbn13(CISBN)
            f = hyphenate(to_isbn13(CISBN))
            g = V13
            h = "ISBN {}".format(CISBN)
            i = "ISBN {}".format(canonical(CISBN))
            j = "ISBN {}".format(hyphenate(to_isbn13(CISBN)))
            k = "ISBN {}".format(V13)
            l = "ISBN {}".format(to_isbn10(CISBN))
            m = "ISBN {}".format(hyphenate(to_isbn10(CISBN)))
            tests = [a, b, c, d, e, f, g, h, i, j, k, l, m]
            for indie in tests:
                r = QUERY % indie
                RQUERY(r)
                if len(RQUERY(r)) != 0:
                    print(RQUERY(r))
                    break
        except:
            Error = ('\n\t######## PROBLEM #######\n\tTitle:{}\n\t'
                     'GCIS-ISBN:{}\n\tIdentifier:{}\n\n').format(
                         TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
def enrich_document_data(sender, json=None, record=None, index=None,
                         doc_type=None, arguments=None, **dummy_kwargs):
    """Signal sent before a record is indexed.

    :param json: The dumped record dictionary which can be modified.
    :param record: The record being indexed.
    :param index: The index in which the record will be indexed.
    :param doc_type: The doc_type for the record.
    """
    if index.split('-')[0] == DocumentsSearch.Meta.index:
        # HOLDINGS
        holdings = []
        document_pid = record['pid']
        es_holdings = HoldingsSearch().filter(
            'term', document__pid=document_pid).scan()
        for holding in es_holdings:
            data = {
                'pid': holding.pid,
                'location': {
                    'pid': holding['location']['pid'],
                },
                'circulation_category': {
                    'pid': holding['circulation_category']['pid'],
                },
                'organisation': {
                    'organisation_pid': holding['organisation']['pid'],
                    'library_pid': holding['library']['pid']
                }
            }
            # Index additional holdings fields into the document record
            holdings_fields = [
                'call_number', 'second_call_number', 'index',
                'enumerationAndChronology', 'supplementaryContent'
            ]
            dict_holding = holding.to_dict()
            for field in holdings_fields:
                if dict_holding.get(field):
                    data[field] = dict_holding.get(field)
            # Index holdings notes
            notes = [
                note['content']
                for note in dict_holding.get('notes', []) if note
            ]
            if notes:
                data['notes'] = notes
            # Index holdings local fields
            if 'local_fields' in holding:
                data['local_fields'] = dict_holding['local_fields']
            # Index items attached to each holdings record
            es_items = list(ItemsSearch().filter(
                'term', holding__pid=holding.pid).scan())
            for item in es_items:
                item = item.to_dict()
                item_record = {
                    'pid': item['pid'],
                    'barcode': item['barcode'],
                    'status': item['status'],
                    'available': item['available'],
                    'local_fields': item.get('local_fields'),
                    'call_number': item.get('call_number')
                }
                item_record = {k: v for k, v in item_record.items() if v}
                # item acquisition part.
                # We need to store the acquisition data of the items into the
                # document. As we need to link acquisition date and
                # org/lib/loc, we need to store these data together in a
                # 'nested' structure.
                acq_date = item.get('acquisition_date')
                if acq_date:
                    item_record['acquisition'] = {
                        'organisation_pid': holding['organisation']['pid'],
                        'library_pid': holding['library']['pid'],
                        'location_pid': holding['location']['pid'],
                        'date': acq_date
                    }
                # item notes content.
                # index the content of the public notes into the document.
                public_notes_content = [
                    n['content']
                    for n in item.get('notes', [])
                    if n['type'] in ItemNoteTypes.PUBLIC
                ]
                if public_notes_content:
                    item_record['notes'] = public_notes_content
                # related collection
                # index the collection title and description
                item_obj = Item.get_record_by_pid(item['pid'])
                for collection in item_obj.in_collection():
                    coll_data = {
                        'title': collection.get('title'),
                        'description': collection.get('description')
                    }
                    coll_data = {k: v for k, v in coll_data.items() if v}
                    item_record.setdefault('collections',
                                           []).append(coll_data)
                data.setdefault('items', []).append(item_record)
            data['available'] = Holding.isAvailable(es_items)
            holdings.append(data)
        if holdings:
            json['holdings'] = holdings

        # MEF contribution ES index update
        contributions = create_contributions(json.get('contribution', []))
        if contributions:
            json.pop('contribution', None)
            json['contribution'] = contributions
        # TODO: compare record with those in DB to check which authors have
        # to be deleted from index

        # Index host document title in child document (part of)
        if 'partOf' in record:
            title = {'type': 'partOf'}
            for part_of in record['partOf']:
                doc_pid = extracted_data_from_ref(part_of.get('document'))
                document = Document.get_record_by_pid(doc_pid)
                for part_of_title in document.get('title', []):
                    if 'mainTitle' in part_of_title:
                        title['partOfTitle'] = part_of_title.get('mainTitle')
            json['title'].append(title)
        json['sort_title'] = title_format_text_head(
            json.get('title', []), with_subtitle=True)

        # Local fields in JSON
        local_fields = LocalField.get_local_fields_by_resource(
            'doc', document_pid)
        if local_fields:
            json['local_fields'] = local_fields

        # index both ISBN 10 and 13 format
        def filter_isbn(identified_by):
            """Filter identified_by for type bf:Isbn."""
            return identified_by.get('type') == 'bf:Isbn'

        filtered_identified_by = filter(
            filter_isbn, json.get('identifiedBy', []))
        isbns = set()
        for identified_by in filtered_identified_by:
            isbn = identified_by['value']
            isbns.add(isbn)
            if is_isbn10(isbn):
                isbns.add(to_isbn13(isbn))
            elif is_isbn13(isbn):
                isbns.add(to_isbn10(isbn))
        if isbns:
            json['isbn'] = list(isbns)
def lookup_by_isbn(number, forceUpdate=False):
    isbn, price = _process_isbn(number)
    print("Looking up isbn", isbn, "with price", price)

    # if length of isbn > 0 and isn't "n/a" or "none"
    if len(isbn) > 0 and not re.match(r"^n(\s|/){0,1}a|none", isbn, re.I):
        # first we check our database
        titles = Title.select(Title.q.isbn == isbn)
        # print titles  # debug
        known_title = False
        the_titles = list(titles)
        if (len(the_titles) > 0) and (not forceUpdate):
            # print "in titles"
            known_title = the_titles[0]
            ProductName = the_titles[0].booktitle.format()
            authors = []
            if len(the_titles[0].author) > 0:
                authors = [x.authorName.format()
                           for x in the_titles[0].author]
            authors_as_string = ", ".join(authors)
            categories = []
            if len(the_titles[0].categorys) > 0:
                categories = [x.categoryName.format()
                              for x in the_titles[0].categorys]
            categories_as_string = ", ".join(categories)
            if price == 0:
                if len(the_titles[0].books) > 0:
                    ListPrice = max([x.listprice
                                     for x in the_titles[0].books])
                else:
                    ListPrice = 0
            else:
                ListPrice = price
            Manufacturer = the_titles[0].publisher.format()
            Format = the_titles[0].type.format()
            Kind = the_titles[0].kind.kindName
            orig_isbn = the_titles[0].origIsbn.format()
            # if the_titles[0].images:
            #     large_url = the_titles[0].images.largeUrl
            #     med_url = the_titles[0].images.medUrl
            #     small_url = the_titles[0].images.smallUrl
            # else:
            #     large_url = med_url = small_url = ''
            large_url = med_url = small_url = ""
            SpecialOrders = [
                tso.id
                for tso in Title.selectBy(
                    isbn=isbn
                ).throughTo.specialorder_pivots.filter(
                    TitleSpecialOrder.q.orderStatus == "ON ORDER"
                )
            ]
            return {
                "title": ProductName,
                "authors": authors,
                "authors_as_string": authors_as_string,
                "categories_as_string": categories_as_string,
                "list_price": ListPrice,
                "publisher": Manufacturer,
                "isbn": isbn,
                "orig_isbn": orig_isbn,
                "large_url": large_url,
                "med_url": med_url,
                "small_url": small_url,
                "format": Format,
                "kind": Kind,
                "known_title": known_title,
                "special_order_pivots": SpecialOrders,
            }
        else:  # we don't have it yet
            # if we're using amazon ecs
            if use_amazon_ecs:
                sleep(1)  # so amazon doesn't get huffy
                ecs.setLicenseKey(amazon_license_key)
                ecs.setSecretAccessKey(amazon_secret_key)
                ecs.setAssociateTag(amazon_associate_tag)
                # print "about to search", isbn, isbn[0]
                amazonBooks = []
                idType = ""
                if len(isbn) == 12:
                    idType = "UPC"
                elif len(isbn) == 13:
                    # if we are using an internal isbn
                    if isbn.startswith(internal_isbn_prefix):
                        return []
                    # otherwise search on amazon.
                    elif isbn.startswith("978") or isbn.startswith("979"):
                        idType = "ISBN"
                else:
                    idType = "EAN"
                try:
                    print("searching amazon for ", isbn, idType,
                          file=sys.stderr)
                    amazonProds = AmzSear(isbn)
                    print(amazonProds, file=sys.stderr)
                except (ecs.InvalidParameterValue, HTTPError):
                    pass
                if amazonProds:
                    print(amazonProds, file=sys.stderr)
                    # the inner comprehension tests each product for a price
                    # whose type is in format_list; if we find a price whose
                    # key is in format_list, we return the corresponding
                    # product
                    format_list = [
                        "Paperback", "Mass Market Paperback", "Hardcover",
                        "Perfect Paperback", "Pamphlet", "Plastic Comb",
                        "Spiral-bound", "Print on Demand (Paperback)", "DVD",
                        "Calendar", "Board book", "Audio Cassette", "Cards",
                        "Audio CD", "Diary", "DVD-ROM", "Library Binding",
                        "music", "Vinyl", "Health and Beauty", "Hardback",
                    ]
                    prods = [
                        x for x in amazonProds.values()
                        if [dum for dum in x["prices"].keys()
                            if dum in format_list]
                    ]
                    for prod1 in prods:
                        print(prod1, file=sys.stderr)
                        price_dict = prod1["prices"]
                        listprice = max(price_dict.values())
                        format = [k for k in format_list if k in price_dict]
                        format = format[0]
                        if not format:
                            continue
                        title = prod1["title"]
                        image_url = prod1["image_url"]
                        authors = [
                            x.replace("by ", "")
                            for x in prod1["subtext"]
                            if x.startswith("by ")
                        ]
                        auth_list = [
                            y.strip()
                            for a in [x.split(", ")
                                      for x in authors[0].split(" and ")]
                            for y in a
                        ]
                        # we assume any full name less than five characters
                        # is an abbreviation like 'Jr.' so we add it back to
                        # the previous authorname
                        abbrev_list = [i for i, x in enumerate(auth_list)
                                       if len(x) < 5]
                        for i in abbrev_list:
                            auth_list[i - 1:i + 1] = [
                                ", ".join(auth_list[i - 1:i + 1])
                            ]
                        return {
                            "title": title,
                            "authors": auth_list,
                            "authors_as_string": ",".join(auth_list),
                            "categories_as_string": "",
                            "list_price": listprice,
                            "publisher": "",
                            "isbn": isbn,
                            "orig_isbn": isbn,
                            "large_url": image_url,
                            "med_url": image_url,
                            "small_url": image_url,
                            "format": format,
                            "kind": "books",
                            "known_title": known_title,
                            "special_orders": [],
                        }
                else:
                    traceback.print_exc()
                    print("using isbnlib from ecs", file=sys.stderr)
                    isbnlibbooks = []
                    try:
                        isbnlibbooks = isbnlib.meta(str(isbn))
                    except:
                        pass
                    if isbnlibbooks:
                        return {
                            "title": isbnlibbooks["Title"],
                            "authors": isbnlibbooks["Authors"],
                            "authors_as_string": ",".join(
                                isbnlibbooks["Authors"]),
                            "categories_as_string": None,
                            "list_price": price,
                            "publisher": isbnlibbooks["Publisher"],
                            "isbn": isbn,
                            "orig_isbn": isbn,
                            "large_url": None,
                            "med_url": None,
                            "small_url": None,
                            "format": None,
                            "kind": "books",
                            "known_title": known_title,
                            "special_orders": [],
                        }
                    else:
                        return {}
            else:  # if we're scraping amazon
                print("scraping amazon", file=sys.stderr)
                headers = {"User-Agent": random.sample(user_agents, 1).pop()}
                amazon_url_template = "http://www.amazon.com/dp/%s/"
                if len(isbn) == 13:
                    isbn10 = None
                    if isbnlib.is_isbn13(isbn):
                        isbn10 = isbnlib.to_isbn10(isbn)
                    else:
                        return {}
                    if isbn10:
                        with requests.Session() as session:
                            try:
                                print("getting amazon")
                                page_response = session.get(
                                    amazon_url_template % isbn10,
                                    headers=headers, timeout=0.1)
                                print("got response")
                                page_content = BeautifulSoup(
                                    page_response.content, "lxml")
                                print("got parsed content")
                                try:
                                    booktitle = page_content.select(
                                        "#productTitle").pop().text
                                except Exception as e:
                                    traceback.print_exc()
                                    booktitle = ''
                                popover_preload = [
                                    a.text
                                    for a in page_content.select(
                                        ".author.notFaded .a-popover-preload "
                                        "a.a-link-normal")
                                ]
                                author_name = [
                                    a.text
                                    for a in page_content.select(
                                        ".author.notFaded a.a-link-normal")
                                    if a.text not in popover_preload
                                ]
                                contributor_role = page_content.select(
                                    ".contribution span")
                                try:
                                    contributor_role = [
                                        re.findall(r"\w+", cr.text).pop()
                                        for cr in contributor_role
                                    ]
                                except Exception as e:
                                    traceback.print_exc()
                                    contributor_role = []
                                author_role = zip(author_name,
                                                  contributor_role)
                                try:
                                    listprice = page_content.select(
                                        ".a-text-strike").pop().text
                                except IndexError as e:
                                    print("using bookfinder4u")
                                    if "listprice" not in locals():
                                        with requests.Session() as session:
                                            bookfinderurl = (
                                                "http://www.bookfinder4u.com/"
                                                "IsbnSearch.aspx?isbn='%s'"
                                                "&mode=direct")
                                            url = bookfinderurl % isbn
                                            try:
                                                page_response2 = session.get(
                                                    url, headers=headers,
                                                    timeout=0.1)
                                                page_content2 = BeautifulSoup(
                                                    page_response2.content,
                                                    "lxml")
                                            except Exception as e:
                                                traceback.print_exc()
                                                listprice = 0.0
                                            else:
                                                try:
                                                    matches = re.search(
                                                        r"List\sprice:\s"
                                                        r"(\w{2,4})\s"
                                                        r"(\d+(.\d+)?)",
                                                        page_content2.text,
                                                        re.I)
                                                    if matches:
                                                        listprice = \
                                                            matches.groups()[1]
                                                    else:
                                                        listprice = 0.00
                                                except Exception as e:
                                                    traceback.print_exc()
                                                    listprice = 0.00
                                try:
                                    book_edition = page_content.select(
                                        "#bookEdition").pop().text
                                except Exception as e:
                                    traceback.print_exc()
                                    book_edition = ""
                                try:
                                    matches = re.findall(
                                        r"(?<=imageGalleryData'\s:\s\[)"
                                        r"\{.*?\}",
                                        page_content.contents[1].text)
                                    image_url_dict = eval(matches[0])
                                except Exception as e:
                                    traceback.print_exc()
                                    image_url_dict = {"mainUrl": "",
                                                      "thumbUrl": ""}
                                category_items = page_content.select(
                                    ".zg_hrsr_ladder a")
                                category_items = [a.text
                                                  for a in category_items]
                                product_details = page_content.select(
                                    "#productDetailsTable"
                                )  # ul:first-of-type")
                                try:
                                    product_details1 = product_details.pop(
                                    ).text.splitlines()
                                    quit_flag = 0
                                    for pd in product_details1:
                                        if pd.endswith("pages"):
                                            format, numpages = pd.split(":")
                                            numpages = numpages.replace(
                                                " pages", "").strip()
                                            quit_flag += 1
                                            continue
                                        if pd.startswith("Publisher: "):
                                            matches = re.match(
                                                r"Publisher: ([^;^(]*)\s?"
                                                r"([^(]*)?\W(.*)\W",
                                                pd).groups()
                                            publisher = matches[0]
                                            publication_date = matches[2]
                                            quit_flag += 1
                                            continue
                                        if quit_flag == 2:
                                            break
                                    else:
                                        publisher = ''
                                        format = ''
                                except Exception as e:
                                    traceback.print_exc()
                                    publisher = ''
                                    format = ''
                                if booktitle:
                                    return {
                                        "title": booktitle,
                                        "authors": author_name,
                                        "authors_as_string": ",".join(
                                            author_name),
                                        "categories_as_string": ",".join(
                                            category_items),
                                        "list_price": listprice,
                                        "publisher": publisher,
                                        "isbn": isbn,
                                        "orig_isbn": isbn,
                                        "large_url":
                                            image_url_dict["mainUrl"],
                                        "med_url":
                                            image_url_dict["mainUrl"],
                                        "small_url":
                                            image_url_dict["thumbUrl"],
                                        "format": format,
                                        "kind": "books",
                                        "known_title": known_title,
                                        "special_orders": [],
                                    }
                            except Exception as e:
                                traceback.print_exc()
                                print("using isbnlib from scraper",
                                      file=sys.stderr)
                                isbnlibbooks = []
                                try:
                                    isbnlibbooks = isbnlib.meta(str(isbn))
                                except:
                                    pass
                                if isbnlibbooks:
                                    return {
                                        "title": isbnlibbooks["Title"],
                                        "authors": isbnlibbooks["Authors"],
                                        "authors_as_string": ",".join(
                                            isbnlibbooks["Authors"]),
                                        "categories_as_string": None,
                                        "list_price": price,
                                        "publisher":
                                            isbnlibbooks["Publisher"],
                                        "isbn": isbn,
                                        "orig_isbn": isbn,
                                        "large_url": None,
                                        "med_url": None,
                                        "small_url": None,
                                        "format": None,
                                        "kind": "books",
                                        "known_title": known_title,
                                        "special_orders": [],
                                    }
                                else:
                                    return {}
                            else:
                                if title:
                                    return {
                                        "title": title,
                                        "authors": author_name,
                                        "authors_as_string": ",".join(
                                            author_name),
                                        "categories_as_string": ",".join(
                                            category_items),
                                        "list_price": listprice,
                                        "publisher": publisher,
                                        "isbn": isbn,
                                        "orig_isbn": isbn,
                                        "large_url":
                                            image_url_dict["mainUrl"],
                                        "med_url":
                                            image_url_dict["mainUrl"],
                                        "small_url":
                                            image_url_dict["thumbUrl"],
                                        "format": format,
                                        "kind": "books",
                                        "known_title": known_title,
                                        "special_orders": [],
                                    }
                                else:
                                    return {}
    else:
        return {}
def amazon(url):
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    page = requests.get(url, headers=headers)
    while True:
        sleep(3)
        try:
            parser_page = html.fromstring(page.content)
            raw_title = parser_page.xpath('//span[@id="productTitle"]//text()')
            raw_price = parser_page.xpath(
                '//span[@class="a-size-medium a-color-price offer-price '
                'a-text-normal"]//text()')
            raw_sale = parser_page.xpath(
                '//span[@class="a-size-base a-color-secondary"]//text()')
            raw_author = parser_page.xpath(
                '//a[@class="a-link-normal contributorNameID"]//text()')
            raw_category = parser_page.xpath(
                '//a[@class="a-link-normal a-color-tertiary"]//text()')
            raw_availability = parser_page.xpath(
                '//div[@id="availability"]//text()')
            ratings = parser_page.xpath('//table[@id="histogramTable"]//tr')
            reviews = parser_page.xpath(
                '//div[contains(@id,"reviews-summary")]')

            title = ''.join(''.join(raw_title).strip()) if raw_title else None
            sale = ''.join(
                ''.join(raw_sale).split()).strip() if raw_sale else None
            category = ' > '.join(
                [i.strip() for i in raw_category]) if raw_category else None
            price = ''.join(raw_price).strip() if raw_price else None
            availability = ''.join(
                raw_availability).strip() if raw_availability else None
            review_author = ''.join(raw_author).strip() if raw_author else None

            title_to_isbn = str(title)
            isbn = isbnlib.isbn_from_words(title_to_isbn)
            desc = str(isbnlib.desc(isbn))
            description = ''.join(desc).strip() if desc else None
            isbn10 = isbnlib.to_isbn10(isbn)
            raw_isbn13 = isbn[:3] + '-' + isbn[3:]
            isbn_13 = ''.join(raw_isbn13).strip() if raw_isbn13 else None
            isbn_10 = ''.join(isbn10).strip() if isbn10 else None

            if not reviews:
                reviews = parser_page.xpath('//div[@data-hook="review"]')

            # Rating
            ratings_dict = {}
            for ratings in ratings:
                extracted_rating = ratings.xpath('./td//a//text()')
                if extracted_rating:
                    rating_key = extracted_rating[0]
                    raw_rating_value = extracted_rating[1]
                    rating_value = raw_rating_value
                    if rating_key:
                        ratings_dict.update({rating_key: rating_value})

            # Reviews
            reviews_list = []
            for review in reviews:
                raw_review_header = review.xpath(
                    './/a[@data-hook="review-title"]//text()')
                raw_review_author = review.xpath(
                    './/a[contains(@href,"/profile/")]/parent::span//text()')
                raw_review_rating = review.xpath(
                    './/i[@data-hook="review-star-rating"]//text()')
                raw_review_posted_date = review.xpath(
                    './/a[contains(@href,"/profile/")]'
                    '/parent::span/following-sibling::span/text()')
                raw_review_text1 = review.xpath(
                    './/div[@data-hook="review-collapsed"]//text()')
                raw_review_text2 = review.xpath(
                    './/div//span[@data-action='
                    '"columnbalancing-showfullreview"]'
                    '/@data-columnbalancing-showfullreview')
                raw_review_text3 = review.xpath(
                    './/div[contains(@id,"dpReviews")]/div/text()')
                review_header = ' '.join(' '.join(raw_review_header).split())
                review_author = ''.join(
                    ''.join(raw_review_author).split()).strip('By')
                review_rating = ''.join(raw_review_rating).replace(
                    'out of 5 stars', '')
                review_posted_date = dateparser.parse(
                    ''.join(raw_review_posted_date)).strftime('%d %b %Y')
                review_text = ' '.join(' '.join(raw_review_text1).split())
                if raw_review_text2:
                    json_loaded_review_data = json.loads(raw_review_text2[0])
                    json_loaded_review_data_text = \
                        json_loaded_review_data['rest']
                    cleaned_json_loaded_review_data_text = re.sub(
                        '<.*?>', '', json_loaded_review_data_text)
                    full_review_text = (review_text +
                                        cleaned_json_loaded_review_data_text)
                else:
                    full_review_text = review_text
                if not raw_review_text1:
                    full_review_text = ' '.join(
                        ' '.join(raw_review_text3).split())
                review_dict = {
                    'review_header': review_header,
                    'review_author': review_author,
                    'review_rating': review_rating,
                    'review_posted_date': review_posted_date,
                    'review_text': full_review_text,
                }
                reviews_list.append(review_dict)

            if not price:
                price = sale
            if page.status_code != 200:
                raise ValueError('captcha')
            data = {
                'URL': url,
                'TITLE': title,
                'AUTHOR': review_author,
                'PRICE': price,
                'SALE': sale,
                'CATEGORY': category,
                'DESCRIPTION': description,
                'ISBN-10': isbn_10,
                'ISBN-13': isbn_13,
                'AVAILABILITY': availability,
                'RATING': ratings_dict,
                'REVIEW': reviews_list,
            }
            return data
        except Exception as e:
            print(e)
            # The original compared the exception object itself to the
            # string 'NoneType', which is never true; checking the message
            # keeps the apparent intent (bail out when a lookup returned
            # None).
            if 'NoneType' in str(e):
                return None
def marcxml_parsing(x):
    # tree = ElementTree.parse("./raw_data/sample_1k_marc.xml")
    tree = ElementTree.parse(x)
    collection = tree.getroot()
    code_336 = pd.read_csv("./raw_data/336_code.csv")
    code_337 = pd.read_csv("./raw_data/337_code.csv")
    code_338 = pd.read_csv("./raw_data/338_code.csv")
    features = []  # list of features
    SUBFIELD = '{http://www.loc.gov/MARC21/slim}subfield'
    for i in range(len(collection)):
        row = {}
        print("--------------------- " + str(i))
        record = collection[i]
        leader = record.find('{http://www.loc.gov/MARC21/slim}leader')
        row['leader_6'] = leader.text[6]
        row['leader_17'] = leader.text[17]
        row['leader_18'] = leader.text[18]
        control = record.findall(
            '{http://www.loc.gov/MARC21/slim}controlfield')
        F006 = 0
        F007 = 0
        for c in control:
            tag = c.get('tag')
            if tag == '001':
                row['F001_a'] = c.text  # OCLC control number
            if tag == '006':
                F006 = F006 + 1
            if tag == '007':
                F007 = F007 + 1
            if tag == '008':
                value = c.text
                pub_code = value[6]
                pub_year_1 = value[7:11]
                pub_year_2 = value[11:15]
                place = value[15:18]
                audience = value[22]
                cont_nature = value[24:28]
                government = value[28]
                literary = value[33]
                language = value[35:38]
                catalog_source = value[39]
                row['F008_06'] = pub_code
                row['F008_0710'] = pub_year_1
                row['F008_1114'] = pub_year_2
                row['F008_1517'] = place
                row['F008_22'] = audience
                for code in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'j',
                             'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
                             't', 'u', 'v', 'w', 'y', 'z', '2', '5', '6']:
                    row['F008_2427_' + code] = bool(
                        re.search(code, cont_nature))
                row['F008_28'] = government
                row['F008_33'] = literary
                row['F008_3537'] = language
                row['F008_39'] = catalog_source
                if place is None:
                    row['008_1517'] = "NA"
                if language is None:
                    row['008_3537'] = "NA"
                if len(catalog_source) == 0:
                    row['008_39'] = "NA"
        row['006_is'] = 1 if F006 > 0 else 0
        row['007_is'] = 1 if F007 > 0 else 0
        data = record.findall('{http://www.loc.gov/MARC21/slim}datafield')
        F040_e = F041_is = F050_is = F082_is = 0
        F260_is = F264_is = F26x_is = 0
        F336_is = F337_is = F338_is = F490_is = 0
        F6xxa_is = F6xxv_is = F6xxy_is = F6xxz_is = 0
        isbn_list, isbn_tag_list = [], []
        F041_a_list, F041_h_list = [], []
        F050_a1_list, F050_a2_list = [], []
        F082_a1_list, F082_a2_list = [], []
        F260_b_list, F260_c_list = [], []
        F264_b_list, F264_c_list = [], []
        F26x_b_list, F26x_c_list = [], []
        F336_b_list, F337_b_list, F338_b_list = [], [], []
        F490_a_list = []
        F6xx_a_list, F6xx_v_list, F6xx_y_list, F6xx_z_list = [], [], [], []
        for d in data:
            tag = d.get('tag')
            print("--------------------- " + str(i) + "---- " + tag)
            if tag == '020':
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'a':
                        isbn = s.text
                        if (len(isbn) == 10 and is_isbn10(str(isbn))
                                and mask(isbn) is not None):
                            isbn_list.append(str(isbn))
                            isbn_tag = '--'.join(mask(isbn).split("-")[0:2])
                            isbn_tag_list.append(isbn_tag)
                        elif (len(isbn) == 13 and is_isbn13(str(isbn))
                                and mask(isbn) is not None
                                and isbn[0:3] == "978"):
                            isbn_list.append(str(isbn))
                            isbn_tag = '--'.join(
                                mask(to_isbn10(isbn)).split("-")[0:2])
                            isbn_tag_list.append(isbn_tag)
            if tag == "040":
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'e':
                        if s.text == 'rda' or s.text == "RDA":
                            F040_e = F040_e + 1
            if tag == "041":
                F041_is = F041_is + 1
                F041_ind1 = d.get('ind1')
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'a':
                        F041_a_list.append(s.text)
                    if s.get('code') == 'h':
                        F041_h_list.append(s.text)
            if tag == '050':
                F050_is = F050_is + 1
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'a':
                        match = re.search(r'^[A-Z]{1,3}', str(s.text))
                        match2 = re.search(
                            r'^[A-Z]{1,3}[0-9]{1,}(?=\.|[A-z]|$| )',
                            str(s.text))
                        if match and match2:
                            F050_a1_list.append(match.group())
                            F050_a2_list.append(match2.group())
            if tag == '082':
                F082_is = F082_is + 1
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'a':
                        match = re.search(r'^[0-9]{3}', str(s.text))
                        if match:
                            F082_a1_list.append(match.group()[0])
                            F082_a2_list.append(match.group())
            if tag == '260':
                F260_is = F260_is + 1
                F26x_is = F26x_is + 1
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'b':
                        F260_b_list.append(s.text)
                        F26x_b_list.append(s.text)
                        if len(re.findall(
                                "printed by |distributed by |distributed in ",
                                s.text.lower())) > 0:
                            F260_is = F260_is - 1
                            F26x_is = F26x_is - 1
                    if s.get('code') == 'c':
                        F260_c_list.append(s.text)
                        F26x_c_list.extend(re.findall(r"\d{4}", s.text))
            if tag == '264' and d.get('ind2') == '1':
                F264_is = F264_is + 1
                F26x_is = F26x_is + 1
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'b':
                        F264_b_list.append(s.text)
                        F26x_b_list.append(s.text)
                        if len(re.findall(
                                "printed by |distributed by |distributed in ",
                                s.text.lower())) > 0:
                            F264_is = F264_is - 1
                            F26x_is = F26x_is - 1
                    if s.get('code') == 'c':
                        F264_c_list.append(s.text)
                        F26x_c_list.extend(re.findall(r"\d{4}", s.text))
            if tag == '336':
                F336_is = F336_is + 1
                subfields = d.findall(SUBFIELD)
                sub_code_list = [t.get("code") for t in subfields]
                b_is = "b" in sub_code_list
                a_is = "a" in sub_code_list
                if b_is > 0:
                    for s in subfields:
                        if s.get('code') == 'b':
                            F336_b_value = s.text
                        if s.get('code') == '2':
                            F336_2_value = s.text
                elif b_is == 0 and a_is > 0:
                    for s in subfields:
                        if (s.get('code') == 'a'
                                and s.text in code_336['336_a'].values):
                            F336_b_value = code_336.loc[
                                code_336['336_a'] == s.text,
                                '336_b'].values[0]
                        if s.get('code') == '2':
                            F336_2_value = s.text
                if "rda" in F336_2_value.lower():
                    F336_b_list.append(F336_b_value)
            if tag == '337':
                F337_is = F337_is + 1
                subfields = d.findall(SUBFIELD)
                sub_code_list = [t.get("code") for t in subfields]
                b_is = "b" in sub_code_list
                a_is = "a" in sub_code_list
                if b_is > 0:
                    for s in subfields:
                        if s.get('code') == 'b':
                            F337_b_value = s.text
                        if s.get('code') == '2':
                            F337_2_value = s.text
                elif b_is == 0 and a_is > 0:
                    for s in subfields:
                        if (s.get('code') == 'a'
                                and s.text in code_337['337_a'].values):
                            F337_b_value = code_337.loc[
                                code_337['337_a'] == s.text,
                                '337_b'].values[0]
                        if s.get('code') == '2':
                            F337_2_value = s.text
                if "rda" in F337_2_value.lower():
                    F337_b_list.append(F337_b_value)
            if tag == '338':
                F338_is = F338_is + 1
                subfields = d.findall(SUBFIELD)
                sub_code_list = [t.get("code") for t in subfields]
                b_is = "b" in sub_code_list
                a_is = "a" in sub_code_list
                if b_is > 0:
                    for s in subfields:
                        if s.get('code') == 'b':
                            F338_b_value = s.text
                        if s.get('code') == '2':
                            F338_2_value = s.text
                elif b_is == 0 and a_is > 0:
                    for s in subfields:
                        if (s.get('code') == 'a'
                                and s.text in code_338['338_a'].values):
                            F338_b_value = code_338.loc[
                                code_338['338_a'] == s.text,
                                '338_b'].values[0]
                        if s.get('code') == '2':
                            F338_2_value = s.text
                if "rda" in F338_2_value.lower():
                    F338_b_list.append(F338_b_value)
            if tag == '490':
                F490_is = F490_is + 1
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'a':
                        F490_a_list.append(s.text)
            if tag in ['600', '610', '611', '630', '650'] \
                    and d.get('ind2') == "0":
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'a':
                        F6xxa_is = F6xxa_is + 1
                        F6xx_a_list.append(clean_text(s.text))
                    if s.get('code') == 'v':
                        F6xxv_is = F6xxv_is + 1
                        F6xx_v_list.append(clean_text(s.text))
                    if s.get('code') == 'y':
                        F6xxy_is = F6xxy_is + 1
                        F6xx_y_list.append(clean_text(s.text))
                    if s.get('code') == 'z':
                        F6xxz_is = F6xxz_is + 1
                        F6xx_z_list.append(clean_text(s.text))
            if tag == "651" and d.get('ind2') == "0":
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'a':
                        F6xxz_is = F6xxz_is + 1
                        F6xx_z_list.append(clean_text(s.text))
                    if s.get('code') == 'v':
                        F6xxv_is = F6xxv_is + 1
                        F6xx_v_list.append(clean_text(s.text))
                    if s.get('code') == 'y':
                        F6xxy_is = F6xxy_is + 1
                        F6xx_y_list.append(clean_text(s.text))
                    if s.get('code') == 'z':
                        F6xxz_is = F6xxz_is + 1
                        F6xx_z_list.append(clean_text(s.text))
            if tag == "655" and d.get('ind2') == "0":
                for s in d.findall(SUBFIELD):
                    if s.get('code') == 'a':
                        F6xxv_is = F6xxv_is + 1
                        F6xx_v_list.append(clean_text(s.text))
                    if s.get('code') == 'v':
                        F6xxv_is = F6xxv_is + 1
                        F6xx_v_list.append(clean_text(s.text))
                    if s.get('code') == 'y':
                        F6xxy_is = F6xxy_is + 1
                        F6xx_y_list.append(clean_text(s.text))
                    if s.get('code') == 'z':
                        F6xxz_is = F6xxz_is + 1
                        F6xx_z_list.append(clean_text(s.text))
        isbn_list1 = set(isbn_list)
        isbn_tag_list1 = set(isbn_tag_list)
        if len(isbn_tag_list) > 0:
            row['isbn'] = " ;; ".join(set(isbn_list1))
            row['isbn_tag'] = " ;; ".join(set(isbn_tag_list1))
            row['isbn1'] = isbn_list[0]
            row['isbn_tag1'] = isbn_tag_list[0]
        else:
            row['isbn'] = "NA"
            row['isbn_tag'] = "NA"
            row['isbn1'] = "NA"
            row['isbn_tag1'] = "NA"
        row['F040_e'] = 1 if F040_e > 0 else 0
        if F041_is > 0:
            row['F041_ind1'] = F041_ind1
            row['F041_a'] = " ;; ".join(F041_a_list)
            row['F041_h'] = " ;; ".join(F041_h_list)
        else:
            row['F041_ind1'] = "NA"
            row['F041_a'] = "NA"
            row['F041_h'] = "NA"
        if len(F050_a1_list) > 0:
            row['F050_a1'] = " ;; ".join(set(F050_a1_list))
            row['F050_a2'] = " ;; ".join(set(F050_a2_list))
        else:
            row['F050_a1'] = "NA"
            row['F050_a2'] = "NA"
        if len(F082_a1_list) > 0:
            row['F082_a1'] = " ;; ".join(set(F082_a1_list))
            row['F082_a2'] = " ;; ".join(set(F082_a2_list))
        else:
            row['F082_a1'] = "NA"
            row['F082_a2'] = "NA"
        row['F260_is'] = F260_is
        if F260_is > 0:
            row['F260_b'] = " ;; ".join(F260_b_list)
            row['F260_c'] = " ;; ".join(F260_c_list)
        else:
            row['F260_b'] = "NA"
            row['F260_c'] = "NA"
        row['F264_is'] = F264_is
        if F264_is > 0:
            row['F264_b'] = " ;; ".join(F264_b_list[0:0 + F26x_is])
            row['F264_c'] = " ;; ".join(F264_c_list)
        else:
            row['F264_b'] = "NA"
            row['F264_c'] = "NA"
        row['F26x_is'] = F26x_is
        if F26x_is > 0:
            row['F26x_b'] = " ;; ".join(set(F26x_b_list[0:0 + F26x_is]))
            row['F26x_c'] = " ;; ".join(set(F26x_c_list))
        else:
            row['F26x_b'] = "NA"
            row['F26x_c'] = "NA"
        if F336_is > 0:
            row['F336_b'] = " ;; ".join(F336_b_list)
            row['F336_b_txt'] = bool(re.search('txt', row['F336_b']))
            row['F336_b_sti'] = bool(re.search('sti', row['F336_b']))
            row['F336_b_cri'] = bool(re.search('cri', row['F336_b']))
            row['F336_b_spw'] = bool(re.search('spw', row['F336_b']))
            row['F336_b_tct'] = bool(re.search('tct', row['F336_b']))
        else:
            row['F336_b'] = "NA"
            row['F336_b_txt'] = ""
            row['F336_b_sti'] = ""
            row['F336_b_cri'] = ""
            row['F336_b_spw'] = ""
            row['F336_b_tct'] = ""
        if F337_is > 0:
            row['F337_b'] = " ;; ".join(F337_b_list)
            row['F337_b_c'] = bool(re.search('c', row['F337_b']))
            row['F337_b_h'] = bool(re.search('h', row['F337_b']))
            row['F337_b_n'] = bool(re.search('n', row['F337_b']))
            row['F337_b_s'] = bool(re.search('s', row['F337_b']))
        else:
            row['F337_b'] = "NA"
            row['F337_b_c'] = ""
            row['F337_b_h'] = ""
            row['F337_b_n'] = ""
            row['F337_b_s'] = ""
        if F338_is > 0:
            row['F338_b'] = " ;; ".join(F338_b_list)
            row['F338_b_cd'] = bool(re.search('cd', row['F338_b']))
            row['F338_b_cr'] = bool(re.search('cr', row['F338_b']))
            row['F338_b_hd'] = bool(re.search('hd', row['F338_b']))
            row['F338_b_he'] = bool(re.search('he', row['F338_b']))
            row['F338_b_nb'] = bool(re.search('nb', row['F338_b']))
            row['F338_b_sd'] = bool(re.search('sd', row['F338_b']))
        else:
            row['F338_b'] = "NA"
            row['F338_b_cd'] = ""
            row['F338_b_cr'] = ""
            row['F338_b_hd'] = ""
            row['F338_b_he'] = ""
            row['F338_b_nb'] = ""
            row['F338_b_sd'] = ""
        if F490_is > 0:
            row['F490_a'] = " ;; ".join(F490_a_list)
        else:
            row['F490_a'] = "NA"
        row['F6xx_a'] = " ;; ".join(set(F6xx_a_list)) if F6xxa_is > 0 else "NA"
        row['F6xx_v'] = " ;; ".join(set(F6xx_v_list)) if F6xxv_is > 0 else "NA"
        row['F6xx_y'] = " ;; ".join(set(F6xx_y_list)) if F6xxy_is > 0 else "NA"
        row['F6xx_z'] = " ;; ".join(set(F6xx_z_list)) if F6xxz_is > 0 else "NA"
        features.append(row)
    df = pd.DataFrame(features)
    return df
def enrich_document_data(sender, json=None, record=None, index=None,
                         doc_type=None, arguments=None, **dummy_kwargs):
    """Signal sent before a record is indexed.

    :param json: The dumped record dictionary which can be modified.
    :param record: The record being indexed.
    :param index: The index in which the record will be indexed.
    :param doc_type: The doc_type for the record.
    """
    if index.split('-')[0] == DocumentsSearch.Meta.index:
        # HOLDINGS
        holdings = []
        document_pid = record['pid']
        es_holdings = HoldingsSearch()\
            .filter('term', document__pid=document_pid)\
            .scan()
        for holding in es_holdings:
            holding = holding.to_dict()
            hold_data = {
                'pid': holding['pid'],
                'location': {
                    'pid': holding['location']['pid'],
                },
                'circulation_category': [{
                    'pid': holding['circulation_category']['pid']
                }],
                'organisation': {
                    'organisation_pid': holding['organisation']['pid'],
                    'library_pid': holding['library']['pid']
                }
            }
            # Index additional holdings fields into the document record
            holdings_fields = [
                'call_number', 'second_call_number', 'index',
                'enumerationAndChronology', 'supplementaryContent',
                'local_fields'
            ]
            for field in holdings_fields:
                if field in holding:
                    hold_data[field] = holding.get(field)
            # Index holdings notes
            notes = [n['content'] for n in holding.get('notes', []) if n]
            if notes:
                hold_data['notes'] = notes
            # Index items attached to each holdings record
            es_items = ItemsSearch()\
                .filter('term', holding__pid=holding['pid'])\
                .scan()
            for item in es_items:
                item = item.to_dict()
                item_data = {
                    'pid': item['pid'],
                    'barcode': item['barcode'],
                    'status': item['status'],
                    'local_fields': item.get('local_fields'),
                    'call_number': item.get('call_number'),
                    'second_call_number': item.get('second_call_number'),
                    'temporary_item_type': item.get('temporary_item_type')
                }
                if 'temporary_item_type' in item:
                    hold_data['circulation_category'].append(
                        {'pid': item['temporary_item_type']['pid']})
                item_data = {k: v for k, v in item_data.items() if v}
                # item acquisition part.
                # We need to store the acquisition data of the items into the
                # document. As we need to link acquisition date and
                # org/lib/loc, we need to store these data together in a
                # 'nested' structure.
                acq_date = item.get('acquisition_date')
                if acq_date:
                    item_data['acquisition'] = {
                        'organisation_pid': holding['organisation']['pid'],
                        'library_pid': holding['library']['pid'],
                        'location_pid': holding['location']['pid'],
                        'date': acq_date
                    }
                # item notes content.
                # index the content of the public notes into the document.
                public_notes_content = [
                    n['content']
                    for n in item.get('notes', [])
                    if n['type'] in ItemNoteTypes.PUBLIC
                ]
                if public_notes_content:
                    item_data['notes'] = public_notes_content
                hold_data.setdefault('items', []).append(item_data)
            holdings.append(hold_data)
        if holdings:
            json['holdings'] = holdings

        # MEF contribution ES index update
        contributions = create_contributions(json.get('contribution', []))
        if contributions:
            json.pop('contribution', None)
            json['contribution'] = contributions
        # TODO: compare record with those in DB to check which authors have
        # to be deleted from index

        # Index host document title in child document (part of)
        if 'partOf' in record:
            title = {'type': 'partOf'}
            for part_of in record['partOf']:
                doc_pid = extracted_data_from_ref(
                    part_of.get('document')
                )
                document = Document.get_record_by_pid(doc_pid)
                for part_of_title in document.get('title', []):
                    if 'mainTitle' in part_of_title:
                        title['partOfTitle'] = part_of_title.get(
                            'mainTitle'
                        )
            json['title'].append(title)

        # sort title
        sort_title = title_format_text_head(
            json.get('title', []),
            with_subtitle=True
        )
        language = language_mapping(json.get('language')[0].get('value'))
        if current_app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False):
            sort_title = current_app.\
                extensions['reroils-normalizer-stop-words'].\
                normalize(sort_title, language)
        json['sort_title'] = sort_title

        # Local fields in JSON
        local_fields = LocalField.get_local_fields_by_resource(
            'doc', document_pid)
        if local_fields:
            json['local_fields'] = local_fields

        # index both ISBN 10 and 13 format
        def filter_isbn(identified_by):
            """Filter identified_by for type bf:Isbn."""
            return identified_by.get('type') == 'bf:Isbn'

        filtered_identified_by = filter(
            filter_isbn,
            json.get('identifiedBy', [])
        )
        isbns = set()
        for identified_by in filtered_identified_by:
            isbn = identified_by['value']
            isbns.add(isbn)
            if is_isbn10(isbn):
                isbns.add(to_isbn13(isbn))
            elif is_isbn13(isbn):
                isbns.add(to_isbn10(isbn))
        if isbns:
            json['isbn'] = list(isbns)

        # Populate sort date new and old for use in sorting
        pub_provisions = [
            p for p in record.get('provisionActivity', [])
            if p['type'] == 'bf:Publication'
        ]
        pub_provision = next(iter(pub_provisions), None)
        if pub_provision:
            json['sort_date_new'] = \
                pub_provision.get('endDate', pub_provision.get('startDate'))
            json['sort_date_old'] = pub_provision.get('startDate')
def com_isbn_13_to_10(isbn_string):
    return isbnlib.to_isbn10(isbn_string)
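A usage sketch for this wrapper (illustrative values). Note that only 978-prefixed ISBN-13s have an ISBN-10 equivalent; for a 979 input, isbnlib cannot convert and returns a falsy value ('' in recent versions):

print(com_isbn_13_to_10('9780306406157'))  # -> '0306406152'
print(com_isbn_13_to_10('9791090636071'))  # 979 prefix: no ISBN-10 form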