Example #1
    async def check(self, entry):
        length = self._cfg.get('isbn_length', entry, 13)
        if not length:
            return []

        isbn = entry.data.get('isbn')
        if not isbn:
            return []

        clean_isbn = clean(isbn)
        if not clean_isbn or notisbn(clean_isbn):
            return []

        if length not in (10, 13):
            raise ConfigurationError(
                "The option 'isbn_length' must be either 10 or 13.")

        if length == 10:
            if not is_isbn10(clean_isbn):
                return [(type(self).NAME,
                         "ISBN '{}' is not of length 10.".format(isbn),
                         "ISBN-10 would be '{}'".format(to_isbn10(clean_isbn)))
                        ]
        elif length == 13:
            if not is_isbn13(clean_isbn):
                return [(type(self).NAME,
                         "ISBN '{}' is not of length 13.".format(isbn),
                         "ISBN-13 would be '{}'".format(to_isbn13(clean_isbn)))
                        ]

        return []
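Note: the validation pattern above (clean the raw value, reject non-ISBNs with notisbn(), then branch on is_isbn10()/is_isbn13() and suggest the converted form) works just as well outside a checker class. A minimal standalone sketch using only the public isbnlib helpers shown above; the function name suggest_isbn_form() is hypothetical:

import isbnlib

def suggest_isbn_form(raw, length=13):
    # Strip hyphens/spaces and reject strings that are not ISBNs at all.
    clean_isbn = isbnlib.clean(raw)
    if not clean_isbn or isbnlib.notisbn(clean_isbn):
        return None
    # Suggest the requested form if the input is valid but the other length.
    if length == 13 and not isbnlib.is_isbn13(clean_isbn):
        return isbnlib.to_isbn13(clean_isbn)
    if length == 10 and not isbnlib.is_isbn10(clean_isbn):
        return isbnlib.to_isbn10(clean_isbn)
    return clean_isbn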
Example #2
File: pdf.py Project: niwyss/Book
	def extractISBN(self):

		isbn = None

		rsrcmgr = PDFResourceManager()
		retstr = StringIO()
		device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
		interpreter = PDFPageInterpreter(rsrcmgr, device)

		for page in PDFPage.get_pages(self.pdf, set(), maxpages=0, password="", caching=True, check_extractable=True):

			# Get the text from the page
			interpreter.process_page(page)
			text = retstr.getvalue()
			retstr.truncate(0)

			# Extract ISBN
			isbn = self.searchCodeInPage(text)

			if isbn:
				break

		device.close()
		retstr.close()

		# Convert to ISBN 10 and 13 (isbn may be None if no code was found)
		if isbn and isbnlib.is_isbn10(isbn):
			self.isbn10 = isbn
			self.isbn13 = isbnlib.to_isbn13(self.isbn10)
		elif isbn and isbnlib.is_isbn13(isbn):
			self.isbn13 = isbn
			self.isbn10 = isbnlib.to_isbn10(self.isbn13)
Example #3
def format_data(termname):
	file_names = glob.glob('../terms/' + termname + '/course_books/*.json')
	dic = {}
	with open('../good_urls.json', 'r') as good_file:
		good_urls = json.loads(good_file.read())
	with open('../bad_urls.json', 'r') as bad_file:
		bad_urls = json.loads(bad_file.read())
	for f in file_names:
		with open(f) as infile:
			data = json.loads(infile.read())
			for d in data:
				ret = []
				for i in range(len(d['isbns'])):
					number = isbnlib.to_isbn10(d['isbns'][i])
					name = ','.join(d['names'][i].split(',')[:-1]) + ', Isbn: ' + d['isbns'][i]
					if number:
						url ='http://www.amazon.com/gp/product/' + number + '/ref=as_li_tl?ie=UTF8&camp=1789&creative=9325&creativeASIN=' + number + '&linkCode=as2&tag=mocksched-20&linkId=EMBDL7BV7IXRB44G'
						url_for_checking = 'http://www.amazon.com/gp/product/' + number + '/ref=as_li_tl?ie=UTF8&camp=1789&creative=9325&creativeASIN=' + number + '&linkCode=as2&linkId=EMBDL7BV7IXRB44G'
						if url in good_urls:
							ret.append([d['status'][i],url,name])
						elif url in bad_urls:
							ret.append([d['status'][i],'',name])
						elif check_url(url_for_checking):
							ret.append([d['status'][i],'',name])
							bad_urls.append(url)
						else:
							ret.append([d['status'][i],url,name])
							good_urls.append(url)
					else:
						ret.append([d['status'][i],'',name])
				dic[d['title']] = ret
	with open('../terms/' + termname + '/books.json','w') as output:
		json.dump(dic,output)
	with open('../good_urls.json','w') as good:
		json.dump(good_urls,good)
	with open('../bad_urls.json','w') as bad:
		json.dump(bad_urls,bad)
Example #4
	def isbn(self, isbn):
		# adds isbn to Google spreadsheet
		
		#check if valid
		clean_isbn = isbnlib.clean(isbn)
		if isbnlib.notisbn(clean_isbn):
			return "not valid isbn"
		
		#should check if has been collected before

		canonical = None
		#first check trove
		canonical = self.trove.extract(clean_isbn)
		if not canonical:
			# try alternative isbn form
			print "trying alternative form"
			alt_isbn = clean_isbn
			if isbnlib.is_isbn13(clean_isbn):
				alt_isbn = isbnlib.to_isbn10(clean_isbn)
			else:
				alt_isbn = isbnlib.to_isbn13(clean_isbn)
			canonical = self.trove.extract(alt_isbn)
			if canonical:
				clean_isbn = alt_isbn
		if not canonical:
			canonical = self.__reduce_metadata(clean_isbn,['merge','isbndb','openl'])
			if not canonical:
				return "no metadata found for isbn: " + clean_isbn
			canonical['source']='isbnlib'
			canonical["Authors"] = u', '.join(canonical["Authors"])
			canonical['link']=None

		row_data = ['isbn:'+clean_isbn, canonical["Title"], canonical["Authors"], canonical["Year"], canonical["Publisher"],canonical['link']]
		return self.__add_and_render(row_data)
Example #5
def extract_identifiers_from_row(row, isbn_columns):
    cols = [int(x) for x in isbn_columns.split(',')]
    isbns = set()
    for isbn_column in cols:
        raw = row[isbn_column].strip('"=')
        isbns.add(raw)
        # Transform to ISBN 10 or 13.
        if isbnlib.is_isbn13(raw):
            isbns.add(isbnlib.to_isbn10(raw))
        elif isbnlib.is_isbn10(raw):
            isbns.add(isbnlib.to_isbn13(raw))
    return isbns
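A caveat for two-way conversion like the above: only 978-prefixed ISBN-13s have an ISBN-10 equivalent, and to_isbn10() is expected to yield a falsy value when no conversion exists (Example #13 below relies on exactly that behavior). A hedged sketch that keeps such results out of the set; both_forms() is a hypothetical helper:

import isbnlib

def both_forms(raw):
    # Return the input ISBN plus its alternate-length form, when one exists.
    forms = {raw}
    if isbnlib.is_isbn13(raw):
        other = isbnlib.to_isbn10(raw)  # falsy for 979-* ISBNs
    elif isbnlib.is_isbn10(raw):
        other = isbnlib.to_isbn13(raw)
    else:
        other = None
    if other:
        forms.add(other)
    return forms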
Example #6
def isbn_to_asin(isbn):  # returns isbn10 (asin)
    clean = isbnlib.canonical(isbn)
    # Use the canonical (hyphen-free) length so hyphenated input still matches.
    if len(clean) == 10:
        if isbnlib.is_isbn10(clean):
            return clean
        return '0'
    elif len(clean) == 13:
        if isbnlib.is_isbn13(clean):
            return isbnlib.to_isbn10(clean)
        return '0'
    return '0'
Example #7
def add_book():
    form = AddBookForm()
    if form.validate_on_submit():
        isbn = request.form['isbn']
        if isbnlib.is_isbn13(isbn):
            isbn = isbnlib.to_isbn10(isbn)
        if not isbnlib.is_isbn10(isbn):
            flash('Enter valid ISBN', 'error')
            return redirect(url_for('admin_page.add_book'))

        book_data = isbnlib.meta(isbn)
        book_cover = isbnlib.cover(isbn)
Example #8
def converter():
    if request.method == 'POST':
        ISBN_13 = isbnlib.canonical(request.form['ISBN-13'])
        ISBN_10 = isbnlib.canonical(request.form['ISBN-10'])

        # converts ISBNs in both directions
        flash(isbnlib.to_isbn13(ISBN_10))
        flash(isbnlib.to_isbn10(ISBN_13))

        return redirect(url_for('book.converter'))

    return render_template('book/converter.html')
Example #9
def is_isbn_code(search):
    """Check whether the received string is a valid ISBN; if so, return its ISBN-10 form."""
    check = ''.join(ch for ch in search if ch.isalnum())

    if is_isbn13(check):
        return to_isbn10(check)

    if is_isbn10(check):
        return check

    return False
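For reference, illustrative calls to the function above (the sample ISBNs are the usual documentation examples, not from the original project):

print(is_isbn_code('978-0-306-40615-7'))  # ISBN-13 input -> '0306406152'
print(is_isbn_code('0-306-40615-2'))      # ISBN-10 input -> '0306406152'
print(is_isbn_code('not a book'))         # -> False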
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=argparse.FileType("r"))

    args = parser.parse_args()

    temp = []
    for item in yaml.safe_load(args.input):
        store = OrderedDict([
            ("id", item["id"]),
            ("title", item["title"]["name"]),
            ("links", []),
        ])
        for identifier in item["identifier"]:
            if identifier["domain"] != "isbn":
                continue
            store["links"].append({
                "domain": "amazon.co.jp",
                "url": "https://www.amazon.co.jp/dp/{}".format(
                    isbnlib.to_isbn10(identifier["id"])),
                "id": isbnlib.to_isbn10(identifier["id"]),
            })
            store["links"].append({
                "domain": "kindle.amazon.co.jp",
                "url": "https://www.amazon.co.jp/dp/",
                "id": "",
            })
        temp.append(store)

    print(
        yaml.dump(temp, allow_unicode=True, default_flow_style=False,
                  indent=1))

    return 0
Example #11
    def update_cover(self):

        image = requests.get(
            'http://images.amazon.com/images/P/%s.01._SS500_SCLZZZZZZZ_.jpg' %
            (isbnlib.to_isbn10(str(self.isbn)), ))

        if image.status_code == 200 and len(image.content) > 50:
            img_temp = NamedTemporaryFile(delete=True)
            img_temp.write(image.content)
            img_temp.flush()

            self.cover.save('%s.jpg' % (self.isbn, ), File(img_temp))

        else:
            self.cover.delete()
Example #12
def tupleData(isbn, data, quan, cond, pallet, lot):
    tempList = []
    for value in data.values():
        tempList.append(value)

    tempList.append(isbnlib.to_isbn10(isbn)) # Add ISBN 10
    tempList.append(quan) # Add the quantity of the book
    tempList.append(cond) # Add the condition of the book
    tempList.append(lot) # Add the lot number
    tempList.append(pallet) # Add the pallet number
    #tempList.append(isbnlib.cover(isbn))  # Add cover of ISBN

    return tuple(tempList)
Example #13
def query(isbn):
    """Query the BnF Catalogue Général service for metadata."""
    # Quirk (see issue #1)
    isbn10 = to_isbn10(isbn)
    if isbn10:
        isbn_query = "(bib.isbn%20all%20%22{isbn10}%22%20or%20bib.isbn%20"\
                     "all%20%22{isbn}%22)".format(isbn10=isbn10, isbn=isbn)
    else:
        isbn_query = "bib.isbn%20all%20%22{isbn}%22".format(isbn=isbn)
    data = wquery(SERVICE_URL.format(isbn=isbn_query),
                  user_agent=UA,
                  parser=parser_bnf)
    if not data:  # pragma: no cover
        LOGGER.debug('No data from BnF Catalogue Général for isbn %s', isbn)
        return {}
    return _mapper(isbn, data)
Example #14
    def test_lookup_by_isbn10_is_invalid(self):
        # translation table of checkdigits to wrong ones (digit plus 1)
        tr_table = dict(
            list(
                zip(
                    ["x", "X"] + list(map(str, list(range(9, -1, -1)))),
                    ["0", "0", "x"] + list(map(str, list(range(9, 0, -1)))),
                )
            )
        )
        random_item = random.sample(list(Title.select("isbn RLIKE '^[0-9]{13}$'")), 1)[
            0
        ]
        wrong_isbn = isbnlib.to_isbn10(random_item.isbn)
        wrong_isbn = wrong_isbn[0:9] + tr_table[wrong_isbn[9]]
        with self.assertRaises((isbnlib.NotValidISBNError, isbnlib._exceptions.NotValidISBNError)):
            result = inventory.lookup_by_isbn(wrong_isbn)
Example #15
def get_isbn10(reference, verbose=False):
    isbn10_array = []

    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(reference.identifiers, ["isbn10", "isbn-10", "isbn 10"]):
        if iden["identifier"] not in isbn10_array:
            isbn10_array.append(iden["identifier"])

    if isbn10_array:
        return isbn10_array

    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(reference.identifiers, ["isbn13", "isbn-13", "isbn 13"]):
        try:
            isbn10_temp = isbnlib.to_isbn10(iden["identifier"])
            if isbn10_temp not in isbn10_array:
                isbn10_array.append(isbn10_temp)
                gnomics.objects.reference.Reference.add_identifier(reference, identifier=isbn10_temp, identifier_type="ISBN-10", source="ISBNlib", language=None)
        except Exception:
            if verbose:
                print("No corresponding ISBN-10 found.")

        for obj in gnomics.objects.reference.Reference.openlibrary(reference):
            if obj["isbn_10"] not in isbn10_array:
                isbn10_array.append(obj["isbn_10"])
                gnomics.objects.reference.Reference.add_identifier(reference, identifier=obj["isbn_10"], identifier_type="ISBN-10", source="OpenLibrary", language=None)

    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(reference.identifiers, ["openlibrary", "openlibrary id", "openlibrary identifier", "olid"]):
        for obj in gnomics.objects.reference.Reference.openlibrary(reference):
            if obj["isbn_10"] not in isbn10_array:
                isbn10_array.append(obj["isbn_10"])
                gnomics.objects.reference.Reference.add_identifier(reference, identifier=obj["isbn_10"], identifier_type="ISBN-10", source="OpenLibrary", language=None)

    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(reference.identifiers, ["lccn", "library of congress control number"]):
        for obj in gnomics.objects.reference.Reference.openlibrary(reference):
            if obj["isbn_10"] not in isbn10_array:
                isbn10_array.append(obj["isbn_10"])
                gnomics.objects.reference.Reference.add_identifier(reference, identifier=obj["isbn_10"], identifier_type="ISBN-10", source="OpenLibrary", language=None)

    for iden in gnomics.objects.auxiliary_files.identifier.filter_identifiers(reference.identifiers, ["oclc", "oclc number", "oclc control number"]):
        for obj in gnomics.objects.reference.Reference.openlibrary(reference):
            if obj["isbn_10"] not in isbn10_array:
                isbn10_array.append(obj["isbn_10"])
                gnomics.objects.reference.Reference.add_identifier(reference, identifier=obj["isbn_10"], identifier_type="ISBN-10", source="OpenLibrary", language=None)

    return isbn10_array
Example #16
def imetafrom_isbnlib(isbn10=None, isbn13=None):
    import isbnlib
    # Try first with isbnlib: derive the missing form from the one supplied
    print("TRY FIRST WITH ISBNLIB")
    if isbn10: isbn13 = isbnlib.to_isbn13(isbn10)
    if isbn13: isbn10 = isbnlib.to_isbn10(isbn13)
    #if isbnlib.is_isbn10(isbn10) or isbnlib.is_isbn13(isbn13) or isbnlib.is_isbn10(isbn13) or isbnlib.is_isbn13(isbn10):
    primar_info10 = isbnlib.meta(isbn10, service='default', cache='default')
    primar_info13 = isbnlib.meta(isbn13, service='default', cache='default')
    if primar_info10: print(primar_info10)
    if primar_info13: print(primar_info13)
    if primar_info10 and primar_info13:
        return (isbn10, isbn13, primar_info10, primar_info13)
    elif primar_info10:
        return (isbn10, isbn13, primar_info10)
    elif primar_info13:
        return (isbn10, isbn13, primar_info13)
    else:
        return None
Example #17
def main():

    parser = argparse.ArgumentParser(description="Allows querying the unified collection.")
    parser.add_argument("-isbn", help="specifies the ISBN-13 value to look up", required=True)

    args = parser.parse_args()

    if not il.is_isbn13(args.isbn):
        print("ERROR: argument is not a valid ISBN: %s" % args.isbn)
        return

    if not os.path.isfile(JSON_ACERVO_UNIFICADO):
        print("ERROR: unified collection not found")
        return

    df_unificado = pd.read_json(JSON_ACERVO_UNIFICADO, dtype=str)

    for index, row in df_unificado.iterrows():
        isbn = str(row['isbn13'])
        if isbn == args.isbn:
            print(index)
            print(il.to_isbn10(isbn))
            print(row)
Example #18
scope = ['https://spreadsheets.google.com/feeds']
credentials = ServiceAccountCredentials.from_json_keyfile_name(GOOGLE_OAUTH, scope)
gc = gspread.authorize(credentials)
link = GOOGLE_SPREADSHEET
trove = Trove()
wks = gc.open_by_url(link).sheet1

for i in xrange(631, 1000):
	row = wks.row_values(i)
	if row[5] == 'None':
		isbn = row[0][5:]
		old_isbn = isbn
		print 'getting ', isbn

		if isbnlib.is_isbn13(isbn):
			isbn = isbnlib.to_isbn10(isbn)
		else:
			isbn = isbnlib.to_isbn13(isbn)
		canonical = trove.extract(isbn)
		if not canonical:
			canonical = trove.extract(old_isbn)
		if canonical:
			print '---------------------------'
			print 'Replacing', i, 'row'
			#print row
			row_data = ['isbn:'+isbn, canonical["Title"], canonical["Authors"], canonical["Year"], canonical["Publisher"],canonical['link']]
			#print row_data
			print "updating "
			for j in range(0,len(row_data)):
				print '-cell',i,j,':'
				print '\tfrom:', row[j]
Example #19
def main():
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-path',
        '--GCIS',
        help="Insert url path to GCIS book in JSON format "
             "[ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1']"
    )
    args = parser.parse_args()
    GCIS = args.GCIS

    if GCIS is None:
        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
        print(
            'NO MANUAL GCIS PATH\n ALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT'
        )

    GCISPAR = parse(GCIS)
    for x in range(len(GCISPAR)):
        try:
            #Extracts book identifier from GCIS#
            IDEN = GCISPAR[x]["identifier"]
            match = re.search(r'.*/(.*?)\..*?$', GCIS)
            if match:
                FILETYPE = match.groups()[0]
            # HREF = url that leads to book.json in GCIS-DEV
            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
                FILETYPE, IDEN)
            HREFPAR = parse(HREF)
            #Extracts book title and isbn from GCIS-DEV
            d = dict(HREFPAR)
            TITLE = d['title']
            ISBNS = d['isbn']
            #Cleans ISBNS to only contain valid characters
            CISBN = clean(ISBNS)
            #V13 = validated canonical ISBN-13
            V13 = EAN13(CISBN)
            if V13 is None:
                V13 = canonical(CISBN)
            M = parse(HREF)

            print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS,
                  '\n\n\t', "isbn_mod:", V13, "\n\n")

            #DBpedia ISBN formats
            a = ISBNS
            b = canonical(CISBN)
            c = to_isbn10(CISBN)
            d = hyphenate(to_isbn10(CISBN))
            e = to_isbn13(CISBN)
            f = hyphenate(to_isbn13(CISBN))
            g = V13
            h = "ISBN {}".format(CISBN)
            i = "ISBN {}".format(canonical(CISBN))
            j = "ISBN {}".format(hyphenate(to_isbn13(CISBN)))
            k = "ISBN {}".format(V13)
            l = "ISBN {}".format(to_isbn10(CISBN))
            m = "ISBN {}".format(hyphenate(to_isbn10(CISBN)))

            tests = [a, b, c, d, e, f, g, h, i, j, k, l, m]

            for indie in tests:
                r = QUERY % indie
                RQUERY(r)
                if len(RQUERY(r)) != 0:
                    print(RQUERY(r))
                    break

        except Exception:
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(
                TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
Example #20
def enrich_document_data(sender,
                         json=None,
                         record=None,
                         index=None,
                         doc_type=None,
                         arguments=None,
                         **dummy_kwargs):
    """Signal sent before a record is indexed.

    :param json: The dumped record dictionary which can be modified.
    :param record: The record being indexed.
    :param index: The index in which the record will be indexed.
    :param doc_type: The doc_type for the record.
    """
    if index.split('-')[0] == DocumentsSearch.Meta.index:
        # HOLDINGS
        holdings = []
        document_pid = record['pid']
        es_holdings = HoldingsSearch().filter(
            'term', document__pid=document_pid).scan()
        for holding in es_holdings:
            data = {
                'pid': holding.pid,
                'location': {
                    'pid': holding['location']['pid'],
                },
                'circulation_category': {
                    'pid': holding['circulation_category']['pid'],
                },
                'organisation': {
                    'organisation_pid': holding['organisation']['pid'],
                    'library_pid': holding['library']['pid']
                }
            }
            # Index additional holdings fields into the document record
            holdings_fields = [
                'call_number', 'second_call_number', 'index',
                'enumerationAndChronology', 'supplementaryContent'
            ]
            dict_holding = holding.to_dict()
            for field in holdings_fields:
                if dict_holding.get(field):
                    data[field] = dict_holding.get(field)
            # Index holdings notes
            notes = [
                note['content'] for note in dict_holding.get('notes', [])
                if note
            ]
            if notes:
                data['notes'] = notes
            # Index holdings local fields
            if 'local_fields' in holding:
                data['local_fields'] = dict_holding['local_fields']

            # Index items attached to each holdings record
            es_items = list(ItemsSearch().filter(
                'term', holding__pid=holding.pid).scan())
            for item in es_items:
                item = item.to_dict()
                item_record = {
                    'pid': item['pid'],
                    'barcode': item['barcode'],
                    'status': item['status'],
                    'available': item['available'],
                    'local_fields': item.get('local_fields'),
                    'call_number': item.get('call_number')
                }
                item_record = {k: v for k, v in item_record.items() if v}

                # item acquisition part.
                #   We need to store the acquisition data of the items into the
                #   document. As we need to link acquisition date and
                #   org/lib/loc, we need to store these data together in a
                #   'nested' structure.
                acq_date = item.get('acquisition_date')
                if acq_date:
                    item_record['acquisition'] = {
                        'organisation_pid': holding['organisation']['pid'],
                        'library_pid': holding['library']['pid'],
                        'location_pid': holding['location']['pid'],
                        'date': acq_date
                    }
                # item notes content.
                #   index the content of the public notes into the document.
                public_notes_content = [
                    n['content'] for n in item.get('notes', [])
                    if n['type'] in ItemNoteTypes.PUBLIC
                ]
                if public_notes_content:
                    item_record['notes'] = public_notes_content

                # related collection
                #   index the collection title and description
                item_obj = Item.get_record_by_pid(item['pid'])
                for collection in item_obj.in_collection():
                    coll_data = {
                        'title': collection.get('title'),
                        'description': collection.get('description')
                    }
                    coll_data = {k: v for k, v in coll_data.items() if v}
                    item_record.setdefault('collections', []).append(coll_data)

                data.setdefault('items', []).append(item_record)
            data['available'] = Holding.isAvailable(es_items)
            holdings.append(data)

        if holdings:
            json['holdings'] = holdings

        # MEF contribution ES index update
        contributions = create_contributions(json.get('contribution', []))
        if contributions:
            json.pop('contribution', None)
            json['contribution'] = contributions
        # TODO: compare record with those in DB to check which authors have
        # to be deleted from index
        # Index host document title in child document (part of)
        if 'partOf' in record:
            title = {'type': 'partOf'}
            for part_of in record['partOf']:
                doc_pid = extracted_data_from_ref(part_of.get('document'))
                document = Document.get_record_by_pid(doc_pid)
                for part_of_title in document.get('title', []):
                    if 'mainTitle' in part_of_title:
                        title['partOfTitle'] = part_of_title.get('mainTitle')
            json['title'].append(title)

        json['sort_title'] = title_format_text_head(json.get('title', []),
                                                    with_subtitle=True)
        # Local fields in JSON
        local_fields = LocalField.get_local_fields_by_resource(
            'doc', document_pid)
        if local_fields:
            json['local_fields'] = local_fields
        # index both ISBN-10 and ISBN-13 formats

        def filter_isbn(identified_by):
            """Filter identified_by for type bf:Isbn."""
            return identified_by.get('type') == 'bf:Isbn'

        filtered_identified_by = filter(filter_isbn,
                                        json.get('identifiedBy', []))
        isbns = set()
        for identified_by in filtered_identified_by:
            isbn = identified_by['value']
            isbns.add(isbn)
            if is_isbn10(isbn):
                isbns.add(to_isbn13(isbn))
            elif is_isbn13(isbn):
                isbns.add(to_isbn10(isbn))
        if isbns:
            json['isbn'] = list(isbns)
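The ISBN block at the end of Example #20 is a reusable pattern: index every identifier in both lengths so a query on either form matches. A minimal sketch under the same assumptions, plus a guard for 979-prefixed ISBN-13s that have no ISBN-10 form; expand_isbns() is a hypothetical name:

from isbnlib import is_isbn10, is_isbn13, to_isbn10, to_isbn13

def expand_isbns(values):
    # Collect each ISBN plus its alternate-length form, when one exists.
    isbns = set()
    for isbn in values:
        isbns.add(isbn)
        if is_isbn10(isbn):
            isbns.add(to_isbn13(isbn))
        elif is_isbn13(isbn):
            alt = to_isbn10(isbn)  # falsy when no ISBN-10 form exists
            if alt:
                isbns.add(alt)
    return sorted(isbns)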
Example #21
def lookup_by_isbn(number, forceUpdate=False):
    isbn, price = _process_isbn(number)
    print("Looking up isbn", isbn, "with price", price)

    # if length of isbn>0 and isn't "n/a" or "none"
    if len(isbn) > 0 and not re.match(r"^n(\s|/){0,1}a|none", isbn, re.I):
        # first we check our database
        titles = Title.select(Title.q.isbn == isbn)
        ##print titles #debug
        known_title = False
        the_titles = list(titles)
        if (len(the_titles) > 0) and (not forceUpdate):
            ##print "in titles"
            known_title = the_titles[0]
            ProductName = the_titles[0].booktitle.format()
            authors = []
            if len(the_titles[0].author) > 0:
                authors = [x.authorName.format() for x in the_titles[0].author]
            authors_as_string = ", ".join(authors)
            categories = []
            if len(the_titles[0].categorys) > 0:
                ##print len(the_titles[0].categorys)
                ##print the_titles[0].categorys
                categories = [x.categoryName.format() for x in the_titles[0].categorys]
            categories_as_string = ", ".join(categories)
            if price == 0:
                if len(the_titles[0].books) > 0:
                    ListPrice = max([x.listprice for x in the_titles[0].books])
                else:
                    ListPrice = 0
            else:
                ListPrice = price
            Manufacturer = the_titles[0].publisher.format()
            Format = the_titles[0].type.format()
            Kind = the_titles[0].kind.kindName
            orig_isbn = the_titles[0].origIsbn.format()
            #            if the_titles[0].images:
            #                 large_url = the_titles[0].images.largeUrl
            #                 med_url = the_titles[0].images.medUrl
            #                 small_url = the_titles[0].images.smallUrl
            #            else:
            #                 large_url = med_url = small_url = ''
            large_url = med_url = small_url = ""

            SpecialOrders = [
                tso.id
                for tso in Title.selectBy(
                    isbn=isbn
                ).throughTo.specialorder_pivots.filter(
                    TitleSpecialOrder.q.orderStatus == "ON ORDER"
                )
            ]
            return {
                "title": ProductName,
                "authors": authors,
                "authors_as_string": authors_as_string,
                "categories_as_string": categories_as_string,
                "list_price": ListPrice,
                "publisher": Manufacturer,
                "isbn": isbn,
                "orig_isbn": orig_isbn,
                "large_url": large_url,
                "med_url": med_url,
                "small_url": small_url,
                "format": Format,
                "kind": Kind,
                "known_title": known_title,
                "special_order_pivots": SpecialOrders,
            }
        else:  # we don't have it yet
            # if we're using amazon ecs
            if use_amazon_ecs:
                sleep(1)  # so amazon doesn't get huffy
                ecs.setLicenseKey(amazon_license_key)
                ecs.setSecretAccessKey(amazon_secret_key)
                ecs.setAssociateTag(amazon_associate_tag)

                ##print "about to search", isbn, isbn[0]
                amazonBooks = []

                idType = ""
                if len(isbn) == 12:
                    idType = "UPC"
                elif len(isbn) == 13:
                    # if we are using an internal isbn
                    if isbn.startswith(internal_isbn_prefix):
                        return []
                    # otherwise search on amazon.
                    elif isbn.startswith("978") or isbn.startswith("979"):
                        idType = "ISBN"
                    else:
                        idType = "EAN"
                try:
                    print("searching amazon for ", isbn, idType, file=sys.stderr)
                    amazonProds = AmzSear(isbn)
                    print(amazonProds, file=sys.stderr)
                except (ecs.InvalidParameterValue, HTTPError):
                    pass
                if amazonProds:
                    print(amazonProds, file=sys.stderr)
                    # the inner comprehension tests each product for a price whose type is in format_list;
                    # if we find a price whose key is in format_list, we return the corresponding product
                    format_list = [
                        "Paperback",
                        "Mass Market Paperback",
                        "Hardcover",
                        "Perfect Paperback",
                        "Pamphlet",
                        "Plastic Comb",
                        "Spiral-bound",
                        "Print on Demand (Paperback)",
                        "DVD",
                        "Calendar",
                        "Board book",
                        "Audio Cassette",
                        "Cards",
                        "Audio CD",
                        "Diary",
                        "DVD-ROM",
                        "Library Binding",
                        "music",
                        "Vinyl",
                        "Health and Beauty",
                        "Hardback",
                    ]
                    prods = [
                        x
                        for x in amazonProds.values()
                        if [dum for dum in x["prices"].keys() if dum in format_list]
                    ]

                    for prod1 in prods:
                        print(prod1, file=sys.stderr)
                        price_dict = prod1["prices"]
                        listprice = max(price_dict.values())

                        format = [k for k in format_list if k in price_dict]
                        format = format[0]
                        if not format:
                            continue

                        title = prod1["title"]

                        image_url = prod1["image_url"]

                        authors = [
                            x.replace("by ", "")
                            for x in prod1["subtext"]
                            if x.startswith("by ")
                        ]
                        auth_list = [
                            y.strip()
                            for a in [x.split(", ") for x in authors[0].split(" and ")]
                            for y in a
                        ]
                        # we assume any full name less than five characters is an abbreviation like 'Jr.'
                        # so we add it back to the previous authorname
                        abbrev_list = [i for i, x in enumerate(auth_list) if len(x) < 5]
                        for i in abbrev_list:
                            auth_list[i - 1 : i + 1] = [
                                ", ".join(auth_list[i - 1 : i + 1])
                            ]

                        return {
                            "title": title,
                            "authors": auth_list,
                            "authors_as_string": ",".join(auth_list),
                            "categories_as_string": "",
                            "list_price": listprice,
                            "publisher": "",
                            "isbn": isbn,
                            "orig_isbn": isbn,
                            "large_url": image_url,
                            "med_url": image_url,
                            "small_url": image_url,
                            "format": format,
                            "kind": "books",
                            "known_title": known_title,
                            "special_orders": [],
                        }

                else:
                    traceback.print_exc()
                    print("using isbnlib from ecs", file=sys.stderr)
                    isbnlibbooks = []
                    try:
                        isbnlibbooks = isbnlib.meta(str(isbn))
                    except Exception:
                        pass

                    if isbnlibbooks:
                        return {
                            "title": isbnlibbooks["Title"],
                            "authors": isbnlibbooks["Authors"],
                            "authors_as_string": ",".join(isbnlibbooks["Authors"]),
                            "categories_as_string": None,
                            "list_price": price,
                            "publisher": isbnlibbooks["Publisher"],
                            "isbn": isbn,
                            "orig_isbn": isbn,
                            "large_url": None,
                            "med_url": None,
                            "small_url": None,
                            "format": None,
                            "kind": "books",
                            "known_title": known_title,
                            "special_orders": [],
                        }
                    else:
                        return {}
            else:  # if we're scraping amazon
                print("scraping amazon", file=sys.stderr)
                headers = {
                    "User-Agent": random.sample(user_agents, 1).pop()
                }
                amazon_url_template = "http://www.amazon.com/dp/%s/"
                if len(isbn) == 13:
                    isbn10 = None
                    if isbnlib.is_isbn13(isbn):
                        isbn10 = isbnlib.to_isbn10(isbn)
                    else:
                        return {}
                if isbn10:
                    with requests.Session() as session:
                        try:
                            print("getting amazon")
                            page_response = session.get(
                                amazon_url_template % isbn10,
                                headers=headers,
                                timeout=0.1
                            )
                            print("got response")
                            page_content = BeautifulSoup(page_response.content, "lxml")
                            print("got parsed content")
                            try:
                                booktitle = page_content.select("#productTitle").pop().text
                            except Exception as e:
                                traceback.print_exc()
                                booktitle = ''
                            popover_preload = [
                                a.text
                                for a in page_content.select(
                                    ".author.notFaded .a-popover-preload a.a-link-normal"
                                )
                            ]
                            author_name = [
                                a.text
                                for a in page_content.select(
                                    ".author.notFaded a.a-link-normal"
                                )
                                if a.text not in popover_preload
                            ]
                            contributor_role = page_content.select(".contribution span")
                            try:
                                contributor_role = [
                                    re.findall("\w+", cr.text).pop()
                                    for cr in contributor_role
                                ]
                            except Exception as e:
                                traceback.print_exc()
                                contributor_role = []
                            author_role = zip(author_name, contributor_role)
                            try:
                                listprice = (
                                    page_content.select(".a-text-strike").pop().text
                                )
                            except IndexError as e:
                                print("using bookfinder4u")
                                if "listprice" not in locals():
                                    with requests.Session() as session:
                                        bookfinderurl = "http://www.bookfinder4u.com/IsbnSearch.aspx?isbn='%s'&mode=direct"
                                        url = bookfinderurl % isbn
                                        try:
                                            page_response2 = session.get(
                                                url,
                                                headers=headers,
                                                timeout=0.1
                                            )
                                            page_content2 = BeautifulSoup(
                                                page_response2.content, "lxml"
                                            )
                                        except Exception as e:
                                            traceback.print_exc()
                                            listprice = 0.0
                                        else:
                                            try:
                                                matches = re.search(
                                                    "List\sprice:\s(\w{2,4})\s(\d+(.\d+)?)",
                                                    page_content2.text,
                                                    re.I,
                                                )
                                                if matches:
                                                    listprice = matches.groups()[1]
                                                else:
                                                    listprice = 0.00
                                            except Exception as e:
                                                traceback.print_exc()
                                                listprice = 0.00
                            try:
                                book_edition = (
                                    page_content.select("#bookEdition").pop().text
                                )
                            except Exception as e:
                                traceback.print_exc()
                                book_edition = ""
                            try:
                                matches = re.findall(
                                    "(?<=imageGalleryData'\s:\s\[)\{.*?\}",
                                    page_content.contents[1].text,
                                )
                                image_url_dict = eval(matches[0])
                            except Exception as e:
                                traceback.print_exc()
                                image_url_dict = {"mainUrl": "", "thumbUrl": ""}
                            category_items = page_content.select(".zg_hrsr_ladder a")
                            category_items = [a.text for a in category_items]
                            product_details = page_content.select(
                                "#productDetailsTable"
                            )  # ul:first-of-type")
                            try:
                                product_details1 = product_details.pop().text.splitlines()
                                quit_flag = 0
                                for pd in product_details1:
                                    if pd.endswith("pages"):
                                        format, numpages = pd.split(":")
                                        numpages = numpages.replace(" pages", "").strip()
                                        quit_flag += 1
                                        continue
                                    if pd.startswith("Publisher: "):

                                        matches = re.match(
                                            r"Publisher: ([^;^(]*)\s?([^(]*)?\W(.*)\W", pd
                                        ).groups()
                                        publisher = matches[0]
                                        publication_date = matches[2]
                                        quit_flag += 1
                                        continue
                                    if quit_flag == 2:
                                        break
                                else:
                                    publisher = ''
                                    format = ''
                            except Exception as e:
                                traceback.print_exc()
                                publisher = ''
                                format = ''
                            if booktitle:
                                return {
                                    "title": booktitle,
                                    "authors": author_name,
                                    "authors_as_string": ",".join(author_name),
                                    "categories_as_string": ",".join(category_items),
                                    "list_price": listprice,
                                    "publisher": publisher,
                                    "isbn": isbn,
                                    "orig_isbn": isbn,
                                    "large_url": image_url_dict["mainUrl"],
                                    "med_url": image_url_dict["mainUrl"],
                                    "small_url": image_url_dict["thumbUrl"],
                                    "format": format,
                                    "kind": "books",
                                    "known_title": known_title,
                                    "special_orders": [],
                                }
                        except Exception as e:
                            traceback.print_exc()
                            print("using isbnlib from scraper", file=sys.stderr)
                            isbnlibbooks = []
                            try:
                                isbnlibbooks = isbnlib.meta(str(isbn))
                            except Exception:
                                pass

                            if isbnlibbooks:
                                return {
                                    "title": isbnlibbooks["Title"],
                                    "authors": isbnlibbooks["Authors"],
                                    "authors_as_string": ",".join(
                                        isbnlibbooks["Authors"]
                                    ),
                                    "categories_as_string": None,
                                    "list_price": price,
                                    "publisher": isbnlibbooks["Publisher"],
                                    "isbn": isbn,
                                    "orig_isbn": isbn,
                                    "large_url": None,
                                    "med_url": None,
                                    "small_url": None,
                                    "format": None,
                                    "kind": "books",
                                    "known_title": known_title,
                                    "special_orders": [],
                                }
                            else:
                                return {}
                else:
                    if title:
                        return {
                            "title": title,
                            "authors": author_name,
                            "authors_as_string": ",".join(author_name),
                            "categories_as_string": ",".join(category_items),
                            "list_price": listprice,
                            "publisher": publisher,
                            "isbn": isbn,
                            "orig_isbn": isbn,
                            "large_url": image_url_dict["mainUrl"],
                            "med_url": image_url_dict["mainUrl"],
                            "small_url": image_url_dict["thumbUrl"],
                            "format": format,
                            "kind": "books",
                            "known_title": known_title,
                            "special_orders": [],
                        }
                    else:
                        return {}
    else:
        return {}
Example #23
def amazon(url):
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    page = requests.get(url, headers=headers)
    while True:
        sleep(3)
        try:
            parser_page = html.fromstring(page.content)
            raw_title = parser_page.xpath('//span[@id="productTitle"]//text()')
            raw_price = parser_page.xpath(
                '//span[@class="a-size-medium a-color-price offer-price a-text-normal"]'
                '//text()')
            raw_sale = parser_page.xpath(
                '//span[@class="a-size-base a-color-secondary"]//text()')
            raw_author = parser_page.xpath(
                '//a[@class="a-link-normal contributorNameID"]//text()')
            raw_category = parser_page.xpath(
                '//a[@class="a-link-normal a-color-tertiary"]//text()')
            raw_availability = parser_page.xpath(
                '//div[@id="availability"]//text()')
            ratings = parser_page.xpath('//table[@id="histogramTable"]//tr')
            reviews = parser_page.xpath(
                '//div[contains(@id,"reviews-summary")]')

            title = ''.join(''.join(raw_title).strip()) if raw_title else None
            sale = ''.join(
                ''.join(raw_sale).split()).strip() if raw_sale else None
            category = ' > '.join([i.strip() for i in raw_category
                                   ]) if raw_category else None
            price = ''.join(raw_price).strip() if raw_price else None
            availability = ''.join(
                raw_availability).strip() if raw_availability else None
            review_author = ''.join(raw_author).strip() if raw_author else None

            title_to_isbn = str(title)
            isbn = isbnlib.isbn_from_words(title_to_isbn)
            desc = str(isbnlib.desc(isbn))
            description = ''.join(desc).strip() if desc else None
            isbn10 = isbnlib.to_isbn10(isbn)
            raw_isbn13 = isbn[:3] + '-' + isbn[3:]
            isbn_13 = ''.join(raw_isbn13).strip() if raw_isbn13 else None
            isbn_10 = ''.join(isbn10).strip() if isbn10 else None

            if not reviews:
                reviews = parser_page.xpath('//div[@data-hook="review"]')

            # Rating
            ratings_dict = {}
            for ratings in ratings:
                extracted_rating = ratings.xpath('./td//a//text()')
                if extracted_rating:
                    rating_key = extracted_rating[0]
                    raw_rating_value = extracted_rating[1]
                    rating_value = raw_rating_value
                    if rating_key:
                        ratings_dict.update({rating_key: rating_value})

            # Reviews
            reviews_list = []
            for review in reviews:
                raw_review_header = review.xpath(
                    './/a[@data-hook="review-title"]//text()')
                raw_review_author = review.xpath(
                    './/a[contains(@href,"/profile/")]/parent::span//text()')
                raw_review_rating = review.xpath(
                    './/i[@data-hook="review-star-rating"]//text()')
                raw_review_posted_date = review.xpath(
                    './/a[contains(@href,"/profile/")]'
                    '/parent::span/following-sibling::span/text()')
                raw_review_text1 = review.xpath(
                    './/div[@data-hook="review-collapsed"]//text()')
                raw_review_text2 = review.xpath(
                    './/div//span[@data-action="columnbalancing-showfullreview"]'
                    '/@data-columnbalancing-showfullreview')
                raw_review_text3 = review.xpath(
                    './/div[contains(@id,"dpReviews")]/div/text()')

                review_header = ' '.join(' '.join(raw_review_header).split())
                review_author = ''.join(
                    ''.join(raw_review_author).split()).strip('By')
                review_rating = ''.join(raw_review_rating).replace(
                    'out of 5 stars', '')
                review_posted_date = dateparser.parse(
                    ''.join(raw_review_posted_date)).strftime('%d %b %Y')
                review_text = ' '.join(' '.join(raw_review_text1).split())

                if raw_review_text2:
                    json_loaded_review_data = json.loads(raw_review_text2[0])
                    json_loaded_review_data_text = json_loaded_review_data[
                        'rest']
                    cleaned_json_loaded_review_data_text = re.sub(
                        '<.*?>', '', json_loaded_review_data_text)
                    full_review_text = review_text + cleaned_json_loaded_review_data_text
                else:
                    full_review_text = review_text
                if not raw_review_text1:
                    full_review_text = ' '.join(
                        ' '.join(raw_review_text3).split())

                review_dict = {
                    'review_header': review_header,
                    'review_author': review_author,
                    'review_rating': review_rating,
                    'review_posted_date': review_posted_date,
                    'review_text': full_review_text,
                }
                reviews_list.append(review_dict)

            if not price:
                price = sale

            if page.status_code != 200:
                raise ValueError('captha')

            data = {
                'URL': url,
                'TITLE': title,
                'AUTHOR': review_author,
                'PRICE': price,
                'SALE': sale,
                'CATEGORY': category,
                'DESCRIPTION': description,
                'ISBN-10': isbn_10,
                'ISBN-13': isbn_13,
                'AVAILABILITY': availability,
                'RATING': ratings_dict,
                'REVIEW': reviews_list,
            }

            return data
        except Exception as e:
            print(e)
            if 'NoneType' in str(e):
                return None
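Example #23 resolves a scraped title to an ISBN with isbnlib.isbn_from_words(), which performs an online lookup: it needs network access and can return an unrelated book for ambiguous titles, so results should be validated before use. A minimal hedged usage sketch (the sample query is arbitrary):

import isbnlib

# isbn_from_words() depends on the search backend and may return an
# empty or inexact result for ambiguous titles.
isbn = isbnlib.isbn_from_words('the old man and the sea hemingway')
if isbn and isbnlib.is_isbn13(isbn):
    print(isbn, isbnlib.to_isbn10(isbn))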
Example #24
def marcxml_parsing(x):

    # tree = ElementTree.parse("./raw_data/sample_1k_marc.xml")
    tree = ElementTree.parse(x)
    collection = tree.getroot()

    code_336 = pd.read_csv("./raw_data/336_code.csv")
    code_337 = pd.read_csv("./raw_data/337_code.csv")
    code_338 = pd.read_csv("./raw_data/338_code.csv")

    features = []  # list of features

    # range(len(collection))
    for i in range(len(collection)):
        row = {}
        print("---------------------  " + str(i))
        record = collection[i]

        leader = record.find('{http://www.loc.gov/MARC21/slim}leader')
        leader_6 = leader.text[6]
        leader_17 = leader.text[17]
        leader_18 = leader.text[18]
        # print(leader_type)
        row['leader_6'] = leader_6
        row['leader_17'] = leader_17
        row['leader_18'] = leader_18

        control = record.findall(
            '{http://www.loc.gov/MARC21/slim}controlfield')
        F006 = 0
        F007 = 0
        for c in control:
            tag = c.get('tag')
            # print(tag)

            if tag == '001':
                oclc_controlnum = c.text
                # print(physical_desc)
                row['F001_a'] = oclc_controlnum

            if tag == '006':
                F006 = F006 + 1

            if tag == '007':
                F007 = F007 + 1

            if tag == '008':
                value = c.text
                # print(value)
                pub_code = value[6]
                pub_year_1 = value[7:11]
                pub_year_2 = value[11:15]
                place = value[15:18]
                audience = value[22]
                cont_nature = value[24:28]
                government = value[28]
                literary = value[33]
                language = value[35:38]
                catalog_source = value[39]
                # print(place, language, catalog_source)
                row['F008_06'] = pub_code
                row['F008_0710'] = pub_year_1
                row['F008_1114'] = pub_year_2
                row['F008_1517'] = place
                row['F008_22'] = audience
                row['F008_2427_a'] = bool(re.search('a', cont_nature))
                row['F008_2427_b'] = bool(re.search('b', cont_nature))
                row['F008_2427_c'] = bool(re.search('c', cont_nature))
                row['F008_2427_d'] = bool(re.search('d', cont_nature))
                row['F008_2427_e'] = bool(re.search('e', cont_nature))
                row['F008_2427_f'] = bool(re.search('f', cont_nature))
                row['F008_2427_g'] = bool(re.search('g', cont_nature))
                row['F008_2427_i'] = bool(re.search('i', cont_nature))
                row['F008_2427_j'] = bool(re.search('j', cont_nature))
                row['F008_2427_k'] = bool(re.search('k', cont_nature))
                row['F008_2427_l'] = bool(re.search('l', cont_nature))
                row['F008_2427_m'] = bool(re.search('m', cont_nature))
                row['F008_2427_n'] = bool(re.search('n', cont_nature))
                row['F008_2427_o'] = bool(re.search('o', cont_nature))
                row['F008_2427_p'] = bool(re.search('p', cont_nature))
                row['F008_2427_q'] = bool(re.search('q', cont_nature))
                row['F008_2427_r'] = bool(re.search('r', cont_nature))
                row['F008_2427_s'] = bool(re.search('s', cont_nature))
                row['F008_2427_t'] = bool(re.search('t', cont_nature))
                row['F008_2427_u'] = bool(re.search('u', cont_nature))
                row['F008_2427_v'] = bool(re.search('v', cont_nature))
                row['F008_2427_w'] = bool(re.search('w', cont_nature))
                row['F008_2427_y'] = bool(re.search('y', cont_nature))
                row['F008_2427_z'] = bool(re.search('z', cont_nature))
                row['F008_2427_2'] = bool(re.search('2', cont_nature))
                row['F008_2427_5'] = bool(re.search('5', cont_nature))
                row['F008_2427_6'] = bool(re.search('6', cont_nature))
                row['F008_28'] = government
                row['F008_33'] = literary
                row['F008_3537'] = language
                row['F008_39'] = catalog_source

                # Mark empty positional values as NA (an empty slice of a
                # short 008 field yields '', never None).
                if not place.strip():
                    row['F008_1517'] = "NA"
                if not language.strip():
                    row['F008_3537'] = "NA"
                if not catalog_source.strip():
                    row['F008_39'] = "NA"

        row['006_is'] = 1 if F006 > 0 else 0
        row['007_is'] = 1 if F007 > 0 else 0

        data = record.findall('{http://www.loc.gov/MARC21/slim}datafield')

        F040_e = 0
        F041_is = 0
        F050_is = 0
        F082_is = 0
        F260_is = 0
        F264_is = 0
        F26x_is = 0
        F336_is = 0
        F337_is = 0
        F338_is = 0
        F490_is = 0
        F6xxa_is = 0
        F6xxv_is = 0
        F6xxy_is = 0
        F6xxz_is = 0
        isbn_list = []
        isbn_tag_list = []
        F041_a_list = []
        F041_h_list = []
        F050_a1_list = []
        F050_a2_list = []
        F082_a1_list = []
        F082_a2_list = []
        F260_b_list = []
        F260_c_list = []
        F264_b_list = []
        F264_c_list = []
        F26x_b_list = []
        F26x_c_list = []
        F336_b_list = []
        F337_b_list = []
        F338_b_list = []
        F490_a_list = []
        F6xx_a_list = []
        F6xx_v_list = []
        F6xx_y_list = []
        F6xx_z_list = []

        for d in data:
            tag = d.get('tag')
            print("---------------------  " + str(i) + "---- " + tag)

            if tag == '020':
                # print(d)
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'a':
                        isbn = s.text
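                        # mask() hyphenates the ISBN; joining its first two
                        # segments with '--' yields a group--publisher tag.
                        # ISBN-13s are converted to ISBN-10 before masking so
                        # both forms map to the same tag.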

                        if (len(isbn) == 10 and is_isbn10(isbn)
                                and mask(isbn) is not None):
                            isbn_list.append(str(isbn))
                            isbn_tag = '--'.join(mask(isbn).split("-")[0:2])
                            isbn_tag_list.append(isbn_tag)
                        elif (len(isbn) == 13 and is_isbn13(isbn)
                              and mask(isbn) is not None
                              and isbn[0:3] == "978"):
                            isbn_list.append(str(isbn))
                            isbn_tag = '--'.join(
                                mask(to_isbn10(isbn)).split("-")[0:2])
                            isbn_tag_list.append(isbn_tag)

            if tag == "040":
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'e':
                        if s.text == 'rda' or s.text == "RDA":
                            F040_e = F040_e + 1

            if tag == "041":
                F041_is = F041_is + 1
                F041_ind1 = d.get('ind1')
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'a':
                        F041_a_list.append(s.text)
                    if s.get('code') == 'h':
                        F041_h_list.append(s.text)

            if tag == '050':
                F050_is = F050_is + 1
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'a':
                        match = re.search(r'^[A-Z]{1,3}', str(s.text))
                        match2 = re.search(
                            r'^[A-Z]{1,3}[0-9]{1,}(?=\.|[A-z]|$| )',
                            str(s.text))
                        if match and match2:
                            F050_a1_list.append(match.group())
                            F050_a2_list.append(match2.group())

            if tag == '082':
                F082_is = F082_is + 1
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'a':
                        match = re.search(r'^[0-9]{3}', str(s.text))
                        if match:
                            F082_a1_list.append(match.group()[0])
                            F082_a2_list.append(match.group())

            if tag == '260':
                F260_is = F260_is + 1
                F26x_is = F26x_is + 1
                # print(d)
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'b':
                        F260_b_list.append(s.text)
                        F26x_b_list.append(s.text)
                        if re.search(
                                "printed by |distributed by |distributed in ",
                                s.text.lower()):
                            F260_is = F260_is - 1
                            F26x_is = F26x_is - 1
                    if s.get('code') == 'c':
                        F260_c_list.append(s.text)
                        text_26x = re.findall(r"\d{4}", s.text)
                        F26x_c_list.extend(text_26x)

            if tag == '264' and d.get('ind2') == '1':
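                # 264 second indicator 1 = publication (0 = production,
                # 2 = distribution, 3 = manufacture, 4 = copyright date).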
                F264_is = F264_is + 1
                F26x_is = F26x_is + 1
                # print(d)
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'b':
                        F264_b_list.append(s.text)
                        F26x_b_list.append(s.text)
                        if re.search(
                                "printed by |distributed by |distributed in ",
                                s.text.lower()):
                            F264_is = F264_is - 1
                            F26x_is = F26x_is - 1
                    if s.get('code') == 'c':
                        F264_c_list.append(s.text)
                        text_26x = re.findall(r"\d{4}", s.text)
                        F26x_c_list.extend(text_26x)

            if tag == '336':
                F336_is = F336_is + 1
                # Initialise so a record lacking $b/$2 subfields cannot
                # raise a NameError or reuse values from a previous field.
                F336_b_value = ""
                F336_2_value = ""
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                sub_code_list = [t.get("code") for t in subfields]
                if "b" in sub_code_list:
                    for s in subfields:
                        if s.get('code') == 'b':
                            F336_b_value = s.text
                        if s.get('code') == '2':
                            F336_2_value = s.text
                elif "a" in sub_code_list:
                    for s in subfields:
                        if s.get('code') == 'a' and s.text in code_336[
                                '336_a'].values:
                            F336_b_value = code_336.loc[
                                code_336['336_a'] == s.text,
                                '336_b'].values[0]
                        if s.get('code') == '2':
                            F336_2_value = s.text
                if F336_b_value and "rda" in F336_2_value.lower():
                    F336_b_list.append(F336_b_value)

            if tag == '337':
                F337_is = F337_is + 1
                # Initialise so a record lacking $b/$2 subfields cannot
                # raise a NameError or reuse values from a previous field.
                F337_b_value = ""
                F337_2_value = ""
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                sub_code_list = [t.get("code") for t in subfields]
                if "b" in sub_code_list:
                    for s in subfields:
                        if s.get('code') == 'b':
                            F337_b_value = s.text
                        if s.get('code') == '2':
                            F337_2_value = s.text
                elif "a" in sub_code_list:
                    for s in subfields:
                        if s.get('code') == 'a' and s.text in code_337[
                                '337_a'].values:
                            F337_b_value = code_337.loc[
                                code_337['337_a'] == s.text,
                                '337_b'].values[0]
                        if s.get('code') == '2':
                            F337_2_value = s.text
                if F337_b_value and "rda" in F337_2_value.lower():
                    F337_b_list.append(F337_b_value)

            if tag == '338':
                F338_is = F338_is + 1
                # Initialise so a record lacking $b/$2 subfields cannot
                # raise a NameError or reuse values from a previous field.
                F338_b_value = ""
                F338_2_value = ""
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                sub_code_list = [t.get("code") for t in subfields]
                if "b" in sub_code_list:
                    for s in subfields:
                        if s.get('code') == 'b':
                            F338_b_value = s.text
                        if s.get('code') == '2':
                            F338_2_value = s.text
                elif "a" in sub_code_list:
                    for s in subfields:
                        if s.get('code') == 'a' and s.text in code_338[
                                '338_a'].values:
                            F338_b_value = code_338.loc[
                                code_338['338_a'] == s.text,
                                '338_b'].values[0]
                        if s.get('code') == '2':
                            F338_2_value = s.text
                if F338_b_value and "rda" in F338_2_value.lower():
                    F338_b_list.append(F338_b_value)

            if tag == '490':
                F490_is = F490_is + 1
                # print(d)
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'a':
                        F490_a_list.append(s.text)

            if tag in ['600', '610', '611', '630', '650'
                       ] and d.get('ind2') == "0":
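                # Second indicator 0 restricts these subject fields to
                # Library of Congress Subject Headings.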
                # print(d)
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'a':
                        F6xxa_is = F6xxa_is + 1
                        F6xx_a_list.append(clean_text(s.text))
                    if s.get('code') == 'v':
                        F6xxv_is = F6xxv_is + 1
                        F6xx_v_list.append(clean_text(s.text))
                    if s.get('code') == 'y':
                        F6xxy_is = F6xxy_is + 1
                        F6xx_y_list.append(clean_text(s.text))
                    if s.get('code') == 'z':
                        F6xxz_is = F6xxz_is + 1
                        F6xx_z_list.append(clean_text(s.text))

            if tag == "651" and d.get('ind2') == "0":
                # print(d)
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'a':
                        F6xxz_is = F6xxz_is + 1
                        F6xx_z_list.append(clean_text(s.text))
                    if s.get('code') == 'v':
                        F6xxv_is = F6xxv_is + 1
                        F6xx_v_list.append(clean_text(s.text))
                    if s.get('code') == 'y':
                        F6xxy_is = F6xxy_is + 1
                        F6xx_y_list.append(clean_text(s.text))
                    if s.get('code') == 'z':
                        F6xxz_is = F6xxz_is + 1
                        F6xx_z_list.append(clean_text(s.text))

            if tag == "655" and d.get('ind2') == "0":
                # print(d)
                subfields = d.findall(
                    '{http://www.loc.gov/MARC21/slim}subfield')
                for s in subfields:
                    if s.get('code') == 'a':
                        F6xxv_is = F6xxv_is + 1
                        F6xx_v_list.append(clean_text(s.text))
                    if s.get('code') == 'v':
                        F6xxv_is = F6xxv_is + 1
                        F6xx_v_list.append(clean_text(s.text))
                    if s.get('code') == 'y':
                        F6xxy_is = F6xxy_is + 1
                        F6xx_y_list.append(clean_text(s.text))
                    if s.get('code') == 'z':
                        F6xxz_is = F6xxz_is + 1
                        F6xx_z_list.append(clean_text(s.text))


        isbn_set = set(isbn_list)
        isbn_tag_set = set(isbn_tag_list)
        if isbn_tag_list:
            row['isbn'] = " ;; ".join(isbn_set)
            row['isbn_tag'] = " ;; ".join(isbn_tag_set)
            row['isbn1'] = isbn_list[0]
            row['isbn_tag1'] = isbn_tag_list[0]
        else:
            row['isbn'] = "NA"
            row['isbn_tag'] = "NA"
            row['isbn1'] = "NA"
            row['isbn_tag1'] = "NA"

        if F040_e > 0:
            row['F040_e'] = 1
        else:
            row['F040_e'] = 0

        if F041_is > 0:
            row['F041_ind1'] = F041_ind1
            row['F041_a'] = " ;; ".join(F041_a_list)
            row['F041_h'] = " ;; ".join(F041_h_list)
        else:
            row['F041_ind1'] = "NA"
            row['F041_a'] = "NA"
            row['F041_h'] = "NA"

        if len(F050_a1_list) > 0:
            row['F050_a1'] = " ;; ".join(set(F050_a1_list))
            row['F050_a2'] = " ;; ".join(set(F050_a2_list))
        else:
            row['F050_a1'] = "NA"
            row['F050_a2'] = "NA"

        if len(F082_a1_list) > 0:
            row['F082_a1'] = " ;; ".join(set(F082_a1_list))
            row['F082_a2'] = " ;; ".join(set(F082_a2_list))
        else:
            row['F082_a1'] = "NA"
            row['F082_a2'] = "NA"

        row['F260_is'] = F260_is
        if F260_is > 0:
            row['F260_b'] = " ;; ".join(F260_b_list)
            row['F260_c'] = " ;; ".join(F260_c_list)
        else:
            row['F260_b'] = "NA"
            row['F260_c'] = "NA"

        row['F264_is'] = F264_is
        if F264_is > 0:
            row['F264_b'] = " ;; ".join(F264_b_list[0:0 + F26x_is])
            row['F264_c'] = " ;; ".join(F264_c_list)
        else:
            row['F264_b'] = "NA"
            row['F264_c'] = "NA"

        row['F26x_is'] = F26x_is
        if F26x_is > 0:
            row['F26x_b'] = " ;; ".join(set(F26x_b_list[0:0 + F26x_is]))
            row['F26x_c'] = " ;; ".join(set(F26x_c_list))
        else:
            row['F26x_b'] = "NA"
            row['F26x_c'] = "NA"

        if F336_is > 0:
            F336_b_text = F336_b_list
            row['F336_b'] = " ;; ".join(F336_b_text)
            row['F336_b_txt'] = bool(re.search('txt', row['F336_b']))
            row['F336_b_sti'] = bool(re.search('sti', row['F336_b']))
            row['F336_b_cri'] = bool(re.search('cri', row['F336_b']))
            row['F336_b_spw'] = bool(re.search('spw', row['F336_b']))
            row['F336_b_tct'] = bool(re.search('tct', row['F336_b']))
        else:
            row['F336_b'] = "NA"
            row['F336_b_txt'] = ""
            row['F336_b_sti'] = ""
            row['F336_b_cri'] = ""
            row['F336_b_spw'] = ""
            row['F336_b_tct'] = ""

        if F337_is > 0:
            F337_b_text = F337_b_list
            row['F337_b'] = " ;; ".join(F337_b_text)
            row['F337_b_c'] = bool(re.search('c', row['F337_b']))
            row['F337_b_h'] = bool(re.search('h', row['F337_b']))
            row['F337_b_n'] = bool(re.search('n', row['F337_b']))
            row['F337_b_s'] = bool(re.search('s', row['F337_b']))
        else:
            row['F337_b'] = "NA"
            row['F337_b_c'] = ""
            row['F337_b_h'] = ""
            row['F337_b_n'] = ""
            row['F337_b_s'] = ""

        if F338_is > 0:
            F338_b_text = F338_b_list
            row['F338_b'] = " ;; ".join(F338_b_text)
            row['F338_b_cd'] = bool(re.search('cd', row['F338_b']))
            row['F338_b_cr'] = bool(re.search('cr', row['F338_b']))
            row['F338_b_hd'] = bool(re.search('hd', row['F338_b']))
            row['F338_b_he'] = bool(re.search('he', row['F338_b']))
            row['F338_b_nb'] = bool(re.search('nb', row['F338_b']))
            row['F338_b_sd'] = bool(re.search('sd', row['F338_b']))
        else:
            row['F338_b'] = "NA"
            row['F338_b_cd'] = ""
            row['F338_b_cr'] = ""
            row['F338_b_hd'] = ""
            row['F338_b_he'] = ""
            row['F338_b_nb'] = ""
            row['F338_b_sd'] = ""

        if F490_is > 0:
            row['F490_a'] = " ;; ".join(F490_a_list)
        else:
            row['F490_a'] = "NA"

        if F6xxa_is > 0:
            row['F6xx_a'] = " ;; ".join(set(F6xx_a_list))
        else:
            row['F6xx_a'] = "NA"

        if F6xxv_is > 0:
            row['F6xx_v'] = " ;; ".join(set(F6xx_v_list))
        else:
            row['F6xx_v'] = "NA"

        if F6xxy_is > 0:
            row['F6xx_y'] = " ;; ".join(set(F6xx_y_list))
        else:
            row['F6xx_y'] = "NA"

        if F6xxz_is > 0:
            row['F6xx_z'] = " ;; ".join(set(F6xx_z_list))
        else:
            row['F6xx_z'] = "NA"

        features.append(row)

    df = pd.DataFrame(features)
    return df
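
A minimal usage sketch, assuming the ./raw_data/ code-table CSVs exist and
clean_text is defined as in the source module: parse one MARCXML file into a
feature table and persist it.

    df = marcxml_parsing("./raw_data/sample_1k_marc.xml")
    df.to_csv("./raw_data/marc_features.csv", index=False)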
Example #25
0
    def update_cover(self):
        # Assumes module-level imports of requests and isbnlib, plus
        # django.core.files.File and tempfile.NamedTemporaryFile, and that
        # self.cover is a Django ImageField.
        image = requests.get(
            'http://images.amazon.com/images/P/%s.01._SS500_SCLZZZZZZZ_.jpg'
            % (isbnlib.to_isbn10(str(self.isbn)),))

        # Amazon serves a tiny placeholder image when no cover exists,
        # hence the minimum-content-length check.
        if image.status_code == 200 and len(image.content) > 50:
            img_temp = NamedTemporaryFile(delete=True)
            img_temp.write(image.content)
            img_temp.flush()

            self.cover.save('%s.jpg' % (self.isbn,), File(img_temp))

        else:
            self.cover.delete()
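
A hedged usage sketch; Book here is a hypothetical Django model exposing the
isbn and cover fields the method above relies on.

    book = Book.objects.get(isbn='9780132350884')
    book.update_cover()  # fetches the Amazon cover, or clears the field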
Example #26
0
def enrich_document_data(sender, json=None, record=None, index=None,
                         doc_type=None, arguments=None, **dummy_kwargs):
    """Signal sent before a record is indexed.

    :param json: The dumped record dictionary which can be modified.
    :param record: The record being indexed.
    :param index: The index in which the record will be indexed.
    :param doc_type: The doc_type for the record.
    """
    if index.split('-')[0] == DocumentsSearch.Meta.index:
        # HOLDINGS
        holdings = []
        document_pid = record['pid']
        es_holdings = HoldingsSearch()\
            .filter('term', document__pid=document_pid)\
            .scan()
        for holding in es_holdings:
            holding = holding.to_dict()
            hold_data = {
                'pid': holding['pid'],
                'location': {
                    'pid': holding['location']['pid'],
                },
                'circulation_category': [{
                    'pid': holding['circulation_category']['pid']
                }],
                'organisation': {
                    'organisation_pid': holding['organisation']['pid'],
                    'library_pid': holding['library']['pid']
                }
            }
            # Index additional holdings fields into the document record
            holdings_fields = [
                'call_number', 'second_call_number', 'index',
                'enumerationAndChronology', 'supplementaryContent',
                'local_fields'
            ]
            for field in holdings_fields:
                if field in holding:
                    hold_data[field] = holding.get(field)
            # Index holdings notes
            notes = [n['content'] for n in holding.get('notes', []) if n]
            if notes:
                hold_data['notes'] = notes

            # Index items attached to each holdings record
            es_items = ItemsSearch()\
                .filter('term', holding__pid=holding['pid'])\
                .scan()
            for item in es_items:
                item = item.to_dict()
                item_data = {
                    'pid': item['pid'],
                    'barcode': item['barcode'],
                    'status': item['status'],
                    'local_fields': item.get('local_fields'),
                    'call_number': item.get('call_number'),
                    'second_call_number': item.get('second_call_number'),
                    'temporary_item_type': item.get('temporary_item_type')
                }

                if 'temporary_item_type' in item:
                    hold_data['circulation_category'].append(
                        {'pid': item['temporary_item_type']['pid']})

                item_data = {k: v for k, v in item_data.items() if v}

                # item acquisition part.
                #   We need to store the acquisition data of the items into the
                #   document. As we need to link acquisition date and
                #   org/lib/loc, we need to store theses data together in a
                #   'nested' structure.
                acq_date = item.get('acquisition_date')
                if acq_date:
                    item_data['acquisition'] = {
                        'organisation_pid': holding['organisation']['pid'],
                        'library_pid': holding['library']['pid'],
                        'location_pid': holding['location']['pid'],
                        'date': acq_date
                    }
                # item notes content.
                #   index the content of the public notes into the document.
                public_notes_content = [
                    n['content']
                    for n in item.get('notes', [])
                    if n['type'] in ItemNoteTypes.PUBLIC
                ]
                if public_notes_content:
                    item_data['notes'] = public_notes_content
                hold_data.setdefault('items', []).append(item_data)
            holdings.append(hold_data)

        if holdings:
            json['holdings'] = holdings

        # MEF contribution ES index update
        contributions = create_contributions(json.get('contribution', []))
        if contributions:
            json.pop('contribution', None)
            json['contribution'] = contributions
        # TODO: compare record with those in DB to check which authors have
        # to be deleted from index
        # Index host document title in child document (part of)
        if 'partOf' in record:
            title = {'type': 'partOf'}
            for part_of in record['partOf']:
                doc_pid = extracted_data_from_ref(
                    part_of.get('document')
                )
                document = Document.get_record_by_pid(doc_pid)
                for part_of_title in document.get('title', []):
                    if 'mainTitle' in part_of_title:
                        title['partOfTitle'] = part_of_title.get(
                            'mainTitle'
                        )
            json['title'].append(title)

        # sort title
        sort_title = title_format_text_head(
            json.get('title', []),
            with_subtitle=True
        )
        language = language_mapping(json.get('language')[0].get('value'))
        if current_app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False):
            sort_title = current_app.\
                extensions['reroils-normalizer-stop-words'].\
                normalize(sort_title, language)
        json['sort_title'] = sort_title
        # Local fields in JSON
        local_fields = LocalField.get_local_fields_by_resource(
            'doc', document_pid)
        if local_fields:
            json['local_fields'] = local_fields

        # index both ISBN 10 and 13 format
        def filter_isbn(identified_by):
            """Filter identified_by for type bf:Isbn."""
            return identified_by.get('type') == 'bf:Isbn'

        filtered_identified_by = filter(
            filter_isbn,
            json.get('identifiedBy', [])
        )
        isbns = set()
        for identified_by in filtered_identified_by:
            isbn = identified_by['value']
            isbns.add(isbn)
            if is_isbn10(isbn):
                isbns.add(to_isbn13(isbn))
            elif is_isbn13(isbn):
                isbns.add(to_isbn10(isbn))
        if isbns:
            json['isbn'] = list(isbns)

        # Populate sort date new and old for use in sorting
        pub_provisions = [
            p for p in record.get('provisionActivity', [])
            if p['type'] == 'bf:Publication'
        ]
        pub_provision = next(iter(pub_provisions), None)
        if pub_provision:
            json['sort_date_new'] = \
                pub_provision.get('endDate', pub_provision.get('startDate'))
            json['sort_date_old'] = pub_provision.get('startDate')
Example #27
0
def com_isbn_13_to_10(isbn_string):
    """Convert an ISBN-13 string to its ISBN-10 form via isbnlib."""
    return isbnlib.to_isbn10(isbn_string)
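
For example, a quick round trip with isbnlib (which also exposes the inverse
to_isbn13):

    com_isbn_13_to_10('9780306406157')   # -> '0306406152'
    isbnlib.to_isbn13('0306406152')      # -> '9780306406157'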