Beispiel #1
0
    async def check(self, entry):
        """Report an ISBN that is not in the configured length (10 or 13).

        The wanted length is read from the ``isbn_length`` option (default 13).
        A falsy option value, a missing/empty ``isbn`` field, or an ISBN that
        does not validate all yield an empty result.

        Returns:
            list: Empty when there is nothing to report, otherwise a single
            ``(NAME, problem, suggestion)`` tuple.

        Raises:
            ConfigurationError: If the option is set to something other than
                10 or 13 (only checked once a valid ISBN has been found).
        """
        wanted = self._cfg.get('isbn_length', entry, 13)
        if not wanted:
            return []

        raw = entry.data.get('isbn')
        if not raw:
            return []

        digits = clean(raw)
        if not digits or notisbn(digits):
            return []

        if wanted == 10:
            if is_isbn10(digits):
                return []
            problem = "ISBN '{}' is not of length 10.".format(raw)
            hint = "ISBN-10 would be '{}'".format(to_isbn10(digits))
        elif wanted == 13:
            if is_isbn13(digits):
                return []
            problem = "ISBN '{}' is not of length 13.".format(raw)
            hint = "ISBN-13 would be '{}'".format(to_isbn13(digits))
        else:
            raise ConfigurationError(
                "The option 'isbn_length' must be either of 10 or 13.")

        return [(type(self).NAME, problem, hint)]
Beispiel #2
0
	def isbn(self,isbn):
		"""Look up metadata for an ISBN and add it as a spreadsheet row.

		The ISBN is cleaned and validated, then looked up in Trove. If Trove
		has no record, the other ISBN form (ISBN-10 <-> ISBN-13) is tried, and
		finally the generic metadata services are queried.

		Returns:
			The string "not valid isbn" for an invalid ISBN, a
			"no metadata found ..." string when no source knows the book,
			otherwise whatever ``__add_and_render`` returns for the new row.
		"""
		# check if valid
		clean_isbn = isbnlib.clean(isbn)
		if isbnlib.notisbn(clean_isbn):
			return "not valid isbn"

		# TODO: should check if has been collected before

		# first check trove
		canonical = self.trove.extract(clean_isbn)
		if not canonical:
			# try alternative isbn form
			print("trying alternative form ")
			if isbnlib.is_isbn13(clean_isbn):
				alt_isbn = isbnlib.to_isbn10(clean_isbn)
			else:
				alt_isbn = isbnlib.to_isbn13(clean_isbn)
			# The conversion yields an empty value when no alternative form
			# exists (e.g. a 979-prefixed ISBN-13 has no ISBN-10), so only
			# query Trove when there is something to look up.
			if alt_isbn:
				canonical = self.trove.extract(alt_isbn)
				if canonical:
					clean_isbn = alt_isbn
		if not canonical:
			canonical = self.__reduce_metadata(clean_isbn, ['merge', 'isbndb', 'openl'])
			if not canonical:
				return "no metadata found for isbn: " + clean_isbn
			canonical['source'] = 'isbnlib'
			# Authors is joined into one comma-separated string for the row.
			canonical["Authors"] = u', '.join(canonical["Authors"])
			canonical['link'] = None

		row_data = ['isbn:' + clean_isbn, canonical["Title"], canonical["Authors"], canonical["Year"], canonical["Publisher"], canonical['link']]
		return self.__add_and_render(row_data)
def main():
    """Compare every GCIS book record against ISBNdb and log the outcome.

    For each entry of the module-level ``GCISPAR`` listing the book's JSON is
    fetched, its ISBN is normalised to a validated ISBN-13 and ISBNdb is
    queried for it. ISBNdb error messages and per-book failures are written to
    the module-level ``file``; the GCIS and ISBNdb data go to ``file2``.

    Command line:
        -apikey / --isbndbkey: ISBNdb API key (a built-in key is used when
        the option is omitted).
    """
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-apikey', '--isbndbkey', help="Insert ISBNDB apikey")
    args = parser.parse_args()

    if args.isbndbkey:
        print(args.isbndbkey)
    else:
        print('NO MANUAL API KEY')

    # Fall back to the built-in key only when no key was passed at all.
    apikey = args.isbndbkey
    if apikey is None:
        apikey = 'XOATAY1G'

    for record in GCISPAR:
        # Reset per book so the error report never uses unbound or stale names.
        TITLE = ISBNS = IDEN = None
        try:
            # Extracts book identifier from GCIS
            IDEN = record["identifier"]
            match = re.search(r'.*/(.*?)\..*?$', GCIS)
            if match:
                FILETYPE = match.groups()[0]
            # HREF = url that leads to book.json in GCIS-DEV
            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
                FILETYPE, IDEN)
            # Fetched once; the same document is used for the fields and the log.
            M = parse(HREF)
            # Extracts book title and isbn from GCIS-DEV
            d = dict(M)
            TITLE = d['title']
            ISBNS = d['isbn']
            # Cleans ISBNS to only contain valid characters
            CISBN = clean(ISBNS)
            # V13 = validated canonical ISBN-13; EAN13 signals failure with an
            # empty value (None or ''), in which case the canonical form is used.
            V13 = EAN13(CISBN)
            if not V13:
                V13 = canonical(CISBN)
            data = 'http://isbndb.com/api/v2/json/{}/book/{}'.format(
                apikey, V13)
            v = parse(data)
            GCISDATA = "GCIS-DEV\n\n\t{}\n\n\tisbn_original:{}\n\n\tisbn_mod:{}\n\n".format(
                M, ISBNS, V13)
            APIDATA = "ISBNDB\n\n\t{}\n\n------------\n\n".format(v)
            print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS,
                  '\n\n\t', "isbn_mod:", V13, "\n\n")
            print("ISBNDB\n\n\t", v, '\n\n')
            # A response without an 'error' key is a success, not a failure:
            # use .get() so such responses still reach file2 below.
            error = v.get('error')
            if error:
                file.write(error + "\n")

            # Writing metadata onto file2
            file2.write(GCISDATA)
            file2.write(APIDATA)

        except Exception:
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(
                TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
Beispiel #4
0
    async def check(self, entry):
        """Report an ISBN that is not written in the configured format.

        The wanted format is read from the ``isbn_format`` option and must be
        ``'canonical'`` or ``'masked'``. A falsy option value, a missing/empty
        ``isbn`` field, or an ISBN that does not validate yield no findings.

        Returns:
            list: Empty when there is nothing to report, otherwise a single
            ``(NAME, problem, suggestion)`` tuple.

        Raises:
            ConfigurationError: If the option holds any other value (only
                checked once a valid ISBN has been found).
        """
        fmt = self._cfg.get('isbn_format', entry)
        if not fmt:
            return []

        isbn = entry.data.get('isbn')
        if not isbn:
            return []

        clean_isbn = clean(isbn)
        if not clean_isbn or notisbn(clean_isbn):
            return []

        if fmt not in ('canonical', 'masked'):
            # Adjacent literals instead of a backslash continuation, which
            # leaked the source indentation into the message text.
            raise ConfigurationError(
                "The option 'isbn_format' must be either of "
                "'canonical' or 'masked'.")

        if fmt == 'canonical':
            cisbn = canonical(clean_isbn)
            if cisbn != isbn:
                return [(type(self).NAME,
                         "ISBN '{}' is not in canonical format.".format(isbn),
                         "Canonical format would be '{}'".format(cisbn))]
        elif fmt == 'masked':
            misbn = mask(clean_isbn)
            if misbn != isbn:
                return [(type(self).NAME,
                         "ISBN '{}' is not in masked format.".format(isbn),
                         "Masked format would be '{}'".format(misbn))]

        return []
Beispiel #5
0
def validate_book_data(book_data):
    """Validate the required fields of a book payload.

    Args:
        book_data (dict): Mapping queried with ``.get`` for the keys 'isbn',
            'title' and 'authors'.

    Raises:
        InvalidRequest: If any of 'isbn', 'title' or 'authors' is missing or
            falsy.
        InvalidRequest: If cleaning the isbn and converting it to ISBN-13 with
            isbnlib produces a falsy result.

    NOTE(review): the body ends after the ISBN check -- nothing is returned
    and no duplicate-book lookup is performed here; confirm whether more
    validation was intended.
    """
    required = [book_data.get(field) for field in ("isbn", "title", "authors")]

    # Ensure request is valid format
    if not all(required):
        raise InvalidRequest(
            "Request should be of the form {{isbn: 'isbn', title: 'title', authors: [author1, author2,]}}"
        )

    # Check if isbn is valid
    isbn = isbnlib.to_isbn13(isbnlib.clean(required[0]))
    if not isbn:
        raise InvalidRequest(
            "The isbn provided is not valid or could not be converted into isbn-13 format"
        )
Beispiel #6
0
def get_title(isbn_numbers):
    """Return the title of the first ISBN for which a title can be found.

    Args:
        isbn_numbers: Iterable of ISBN-like strings.

    Returns:
        The 'Title' from the isbnlib metadata of the first valid ISBN that
        has one, or None when no entry yields a title.
    """
    for isbn in isbn_numbers:
        cleaned_isbn = isbnlib.clean(isbn)
        if not (isbnlib.is_isbn13(cleaned_isbn)
                or isbnlib.is_isbn10(cleaned_isbn)):
            continue
        book = isbnlib.meta(cleaned_isbn)
        # The lookup may come back empty or without a title; try the next
        # ISBN instead of failing on the missing key.
        if book and book.get('Title'):
            return book['Title']
    return None
def main():
    """Compare every GCIS book record against ISBNdb and log the outcome.

    For each entry of the module-level ``GCISPAR`` listing the book's JSON is
    fetched, its ISBN is normalised to a validated ISBN-13 and ISBNdb is
    queried for it. ISBNdb error messages and per-book failures are written to
    the module-level ``file``; the GCIS and ISBNdb data go to ``file2``.

    Command line:
        -apikey / --isbndbkey: ISBNdb API key (a built-in key is used when
        the option is omitted).
    """
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-apikey', '--isbndbkey', help = "Insert ISBNDB apikey")
    args = parser.parse_args()

    if args.isbndbkey:
        print(args.isbndbkey)
    else:
        print('NO MANUAL API KEY')

    # Fall back to the built-in key only when no key was passed at all.
    apikey = args.isbndbkey
    if apikey is None:
        apikey = 'XOATAY1G'

    for record in GCISPAR:
        # Reset per book so the error report never uses unbound or stale names.
        TITLE = ISBNS = IDEN = None
        try:
            # Extracts book identifier from GCIS
            IDEN = record["identifier"]
            match = re.search(r'.*/(.*?)\..*?$', GCIS)
            if match:
                FILETYPE = match.groups()[0]
            # HREF = url that leads to book.json in GCIS-DEV
            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json' .format(FILETYPE, IDEN)
            # Fetched once; the same document is used for the fields and the log.
            M = parse(HREF)
            # Extracts book title and isbn from GCIS-DEV
            d = dict(M)
            TITLE = d['title']
            ISBNS = d['isbn']
            # Cleans ISBNS to only contain valid characters
            CISBN = clean(ISBNS)
            # V13 = validated canonical ISBN-13; EAN13 signals failure with an
            # empty value (None or ''), in which case the canonical form is used.
            V13 = EAN13(CISBN)
            if not V13:
                V13 = canonical(CISBN)
            data = 'http://isbndb.com/api/v2/json/{}/book/{}'.format(apikey, V13)
            v = parse(data)
            GCISDATA = "GCIS-DEV\n\n\t{}\n\n\tisbn_original:{}\n\n\tisbn_mod:{}\n\n" .format(M, ISBNS, V13)
            APIDATA = "ISBNDB\n\n\t{}\n\n------------\n\n" .format(v)
            print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS, '\n\n\t', "isbn_mod:", V13, "\n\n")
            print("ISBNDB\n\n\t", v, '\n\n')
            # A response without an 'error' key is a success, not a failure:
            # use .get() so such responses still reach file2 below.
            error = v.get('error')
            if error:
                file.write(error + "\n")
            # Writing metadata onto file2
            file2.write(GCISDATA)
            file2.write(APIDATA)

        except Exception:
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
Beispiel #8
0
def isISBN(input_str):
    """Tell whether *input_str* is an ISBN-10 or ISBN-13.

    The string is tested as given first; if that fails it is passed through
    ``isbnlib.clean`` and tested again.
    """
    if isbnlib.is_isbn10(input_str) or isbnlib.is_isbn13(input_str):
        return True

    cleaned = isbnlib.clean(input_str)
    if isbnlib.is_isbn10(cleaned) or isbnlib.is_isbn13(cleaned):
        return True
    return False
def update_catalog():
    """Fill 'updatedlibrarycatalog.csv' with metadata looked up per ISBN.

    Every value of the 'ISBN' column is looked up with ``meta`` using the
    services 'loc', 'goob' and 'openl' in turn; if all of them fail, the
    lookup is repeated with the value of the row's 'ISBN_10' column. On
    success the Title, Language, Publisher, Year and first author are written
    into the catalog, followed by a best-effort description and cover
    thumbnail. The catalog is saved to CSV and pushed via
    ``ftp.push_file_to_server()`` whenever at least 60000 ms have passed
    since the previous save. Totals are printed at the end.

    NOTE(review): there is no save after the loop finishes, so changes made
    since the last periodic save are not written -- confirm this is intended.
    """
    file = 'updatedlibrarycatalog.csv'
    catalog = pd.read_csv(file)
    regFormat = bibformatters['default']
    provider = 'loc'
    prevMillis = int(round(time.time() * 1000))

    print(catalog.shape)
    print()

    # n: successful lookups, t: rows processed so far, f: failed rows
    # (see the summary printed after the loop).
    n = 0
    t = 0
    f = 0

    for isbnNumber in catalog['ISBN']:
        try:
            finalISBN = str(int(isbnNumber))
            cleanISBN = isbn.clean(finalISBN)
            FINAL_ISBN = cleanISBN
            # First attempt: the 'ISBN' column value against each service in
            # turn; regFormat() is called only to provoke an error on unusable
            # metadata (its result is discarded here).
            # NOTE(review): `provider` is not reset to 'loc' at the start of an
            # iteration, so after a fallback the first lookup of later rows
            # uses whichever service was assigned last -- confirm intent.
            try:
                try:
                    isbnInfo = meta(cleanISBN, service=provider)
                    regFormat(isbnInfo)
                except:
                    print('Library of Congress Service Does Not Work')
                    try:
                        provider = 'goob'
                        isbnInfo = meta(cleanISBN, service=provider)
                        regFormat(isbnInfo)
                    except:
                        print('Google Service Does Not Work')
                        try:
                            provider = 'openl'
                            isbnInfo = meta(cleanISBN, service=provider)
                            regFormat(isbnInfo)
                        except:
                            print('OpenL Service Does Not Work')
                            raise ValueError()
            except:
                # Second attempt: same service chain, using the 'ISBN_10'
                # value of the row(s) whose 'ISBN' equals the current one.
                print("ISBN13 Num Does Not Work")
                tempValue = catalog.loc[catalog['ISBN'] ==
                                        isbnNumber]['ISBN_10'].values
                isbn10Num = tempValue[0]
                finalISBN10 = str(int(float(isbn10Num)))
                cleanISBN10 = isbn.clean(finalISBN10)
                FINAL_ISBN = cleanISBN10
                try:
                    provider = 'loc'
                    isbnInfo = meta(cleanISBN10, service=provider)
                    regFormat(isbnInfo)
                except:
                    print('Library of Congress Service Does Not Work')
                    try:
                        provider = 'goob'
                        isbnInfo = meta(cleanISBN10, service=provider)
                        regFormat(isbnInfo)
                    except:
                        print('Google Service Does Not Work')
                        try:
                            isbnInfo = meta(cleanISBN10, service='openl')
                            regFormat(isbnInfo)
                        except:
                            # Last resort: dump the isbnsearch.org result page
                            # (queried with the 'ISBN' column value, not the
                            # ISBN_10 one) and give up on this row.
                            print('OpenL Service Does Not Work')
                            websiteURL = "https://isbnsearch.org/search?s=" + cleanISBN
                            website = urllib.request.urlopen(websiteURL)
                            mybytes = website.read()
                            dataFromWebsite = mybytes.decode("utf8")
                            website.close()
                            print(dataFromWebsite)
                            raise ValueError()

            print(regFormat(isbnInfo))

            n = n + 1

            # NOTE(review): rows are addressed by the running counter `t` used
            # as a label; this presumably relies on the CSV having a default
            # 0..N-1 index -- verify.
            catalog.loc[t, 'ISBN'] = isbnNumber

            bookTitle = isbnInfo['Title']
            catalog.loc[t, 'Title'] = bookTitle

            bookLanguage = isbnInfo['Language']
            catalog.loc[t, 'Language'] = bookLanguage

            bookPublisher = isbnInfo['Publisher']
            catalog.loc[t, 'Publisher'] = bookPublisher

            bookYear = isbnInfo['Year']
            catalog.loc[t, 'Year'] = bookYear

            # Only the first listed author is stored.
            bookAuthor = isbnInfo['Authors'][0]
            catalog.loc[t, 'Authors'] = bookAuthor

            # Description and cover are optional extras: failures are reported
            # but do not mark the row as failed.
            try:
                bookDesc = isbn.desc(FINAL_ISBN)
                catalog.loc[t, 'Description'] = bookDesc
            except:
                print('Could not extract book description')

            try:
                bookCover = isbn.cover(FINAL_ISBN)
                catalog.loc[t, 'Cover'] = bookCover['thumbnail']
            except:
                print('Could not extract book cover link')

        except:
            # Any failure above (conversion, both lookup attempts, missing
            # metadata keys) lands here and counts the row as unsuccessful.
            print("This ISBN Number is NOT Valid")
            f = f + 1
        t = t + 1
        # Periodic checkpoint: save and upload once at least 60 s have elapsed.
        currentMillis = int(round(time.time() * 1000))
        if currentMillis - prevMillis >= 60000:
            print("Saving File to Local and Remove Server")
            catalog.to_csv('updatedlibrarycatalog.csv')
            ftp.push_file_to_server()
            prevMillis = int(round(time.time() * 1000))
            print("File has Successfully Saved")
        print("\n")

    print('Total Number of Books: ' + str(t))
    print('Successful Books: ' + str(n))
    print('Unsuccessful Books: ' + str(f))
Beispiel #10
0
def main():
    """Search DBpedia for every GCIS book using several ISBN spellings.

    The GCIS listing (``--GCIS`` url, or the default book listing) is fetched,
    each book's ISBN is normalised, and a series of ISBN formats is
    substituted into the module-level ``QUERY`` template and run through
    ``RQUERY`` until one returns a non-empty result, which is printed.
    Per-book failures are printed and written to the module-level ``file``.
    """
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-path',
        '--GCIS',
        help=
        "Insert url path to GCIS book in JSON format [ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1'] "
    )
    args = parser.parse_args()
    GCIS = args.GCIS

    if GCIS is None:
        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
        print(
            'NO MANUAL GCIS PATH\n ALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT'
        )

    GCISPAR = parse(GCIS)
    for record in GCISPAR:
        # Reset per book so the error report never uses unbound or stale names.
        TITLE = ISBNS = IDEN = None
        try:
            # Extracts book identifier from GCIS
            IDEN = record["identifier"]
            match = re.search(r'.*/(.*?)\..*?$', GCIS)
            if match:
                FILETYPE = match.groups()[0]
            # HREF = url that leads to book.json in GCIS-DEV
            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
                FILETYPE, IDEN)
            # Fetched once; the same document is used for the fields and the log.
            M = parse(HREF)
            # Extracts book title and isbn from GCIS-DEV
            book = dict(M)
            TITLE = book['title']
            ISBNS = book['isbn']
            # Cleans ISBNS to only contain valid characters
            CISBN = clean(ISBNS)
            # V13 = validated canonical ISBN-13; EAN13 signals failure with an
            # empty value (None or ''), in which case the canonical form is used.
            V13 = EAN13(CISBN)
            if not V13:
                V13 = canonical(CISBN)

            print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS,
                  '\n\n\t', "isbn_mod:", V13, "\n\n")

            # DBpedia ISBN formats, tried in this order.
            canonical_isbn = canonical(CISBN)
            isbn10 = to_isbn10(CISBN)
            isbn10_hyphenated = hyphenate(isbn10)
            isbn13 = to_isbn13(CISBN)
            isbn13_hyphenated = hyphenate(isbn13)
            tests = [
                ISBNS,
                canonical_isbn,
                isbn10,
                isbn10_hyphenated,
                isbn13,
                isbn13_hyphenated,
                V13,
                "ISBN {}".format(CISBN),
                "ISBN {}".format(canonical_isbn),
                "ISBN {}".format(isbn13_hyphenated),
                "ISBN {}".format(V13),
                "ISBN {}".format(isbn10),
                "ISBN {}".format(isbn10_hyphenated),
            ]

            for indie in tests:
                # Run each query once and reuse the result instead of sending
                # the same request up to three times.
                result = RQUERY(QUERY % indie)
                if len(result) != 0:
                    print(result)
                    break

        except Exception:
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(
                TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
Beispiel #11
0
def main():
    """Post a GCIS book record back to GCIS-DEV with the given credentials.

    Credentials come from the YAML config file passed with ``--login`` (its
    first list element must provide 'url', 'userinfo' and 'key'); any value
    missing there falls back to ``--username`` / ``--apikey``.

    NOTE(review): the loop ends with ``sys.exit()``, so only the first book
    of the listing is processed -- confirm this is intended.
    NOTE(review): credentials are echoed to stdout, as in the original.
    """
    # Imported at function scope so that both this function
    # (requests.Session) and the nested parse() helper can use the module.
    import requests

    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-log',
                        '--login',
                        help="Route path to Gcis.conf YAML file")
    parser.add_argument(
        '-url',
        '--gcis',
        help=
        'INSERT EITHER: https://data.globalchange.gov OR https://gcis-search-stage.jpl.net:3000'
    )
    parser.add_argument('-name', '--username', help="Insert GCIS username")
    parser.add_argument('-pw',
                        '--apikey',
                        help="Insert GCIS username's api key")
    args = parser.parse_args()
    gcis = 'https://data.globalchange.gov'
    gcisdev = 'https://gcis-search-stage.jpl.net:3000'

    # Extracts login info from Gcis.conf. An empty mapping stands in when no
    # config file is given so the fallbacks below work without --login.
    diction = {}
    if args.login:
        with open(args.login, "r") as conf:
            # safe_load replaces the loader-less yaml.load(): it needs no
            # Loader argument and cannot construct arbitrary Python objects.
            entries = yaml.safe_load(conf)
        diction = entries[0]
        print('{}\n{}\n{}'.format(diction['url'], diction['userinfo'],
                                  diction['key']))

    if args.gcis in (gcis, gcisdev):
        print(args.gcis)
    else:
        print('NO MANUAL ENDPOINT (Ignore if using Config file)')
    if args.username:
        print(args.username)
    else:
        print('NO MANUAL USERNAME (Ignore if using Config file)')
    if args.apikey:
        print(args.apikey)
    else:
        print('NO MANUAL API KEY (Ignore if using Config file)')

    # Credentials: config file values win, command line values fill the gaps.
    # (This used to sit inside the "no api key" branch above and was skipped
    # whenever --apikey was given.)
    # NOTE(review): as before, a configured url selects the GCIS-DEV endpoint
    # rather than the configured value; `path` is currently only referenced by
    # the commented-out HREF below.
    path = args.gcis if diction.get('url') is None else gcisdev

    user = diction.get('userinfo')
    if user is None:
        user = args.username

    key = diction.get('key')
    if key is None:
        key = args.apikey

    # Parses url.json
    def parse(url):
        r = requests.get(url, verify=False)
        JSONdict = r.json()
        return JSONdict

    GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
    GCISPAR = parse(GCIS)

    for record in GCISPAR:
        # Extracts book identifier from GCIS
        IDEN = record["identifier"]
        match = re.search(r'.*/(.*?)\..*?$', GCIS)
        if match:
            FILETYPE = match.groups()[0]
        # HREF = url that leads to book.json in GCIS-DEV
        HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json'.format(
            FILETYPE, IDEN)
        # HREF for either GCIS or GCIS-DEV
        # HREF = '{}//{}/{}.json' .format(path, FILETYPE, IDEN)
        # Fetched once; the same document is inspected, printed and posted.
        M = parse(HREF)
        print(M)
        # Extracts book title and isbn from GCIS-DEV
        d = dict(M)
        TITLE = d['title']
        ISBNS = d['isbn']
        # Cleans ISBNS to only contain valid characters
        CISBN = clean(ISBNS)
        # V13 = validated canonical ISBN-13; EAN13 signals failure with an
        # empty value (None or ''), in which case the canonical form is used.
        V13 = EAN13(CISBN)
        if not V13:
            V13 = canonical(CISBN)
        # For possible future implementation of adding original isbn into the
        # JSON dictionary:
        #     M["isbn"] = V13
        #     M["org_isbn"] = ISBNS
        print(M, '\n\t', "isbn_original:", ISBNS)
        # Posts updated JSON dictionary back into GCIS-DEV using the
        # credentials resolved above.
        s = requests.Session()
        s.auth = (user, key)
        s.headers.update({'Accept': 'application/json'})
        r = s.post(HREF, data=M, verify=False)
        r.raise_for_status()
        sys.exit()
Beispiel #12
0
def main():
    """Compare every GCIS book with Library of Congress and WorldCat data.

    For each book of the GCIS listing (``--GCIS`` url, or the default book
    listing) the ISBN is normalised to a validated ISBN-13, then looked up
    first through the Library of Congress SRU endpoint and then through the
    isbnlib 'wcat' service. Results are written to the module-level ``file2``;
    per-book failures are printed and written to the module-level ``file``.
    """
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-awsid', '--AWSAccessKeyID', help = "Insert AWS Access Key ID")
    parser.add_argument('-astag', '--AssociateTag', help = "Insert Amazon Associate Tag")
    parser.add_argument('-path', '--GCIS', help = "Insert url path to GCIS book in JSON format [ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1'] ")
    args = parser.parse_args()
    GCIS = args.GCIS

    if args.AWSAccessKeyID:
        print(args.AWSAccessKeyID)
    else:
        print('NO AWS Access Key ID')

    if args.AssociateTag:
        print(args.AssociateTag)
    else:
        print('NO Amazon Associate Tag')

    if GCIS is None:
        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
        print('NO MANUAL GCIS PATH\n ALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT')

    GCISPAR = parse(GCIS)

    for x in range(len(GCISPAR)):
        # Reset per book so the error reports never use unbound or stale names.
        TITLE = ISBNS = IDEN = None

        # --- Library of Congress lookup (labelled "AMAZON" in the output) ---
        try:
            # Extracts book identifier from GCIS
            IDEN = GCISPAR[x]["identifier"]
            match = re.search(r'.*/(.*?)\..*?$', GCIS)
            if match:
                FILETYPE = match.groups()[0]
            # HREF = url that leads to book.json in GCIS-DEV
            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json' .format(FILETYPE, IDEN)
            # Fetched once; the same document is used for the fields and the log.
            M = parse(HREF)
            # Extracts book title and isbn from GCIS-DEV
            d = dict(M)
            TITLE = d['title']
            ISBNS = d['isbn']
            # Cleans ISBNS to only contain valid characters
            CISBN = clean(ISBNS)
            # V13 = validated canonical ISBN-13; EAN13 signals failure with an
            # empty value (None or ''), in which case the canonical form is used.
            V13 = EAN13(CISBN)
            if not V13:
                V13 = canonical(CISBN)
            locapi = 'http://lx2.loc.gov:210/lcdb?version=1.1&operation=searchRetrieve&query=bath.isbn={}&maximumRecords=1&recordSchema=mods' .format(V13)
            results = xmlparse(locapi)
            GCISDATA = "GCIS-DEV\n\n\t{}\n\n\tisbn_original:{}\n\n\tisbn_mod:{}\n\n" .format(M, ISBNS, V13)
            APIDATA = "AMAZON\n\n\t{}\n\n------------\n\n" .format(results)
            print("GCIS-DEV\n\t", M, '\n\n\t', "isbn_original:", ISBNS, '\n\n\t', "isbn_mod:", V13, "\n\n")
            print('AMAZON\n\t',results)
            file2.write(GCISDATA)
            file2.write(APIDATA)

        except Exception:
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)

        # --- WorldCat lookup for the same book ---
        # Extracts book identifier from GCIS
        IDEN = GCISPAR[x]["identifier"]
        match = re.search(r'.*/(.*?)\..*?$', GCIS)
        if match:
            FILETYPE = match.groups()[0]
        try:
            # HREF = url that leads to book.json in GCIS-DEV
            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json' .format(FILETYPE, IDEN)
            M = parse(HREF)
            # Extracts book title and isbn from GCIS-DEV
            d = dict(M)
            TITLE = d['title']
            ISBNS = d['isbn']
            # Cleans ISBNS to only contain valid characters
            CISBN = clean(ISBNS)
            # V13 = validated canonical ISBN-13 (same fallback as above)
            V13 = EAN13(CISBN)
            if not V13:
                V13 = canonical(CISBN)
            v = meta(V13, service = 'wcat', cache ='default')
            GCISDATA = "GCIS-DEV\n\n\t{}\n\n\tisbn_original:{}\n\n\tisbn_mod:{}\n\n" .format(M, ISBNS, V13)
            APIDATA = "WorldCat\n\n\t{}\n\n------------\n\n" .format(v)
            print("GCIS-DEV\n\n\t", M, '\n\n\t', "isbn_original:", ISBNS, '\n\n\t', "isbn_mod:", V13, "\n\n")
            print("WorldCat\n\n\t", v, '\n\n')
            file2.write(GCISDATA)
            file2.write(APIDATA)

        except Exception:
            # Report the failure the same way as the first lookup; previously
            # the message was built here and then silently discarded.
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
Beispiel #14
0
def importar_acervo_bibliotecario(args, input_file, col_isbn, cols_fonte,
                                  cols_destino):
    """Import a library catalogue file into the unified catalogue.

    Reads ``input_file`` (CSV or XLSX), keeps the ``cols_fonte`` columns and
    renames them to ``cols_destino``, discards rows whose 'isbn13' value is
    not a valid ISBN, normalises the remaining values to ISBN-13 and then
    inserts new books into / updates existing books of the unified catalogue
    stored at ``JSON_ACERVO_UNIFICADO``.

    Args:
        args: Parsed command line options; only ``args.v`` (verbose) is read.
        input_file: Path of the CSV/XLSX file to import.
        col_isbn: Not used by this function.
        cols_fonte: Comma-separated column names of the input file.
        cols_destino: Comma-separated unified-catalogue column names, in the
            same order as ``cols_fonte``. After renaming, a column named
            'isbn13' must exist.

    Returns:
        None. Problems are reported with ``print`` and abort the import.
    """
    cols_fonte = cols_fonte.split(',')
    cols_destino = cols_destino.split(',')

    if os.path.isfile(JSON_ACERVO_UNIFICADO):
        df_unificado = pd.read_json(JSON_ACERVO_UNIFICADO, dtype=str)
    else:
        df_unificado = pd.DataFrame(columns=COLUNAS_ACERVO_UNIFICADO)

    # splitext looks at the last suffix only, so names such as
    # "./data/acervo.v2.csv" are recognised and a name without any dot no
    # longer raises.
    extensao_arquivo = os.path.splitext(input_file)[1].lower()

    if extensao_arquivo == '.csv':
        df = pd.read_csv(input_file, dtype=str)
    elif extensao_arquivo == '.xlsx':
        df = pd.read_excel(input_file, dtype=str)
    else:
        print(
            "ERRO: Arquivo a ser importado deve estar em formato CSV ou XLSX.")
        return

    if len(cols_fonte) != len(cols_destino):
        print(
            "ERRO: listas de colunas fonte e destino devem ter mesmo tamanho.")
        return
    elif len(cols_fonte) == 0:
        print(
            "ERRO: listas de colunas fonte e destino devem ter ser não vazias."
        )
        return

    # Check that the destination columns exist in the unified schema.
    for col in cols_destino:
        if col not in COLUNAS_ACERVO_UNIFICADO:
            print("ERRO: %s não é uma coluna do esquema do acervo unificado." %
                  col)
            return

    # Check that the source columns exist in the input file. This must run
    # before the projection below, otherwise a missing column raises a raw
    # KeyError instead of producing the message.
    for col in cols_fonte:
        if col not in df.columns:
            print(
                "ERRO: %s não é uma coluna do esquema do acervo de entrada." %
                col)
            return

    if args.v:
        print("Esquema do acervo de entrada: ", cols_fonte)
        print("Projeção do esquema do acervo unificado: ", cols_destino)

    # Each input column corresponds to one unified-catalogue column. Project
    # the input onto the source columns and rename them to the unified names,
    # which simplifies the update performed at the end of this function.
    mapeamento_colunas = dict(zip(cols_fonte, cols_destino))
    df = df[cols_fonte].rename(columns=mapeamento_colunas)

    # Split the input rows into valid and invalid ones (a row is valid if and
    # only if its value converts to an ISBN-13). to_isbn13 reports failure
    # with an empty value (None or ''), so test for falsiness.
    indices_entradas_invalidas = []
    indices_entradas_validas = []
    for index, row in df.iterrows():
        isbn = il.to_isbn13(il.clean(str(row['isbn13'])))
        if not isbn or isbn == 'nan':
            indices_entradas_invalidas.append(index)
        else:
            indices_entradas_validas.append(index)

    # Filter the input; afterwards it only holds valid rows. The collected
    # values are index labels, so drop by label.
    df.drop(indices_entradas_invalidas, inplace=True)

    # 'sanity check': the unified catalogue must not contain duplicates.
    isbns_no_acervo_unif = df_unificado['isbn13'].tolist()
    if len(set(isbns_no_acervo_unif)) != len(isbns_no_acervo_unif):
        print("ERRO GRAVE: acervo unificado contém duplicatas!")
        print(len(set(isbns_no_acervo_unif)) - len(isbns_no_acervo_unif))
        print([
            item for item, count in collections.Counter(
                isbns_no_acervo_unif).items() if count > 1
        ])
        return

    # Store the normalised ISBN-13 and keep one row per book.
    df['isbn13'] = [
        il.to_isbn13(il.clean(str(valor))) for valor in df['isbn13'].tolist()
    ]
    df.sort_values('isbn13', inplace=True)
    df.drop_duplicates(subset="isbn13", inplace=True)

    # 'sanity check': the same book must not be imported more than once.
    isbns_no_arquivo_entrada = df['isbn13'].tolist()
    if len(set(isbns_no_arquivo_entrada)) != len(isbns_no_arquivo_entrada):
        print("ERRO GRAVE: acervo de entrada contém duplicatas!")
        print(
            len(indices_entradas_validas) - len(set(isbns_no_arquivo_entrada)))
        print([
            item for item, count in collections.Counter(
                isbns_no_arquivo_entrada).items() if count > 1
        ])
        return

    # Every valid input row either already exists in the unified catalogue
    # or is new; classify each row accordingly.
    entradas_ja_existentes = []
    entradas_novas = []

    isbns_no_acervo_entrada = df['isbn13'].tolist()
    # Set membership replaces a linear list.index() scan per row.
    isbns_unificados = set(isbns_no_acervo_unif)

    for index, row in df.iterrows():
        isbn = il.clean(str(row['isbn13']))

        if il.is_isbn10(isbn):
            isbn13 = il.to_isbn13(isbn)
            if isbn13 in isbns_no_acervo_entrada:
                print(
                    "AVISO: duplicata no arquivo de entrada (cadastro com ISBN10 e ISBN13): %s"
                    % isbn)
                return

        isbn = il.to_isbn13(isbn)

        if not isbn or isbn == 'nan':
            continue

        if isbn in isbns_unificados:
            entradas_ja_existentes.append(index)
        else:
            entradas_novas.append(index)

    # Build one frame for new rows and one for rows that already exist.
    # Explicit copies: both frames are modified below.
    df_inserir = df.loc[entradas_novas].copy()
    df_atualizar = df.loc[entradas_ja_existentes].copy()

    # In the unified catalogue every entry is identified by its ISBN-13, so
    # make sure both frames carry ISBN-13 values.
    df_inserir['isbn13'] = [
        il.to_isbn13(i) for i in df_inserir['isbn13'].tolist()
    ]
    df_atualizar['isbn13'] = [
        il.to_isbn13(i) for i in df_atualizar['isbn13'].tolist()
    ]

    # Apply the changes to the unified catalogue:
    #    - new entries are inserted (concat)
    #    - existing entries are updated (merge and update)
    df_inserir.reset_index(drop=True, inplace=True)
    df_atualizar.reset_index(drop=True, inplace=True)
    if args.v:
        print("Entradas atuais: ", len(df_unificado.index))
        print("Novas entradas: ", len(df_inserir.index))
        print("Entradas a atualizar: ", len(df_atualizar.index))

    tamanho_acervo = len(df_unificado.index)

    print("Antes da importação, acervo unificado contém %d entrada(s)." %
          tamanho_acervo)

    df_unificado = pd.concat([df_unificado, df_inserir], axis=0, sort=False)
    df_unificado.reset_index(drop=True, inplace=True)

    # The left merge keeps the row order of the unified catalogue, so update()
    # aligns the new values with the right rows.
    result = df_unificado[['isbn13']].merge(df_atualizar, how="left")
    df_unificado.update(result)
    df_unificado.reset_index(drop=True, inplace=True)

    tamanho_acervo = len(df_unificado.index)

    print("Após a importação, acervo unificado contém %d entrada(s)." %
          tamanho_acervo)

    df_unificado.to_json(JSON_ACERVO_UNIFICADO)
def main():
    """Try to find each GCIS book on DBpedia using several ISBN spellings.

    Reads the GCIS listing URL from ``-path/--GCIS`` (falling back to the
    staging ``book.json?all=1`` listing), fetches every listed record, derives
    a set of ISBN variants from the record's ``isbn`` field and runs ``QUERY``
    through ``RQUERY`` for each variant until one returns a non-empty result.

    Any failure while handling a single record is reported on stdout and
    written to ``file``; processing then continues with the next record.
    NOTE(review): ``parse``, ``QUERY``, ``RQUERY`` and ``file`` are not defined
    in this block -- presumably module-level names; confirm.
    """
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-path', '--GCIS', help="Insert url path to GCIS book in JSON format [ex.'https://gcis-search-stage.jpl.net:3000/book.json?all=1'] ")
    args = parser.parse_args()
    GCIS = args.GCIS

    if GCIS is None:
        GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
        print('NO MANUAL GCIS PATH\n ALL GCIS BOOK JSON FORMATS WILL BE USED AS DEFAULT')

    # The resource type (e.g. "book") is the last path segment of the listing
    # URL; it does not depend on the record, so derive it once up front.
    match = re.search(r'.*/(.*?)\..*?$', GCIS)
    FILETYPE = match.group(1) if match else None

    GCISPAR = parse(GCIS)
    for record in GCISPAR:
        # Reset per record so the error report below never references an
        # unbound name or a value left over from the previous record.
        TITLE = ISBNS = IDEN = None
        try:
            # Extract the book identifier from the GCIS listing entry
            IDEN = record["identifier"]
            if FILETYPE is None:
                raise ValueError(
                    "cannot derive the resource type from {!r}".format(GCIS))
            # HREF = url that leads to book.json in GCIS-DEV
            HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json' .format(FILETYPE, IDEN)
            HREFPAR = parse(HREF)
            # Extract book title and isbn from GCIS-DEV
            book = dict(HREFPAR)
            TITLE = book['title']
            ISBNS = book['isbn']
            # Clean ISBNS so it only contains valid characters
            CISBN = clean(ISBNS)
            # V13 = validated canonical ISBN-13; fall back to the canonical
            # form when validation yields nothing (None or an empty string).
            V13 = EAN13(CISBN)
            if not V13:
                V13 = canonical(CISBN)

            # The record fetched above is reused here instead of requesting
            # the same URL a second time.
            print("GCIS-DEV\n\n\t", HREFPAR, '\n\n\t', "isbn_original:", ISBNS, '\n\n\t', "isbn_mod:", V13, "\n\n")

            # DBpedia ISBN formats: compute each variant once, then list the
            # spellings in the order they should be tried.
            canon = canonical(CISBN)
            isbn10 = to_isbn10(CISBN)
            isbn10_hyphenated = hyphenate(isbn10)
            isbn13 = to_isbn13(CISBN)
            isbn13_hyphenated = hyphenate(isbn13)

            tests = [
                ISBNS,
                canon,
                isbn10,
                isbn10_hyphenated,
                isbn13,
                isbn13_hyphenated,
                V13,
                "ISBN {}" .format(CISBN),
                "ISBN {}" .format(canon),
                "ISBN {}" .format(isbn13_hyphenated),
                "ISBN {}" .format(V13),
                "ISBN {}" .format(isbn10),
                "ISBN {}" .format(isbn10_hyphenated),
            ]

            for candidate in tests:
                # Run the query once per candidate and reuse its result.
                found = RQUERY(QUERY % candidate)
                if len(found) != 0:
                    print(found)
                    break

        except Exception:
            # Per-record failures are logged and skipped; KeyboardInterrupt
            # and SystemExit are deliberately allowed to propagate.
            Error = '\n\t######## PROBLEM #######\n\tTitle:{}\n\tGCIS-ISBN:{}\n\tIdentifier:{}\n\n'.format(TITLE, ISBNS, IDEN)
            print(Error)
            file.write(Error)
# Sort every JSON book record under book_dir into an output directory chosen
# by the state of its 'isbn' field:
#   - the literal string "None"      -> problem_book/
#   - EAN13() yields a value         -> isbn13_book/ (isbn replaced by that value)
#   - otherwise                      -> non13_book/
# Records whose 'isbn' is the JSON null (Python None) are not written anywhere.
# NOTE(review): this excerpt is Python 2 (print statement) and is cut off after
# the final `with` header below.
for (root, dirs, files) in os.walk(book_dir):
    for f in files:
        # NOTE(review): the path is built from book_dir, not from `root`, so a
        # file found in a sub-directory would be looked up in book_dir itself.
        with open(book_dir + f) as item:
            json_item = json.load(item)
            book_isbn = json_item['isbn']
            if book_isbn is not None:
                if book_isbn == "None":
                    # The record carries the string "None" as its ISBN: dump it
                    # unchanged into the problem directory.
                    with open("problem_book/" + str(f), 'w') as jsonFile:
                        jsonFile.write(
                            json.dumps(json_item,
                                       sort_keys=True,
                                       indent=4,
                                       separators=(',', ': ')))
                else:
                    book_isbn = clean(book_isbn)
                    #book_isbn = book_isbn.replace("-", "")
                    #book_isbn = EAN13(book_isbn)
                    if EAN13(book_isbn) != None:
                        # Store the EAN13() result back into the record before
                        # writing it out.
                        book_isbn = EAN13(book_isbn)

                        json_item['isbn'] = book_isbn
                        print json_item['isbn']
                        with open("isbn13_book/" + str(f), 'w') as jsonFile:
                            jsonFile.write(
                                json.dumps(json_item,
                                           sort_keys=True,
                                           indent=4,
                                           separators=(',', ': ')))
                    else:
                        # EAN13() returned None; the body of this with-block is
                        # missing from this excerpt.
                        with open("non13_book/" + str(f), 'w') as jsonFile:
                # For each file: load the JSON record and write it to one of
                # three output paths depending on its 'isbn' field, while
                # keeping counters and identifier lists up to date.
                # NOTE(review): the statements enclosing this loop are not part
                # of this excerpt.
                for f in files:
                    with open(book_dir+f) as item:
                        json_item = json.load(item)
                        book_isbn = json_item['isbn']
                        # Records whose isbn is None (JSON null) are skipped.
                        if book_isbn is not None:

                            # No isbn: the field holds the literal string "None"
                            if book_isbn == "None": 
                                with open("%s%s"%(isbn_none_path,str(f)),'w') as jsonFile:
                                    jsonFile.write(json.dumps(json_item, sort_keys=True, indent=4, separators=(',',': ')))
                                    #problem_log.write(json_item['identifier']+"\n")
                                    # Remember the identifier and count the record as a problem.
                                    no_isbn.append(json_item['identifier'])
                                    problem_count = problem_count + 1
                            
                            else:
                                book_isbn = clean(book_isbn)
                                # ISBN-13: EAN13() produced a value, which replaces the stored isbn
                                if EAN13(book_isbn) != None:
                                    book_isbn = EAN13(book_isbn)
                                    json_item['isbn'] = book_isbn
                                    with open("%s%s"%(isbn_13_path,str(f)), 'w') as jsonFile:
                                        jsonFile.write(json.dumps(json_item, sort_keys=True, indent=4, separators=(',', ': ')))
                                        normal_count = normal_count + 1
                                # ISBN-OTHER: EAN13() returned None; store the cleaned
                                # value with any remaining hyphens removed
                                else:
                                    book_isbn = book_isbn.replace("-", "")
                                    json_item['isbn'] = book_isbn
                                    with open("%s%s"%(isbn_other_path,str(f)), 'w') as jsonFile:
                                        jsonFile.write(json.dumps(json_item, sort_keys=True, indent=4, separators=(',', ': ')))
                                        other_isbn.append(json_item['identifier'])
                                        #problem_log.write(json_item['identifier']+"\n")
Beispiel #18
0
import sys

from shutil import copyfile

# Command line tool: takes an ISBN and a file name, validates both, fetches
# the book metadata for the ISBN and extracts the fields used to build a new
# file name.
print('Bavatar eBook Renamer')

# Checking program invocation
if len(sys.argv) != 3:
    print("Commandline: python rename.py ISBN Filename")
else:
    # Recovering arguments
    isbn = sys.argv[1]
    book_filename = sys.argv[2]

    # Checking arguments: normalise whatever was typed to an ISBN-13.
    isbn = isbnlib.to_isbn13(isbnlib.clean(isbn))
    # A failed conversion is signalled by a falsy result -- None in older
    # isbnlib releases, an empty string in newer ones -- so test truthiness
    # rather than identity with None.
    if not isbn:
        print("ISBN is not a valid code. Please check the argument")
    else:
        if not os.path.isfile(book_filename):
            print("Filename is not a valid file. Please check the argument")
        else:
            # Get ISBN info from code
            meta = isbnlib.meta(isbn)
            print(meta)

            # Format new filename: keep the original extension and take the
            # title and year from the metadata.
            filename, file_extension = os.path.splitext(book_filename)

            title = meta['Title']
            year = meta['Year']
def main():
    """Fetch a GCIS-DEV book record and POST it back using login credentials.

    Credentials are taken from the YAML file given with ``-log/--login`` (a
    list whose first item holds ``url``, ``userinfo`` and ``key``); any value
    that is absent there falls back to the matching command line argument
    (``-url``, ``-name``, ``-pw``).

    The process exits after the first record has been posted.
    """
    # Imported at function scope so that both the nested `parse` helper and
    # the Session created below can use it.
    import requests

    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-log', '--login', help="Route path to Gcis.conf YAML file")
    parser.add_argument('-url', '--gcis', help='INSERT EITHER: https://data.globalchange.gov OR https://gcis-search-stage.jpl.net:3000')
    parser.add_argument('-name', '--username', help="Insert GCIS username")
    parser.add_argument('-pw', '--apikey', help="Insert GCIS username's api key")
    args = parser.parse_args()
    gcis = 'https://data.globalchange.gov'
    gcisdev = 'https://gcis-search-stage.jpl.net:3000'

    # Extract login info from Gcis.conf. `diction` stays empty when no config
    # file is given so the credential fallback below works in every case.
    diction = {}
    if args.login:
        with open(args.login, "r") as conf:
            # safe_load: the config is plain data, so arbitrary object
            # construction must not be enabled (and newer PyYAML rejects
            # yaml.load() without an explicit Loader).
            entries = yaml.safe_load(conf)
        diction = entries[0]
        # NOTE(review): this echoes the API key to stdout.
        print('{}\n{}\n{}'.format(diction.get('url'),
                                  diction.get('userinfo'),
                                  diction.get('key')))

    if args.gcis in (gcis, gcisdev):
        print(args.gcis)
    else:
        print('NO MANUAL ENDPOINT (Ignore if using Config file)')
    if args.username:
        print(args.username)
    else:
        print('NO MANUAL USERNAME (Ignore if using Config file)')
    if args.apikey:
        # NOTE(review): this echoes the API key to stdout.
        print(args.apikey)
    else:
        print('NO MANUAL API KEY (Ignore if using Config file)')

    # Credentials: config file first, command line second.
    # NOTE(review): as in the original code, a URL present in the config file
    # is replaced by the staging endpoint; `path` is only referenced by the
    # commented-out HREF below -- confirm the intended precedence.
    if diction.get('url') is None:
        path = args.gcis
    else:
        path = gcisdev

    user = diction.get('userinfo')
    if user is None:
        user = args.username

    key = diction.get('key')
    if key is None:
        key = args.apikey

    if user is None or key is None:
        # Fail before any request is made instead of crashing later on.
        parser.error('GCIS username and API key are required '
                     '(config file via -log, or -name and -pw)')

    # Parses url.json
    def parse(url):
        """Return the decoded JSON body served at *url* (TLS not verified)."""
        r = requests.get(url, verify=False)
        return r.json()

    GCIS = 'https://gcis-search-stage.jpl.net:3000/book.json?all=1'
    GCISPAR = parse(GCIS)

    # The resource type (here "book") is the last path segment of the listing
    # URL; it is the same for every record, so derive it once.
    match = re.search(r'.*/(.*?)\..*?$', GCIS)
    if match is None:
        raise ValueError("cannot derive the resource type from {!r}".format(GCIS))
    FILETYPE = match.group(1)

    # One authenticated session is enough for all POSTs.
    s = requests.Session()
    s.auth = (user, key)
    s.headers.update({'Accept': 'application/json'})

    for record in GCISPAR:
        # Extract the book identifier from the GCIS listing entry
        IDEN = record["identifier"]
        # HREF = url that leads to book.json in GCIS-DEV
        HREF = 'https://gcis-search-stage.jpl.net:3000/{}/{}.json' .format(FILETYPE, IDEN)
        # HREF for either GCIS or GCIS-DEV
        #HREF = '{}//{}/{}.json' .format(path, FILETYPE, IDEN)
        # test
        #HREF = 'https://gcis-search-stage.jpl.net:3000/book/305e4144-39d2-4d84-8843-3f502ab890e0.json'
        HREFPAR = parse(HREF)
        print(HREFPAR)
        # Extract book title and isbn from GCIS-DEV
        book = dict(HREFPAR)
        TITLE = book['title']
        ISBNS = book['isbn']
        # Clean ISBNS so it only contains valid characters
        CISBN = clean(ISBNS)
        # V13 = validated canonical ISBN-13; fall back to the canonical form
        # when validation yields nothing (None or an empty string).
        V13 = EAN13(CISBN)
        if not V13:
            V13 = canonical(CISBN)
        # The record fetched above is posted back as-is; it is not requested
        # a second time.
        M = HREFPAR
        # For possible future implementation of adding the original isbn into
        # the JSON dictionary:
        #   M["isbn"] = V13
        #   M["org_isbn"] = ISBNS
        print(M, '\n\t', "isbn_original:", ISBNS)
        # Post the JSON dictionary back into GCIS-DEV using the credentials
        # resolved above.
        r = s.post(HREF, data=M, verify=False)
        r.raise_for_status()
        # NOTE(review): exits after the first record, as the original did --
        # looks like a testing leftover; confirm before removing.
        sys.exit()