Ejemplo n.º 1
0
 def _validate_book_id(self):
     # TODO: add check constraints
     # Validate book identifier
     if self.book_id_type == 'ISBN':
         if pyisbn.validate(self.book_id):
             if len(self.book_id) == 10:
                 self.isbn10 = self.book_id
                 self.isbn13 = pyisbn.convert(self.book_id)
             else:
                 self.isbn13 = self.book_id
                 self.isbn10 = pyisbn.convert(self.book_id)
         else:
             raise ValidationError(
                 {'book_id': '{} is an invalid ISBN'.format(self.book_id)})
     elif self.book_id_type == 'ASIN':
         regex = r"[A-Z0-9]{10}"
         # Remove whitespaces
         self.book_id = self.book_id.strip()
         if len(self.book_id) == 10 and re.fullmatch(regex, self.book_id):
             self.asin = self.book_id
         else:
             raise ValidationError(
                 {'book_id': '{} is an invalid ASIN'.format(self.book_id)})
     else:
         raise ValidationError({
             'book_id_type':
             'Allowed Book Id types: {}'.format(self.allowed_book_id_types)
         })
Ejemplo n.º 2
0
def combine_title():
    """Merge titles from ``isbn2title2.pickle`` into ``isbn2title4.pickle``.

    Entries of ``isbn2title4.pickle`` with an empty-string title are dropped.
    Every key of ``isbn2title2.pickle`` is then normalised to a 13-character
    ISBN string (10-character keys go through ``pyisbn.convert``) and its
    title is added when that ISBN is not present yet and the title is
    non-empty. The merged mapping is written back to ``isbn2title4.pickle``.
    The mapping size is printed after each stage.
    """
    import pyisbn

    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only run this on pickles that were produced locally.
    with open('isbn2title4.pickle', 'rb') as f:
        data = pickle.load(f)
    print(len(data))

    # Collect the keys first: a dict must not shrink while it is iterated.
    for key in [isbn for isbn in data if data[isbn] == '']:
        data.pop(key, None)
    print(len(data))

    with open('isbn2title2.pickle', 'rb') as f:
        data2 = pickle.load(f)
    print(len(data))

    for orig_isbn in data2:
        isbn = orig_isbn
        if not isinstance(isbn, str):
            # Non-string keys are treated as integral numbers.
            isbn = str(int(isbn))
        if len(isbn) == 13:
            isbn13 = isbn
        elif len(isbn) == 10:
            try:
                isbn13 = pyisbn.convert(isbn)
            except Exception:
                # Skip keys pyisbn rejects, but let KeyboardInterrupt and
                # SystemExit through (a bare except would swallow them).
                continue
        else:
            continue
        if isbn13 not in data and len(str(data2[orig_isbn])) != 0:
            data[isbn13] = data2[orig_isbn]
    print(len(data))

    with open('isbn2title4.pickle', 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
Ejemplo n.º 3
0
def parse_isbn(isbn):
    '''Produce both the 10- and the 13-character form of an ISBN.

    If the ISBN cannot be converted, the original input is returned
    unchanged under both keys.

    :param isbn: the raw ISBN
    :return: dict with the keys ``isbn10`` and ``isbn13``
    '''
    try:
        # Converting twice yields the "other" form first and then the
        # form the input itself was given in.
        first = pyisbn.convert(str(isbn))
        second = pyisbn.convert(first)
    except pyisbn.IsbnError:
        return {'isbn10': isbn, 'isbn13': isbn}

    # The key is derived from the length of each converted value.
    return {'isbn%d' % len(code): code for code in (first, second)}
Ejemplo n.º 4
0
 def __init__(self, line=None, period=None, metric=None):
     """Initialise the base record and, when a row is given, its identifiers.

     :param line: indexable row; index 3 is read as the ISBN and index 4
         as the ISSN. When ``None`` no identifier attribute is set here.
     :param period: forwarded unchanged to the parent initialiser
     :param metric: forwarded unchanged to the parent initialiser
     """
     super(CounterBook, self).__init__(line, period, metric)
     if line is None:
         return

     # Strip padding and hyphens; 10-character values go through
     # pyisbn.convert so they are stored in the converted form.
     self.isbn = line[3].strip().replace('-', '')
     if len(self.isbn) == 10:
         self.isbn = pyisbn.convert(self.isbn)
     self.issn = line[4].strip()
     self.eissn = None
Ejemplo n.º 5
0
 def __init__(self, *args, **kwargs):
     """Create the book, deriving ``key_name`` from its ISBN-13.

     When ``isbn13`` is passed it is used as is; otherwise it is derived
     from ``isbn10`` (if given) with ``pyisbn.convert``. A truthy ISBN-13
     sets ``kwargs["key_name"]`` to ``"book:<isbn13>"`` before all
     arguments are forwarded to the parent initialiser.
     """
     # automatically generate key_name
     if "isbn13" in kwargs:
         isbn13 = kwargs["isbn13"]
     else:
         # Convert only when needed: the former eager default ran (and
         # could raise) even though an explicit isbn13 had been supplied.
         isbn10 = kwargs.get("isbn10")
         isbn13 = pyisbn.convert(isbn10) if isbn10 else None
     if isbn13:
         kwargs["key_name"] = "book:%s" % isbn13
     super(Book, self).__init__(*args, **kwargs)
     self.links = LinkAttribute(self)
Ejemplo n.º 6
0
 def __init__(self, line=None, period=None, metric=None):
     """Set up the parent record and read ISBN/ISSN from ``line`` if given.

     :param line: indexable row; ``line[3]`` is taken as the ISBN and
         ``line[4]`` as the ISSN. Nothing further happens when it is ``None``.
     :param period: passed through to the parent initialiser
     :param metric: passed through to the parent initialiser
     """
     super(CounterBook, self).__init__(line, period, metric)
     if line is None:
         return

     # Hyphens and surrounding whitespace are removed; a 10-character
     # result is replaced by what pyisbn.convert returns for it.
     self.isbn = line[3].strip().replace('-', '')
     if len(self.isbn) == 10:
         self.isbn = pyisbn.convert(self.isbn)
     self.issn = line[4].strip()
     self.eissn = None
Ejemplo n.º 7
0
def normalize(element, version='13'):
    """Return ``element`` without hyphens in the requested ISBN version.

    :param element: ISBN string, optionally hyphenated
    :param version: ``'13'`` (default) or ``'10'``
    :raises ValueError: if ``version`` is neither ``'10'`` nor ``'13'``, or
        if the de-hyphenated input is neither 10 nor 13 characters long
    """
    element = element.replace('-', '')

    # The version is checked before the length, so a bad version wins when
    # both are wrong.
    if version != '13' and version != '10':
        raise ValueError('version can only be one of 10 or 13')

    size = len(element)
    if size != 10 and size != 13:
        raise ValueError('ISBN is neither 10 or 13 chars long')

    if version == '13':
        if size == 10:
            return pyisbn.convert(element)
        # Already 13 characters: convert there and back again.
        return pyisbn.convert(pyisbn.convert(element))

    if size == 10:
        # Already 10 characters: convert there and back again.
        return pyisbn.convert(pyisbn.convert(element))
    return pyisbn.Isbn13(element).convert()
Ejemplo n.º 8
0
def parse_isbn(raw):
    '''Convert an ISBN into its 10/13-character and hyphenated forms.

    The result always has the keys ``isbn10``, ``isbn13``,
    ``isbn10-hyphen`` and ``isbn13-hyphen``. Every value starts out as
    ``raw``. When conversion succeeds, the plain keys (and the defaults of
    the hyphen keys) hold the converted values. Each hyphen key is then
    replaced by the hyphenated form if hyphenation succeeds.

    :param raw: the ISBN as received
    :return: dict with the four keys listed above
    '''
    isbn = {
        'isbn10': raw,
        'isbn13': raw,
        'isbn10-hyphen': raw,
        'isbn13-hyphen': raw
    }

    # presumably `ignores` suppresses the given exception type, like
    # contextlib.suppress -- verify against its definition
    with ignores(pyisbn.IsbnError):
        a = pyisbn.convert(raw)
        b = pyisbn.convert(a)
        # Update in place instead of replacing the dict, so the hyphen
        # keys are still present when hyphenation fails below.
        for value in (a, b):
            key = 'isbn%d' % len(value)
            isbn[key] = value
            isbn[key + '-hyphen'] = value

    # Hyphenate each form on its own so one failure does not skip the other.
    for key in ('isbn10', 'isbn13'):
        with ignores(isbn_hyphenate.IsbnMalformedError):
            isbn[key + '-hyphen'] = isbn_hyphenate.hyphenate(isbn[key])

    return isbn
Ejemplo n.º 9
0
    def get_queryset(self):
        """Collect every book matching the ``query`` request parameter.

        The term is tried, in this order, as a numeric id, against the
        title, against the names of publishers, creators and subjects
        (adding their books), as an ISBN, and against ``lang``, ``doe`` and
        ``place``. Matches are concatenated into one list, so a book may
        occur more than once.

        :return: list of matches; an empty list when no query was given
        """
        term = self.request.query_params.get('query', None)
        if not term:
            return []

        matches = []

        # by id -- deliberately best-effort: a non-numeric term (or any
        # lookup failure) simply contributes nothing
        try:
            matches.extend(Book.objects.filter(id=int(term)))
        except:
            pass

        # by title
        matches.extend(Book.objects.filter(title__icontains=term))

        # by publisher, creator and subject name, in that order
        for related in (Publisher, Creator, Subject):
            for owner in related.objects.filter(name__icontains=term):
                matches.extend(owner.books.all())

        # by isbn -- best-effort as well
        try:
            # clean the query isbn first; 10-character values are converted
            digits = term.replace('-', '')
            if len(digits) == 10:
                digits = pyisbn.convert(digits)
            matches.extend(Book.objects.filter(isbn_clean__icontains=digits))
        except:
            pass

        # by lang, doe and place, in that order
        for field in ('lang', 'doe', 'place'):
            lookup = {field + '__icontains': term}
            matches.extend(Book.objects.filter(**lookup))

        return matches
Ejemplo n.º 10
0
 def to_python(self, value):
     """Return ``value`` without hyphens as a 13-character ISBN.

     A 10-character value is converted with ``pyisbn.convert``; a
     13-character value is returned as is.

     Raises:
         ValidationError: when the de-hyphenated value is neither 10 nor
             13 characters long, or when ``Isbn(value).validate()`` is
             falsy.
     """
     value = value.replace("-", "")
     size = len(value)

     if size != 13 and size != 10:
         raise ValidationError("ISBN has to be either 10 or 13 digits long")
     if not Isbn(value).validate():
         raise ValidationError("ISBN did not validate")

     if size == 13:
         return value
     return pyisbn.convert(value)
Ejemplo n.º 11
0
def search(request):
    """Search sources by title and/or look an ISBN up on Google Books.

    Query parameters read from ``request.QUERY_PARAMS``:

    * ``title`` -- sources whose title contains it are serialized into
      ``results['sources']['spuqi']``.
    * ``isbn`` -- if pyisbn can parse it, the converted number (or the
      number itself when it cannot be converted) is looked up and stored
      in ``results['sources']['googlebooks']``. A failed checksum is
      reported in ``results['error']`` but does not stop the lookup.

    :return: ``Response({"results": results})``
    """
    results = {
        'sources': {},
        'error': {},
    }

    # Sample of a nested result structure kept from the original source:
    #   more: false,
    #   results: [
    #       { text: "Western", children: [
    #           { id: "CA", text: "California" },
    #           { id: "AZ", text: "Arizona" }
    #       ] },
    #       { text: "Eastern", children: [
    #           { id: "FL", text: "Florida" }
    #       ] }
    #   ]

    title = request.QUERY_PARAMS.get('title', False)
    if title:
        sources = Source.objects.filter(title__icontains=title)
        source_serializer = SourceSerializer(sources, many=True)
        results['sources']['spuqi'] = source_serializer.data

    isbn_str = request.QUERY_PARAMS.get('isbn', False)
    if isbn_str:
        try:
            pyisbn.Isbn(isbn_str)
            # Check the number the client sent: the output of convert()
            # carries a freshly computed check digit and always validates.
            is_valid = pyisbn.validate(isbn_str)
        except pyisbn.IsbnError:
            results['error'] = {
                'message': _('ISBN number must contain only digit-numbers'),
            }
        else:
            if not is_valid:
                results['error'] = {
                    'message': _('A valid ISBN number is required')
                }
            try:
                isbn_number = pyisbn.convert(isbn_str)
            except pyisbn.IsbnError:
                # Not every ISBN-13 has an ISBN-10 counterpart; look the
                # number up as given instead of failing the request.
                isbn_number = isbn_str
            # NOTE(review): connection errors from the Google Books client
            # are not handled here.
            results['sources']['googlebooks'] = googlebooks_api.list(
                'isbn:%s' % isbn_number)

    return Response({"results": results})
Ejemplo n.º 12
0
def parse_isbns(s):
    """Given a string, find as many unique ISBNs in it and return them.

    Candidates are runs of 10 to 25 digits, ``X`` or hyphens. After the
    hyphens are removed, 10-character candidates are converted with
    ``pyisbn.convert`` (failures are logged and skipped) and 13-character
    candidates are kept as found, without validation.

    :param s: text to scan
    :return: list of distinct 13-character strings, in no particular order
    """
    # The docstring has to be the first statement; it used to follow this
    # import and was therefore not a docstring at all.
    import pyisbn

    pattern = re.compile(r'[0-9X-]{10,25}')
    isbns = set()
    for candidate in pattern.findall(s):
        candidate = candidate.replace('-', '').replace(' ', '')
        if len(candidate) == 10:
            try:
                isbns.add(pyisbn.convert(candidate))
            except pyisbn.IsbnError as err:
                logger.error('%s: %s', s, err)
        elif len(candidate) == 13:
            isbns.add(candidate)
    return list(isbns)
Ejemplo n.º 13
0
def parse_isbns(s):
    """Given a string, find as many unique ISBNs in it and return them.

    Candidates are runs of 10 to 25 digits, ``X`` or hyphens. Once hyphens
    are stripped, a 10-character candidate is passed through
    ``pyisbn.convert`` (a failure is logged and the candidate skipped),
    while a 13-character candidate is accepted unchanged and unvalidated.

    :param s: text to scan
    :return: list of distinct 13-character strings, in no particular order
    """
    # Keep the import below the docstring: a string placed after the first
    # statement is not picked up as ``__doc__``.
    import pyisbn

    pattern = re.compile(r'[0-9X-]{10,25}')
    isbns = set()
    for candidate in pattern.findall(s):
        candidate = candidate.replace('-', '').replace(' ', '')
        if len(candidate) == 10:
            try:
                isbns.add(pyisbn.convert(candidate))
            except pyisbn.IsbnError as err:
                logger.error('%s: %s', s, err)
        elif len(candidate) == 13:
            isbns.add(candidate)
    return list(isbns)
Ejemplo n.º 14
0
    def __init__(self, line=None, period=None, metric=None, month_data=None,
                 title="", platform="", publisher="", isbn=None, issn=None):
        """Initialise the parent record and the ISBN/ISSN identifiers.

        :param line: indexable row; ``line[3]`` is read as the ISBN and
            ``line[4]`` as the ISSN when it is not ``None``
        :param isbn: explicit ISBN, overrides the value read from ``line``
        :param issn: explicit ISSN, overrides the value read from ``line``

        All remaining parameters are forwarded to the parent initialiser.
        ``self.isbn`` / ``self.issn`` are only assigned here when a row or
        the corresponding keyword supplies a value.
        """
        super(CounterBook, self).__init__(line, period, metric, month_data,
                                          title, platform, publisher)
        self.eissn = None

        if line is not None:
            # Drop padding and hyphens; 10-character values are converted.
            self.isbn = line[3].strip().replace('-', '')
            if len(self.isbn) == 10:
                self.isbn = pyisbn.convert(self.isbn)
            self.issn = line[4].strip()

        # Explicit keyword values win over whatever the row contained.
        for attr, override in (('isbn', isbn), ('issn', issn)):
            if override is not None:
                setattr(self, attr, override)
Ejemplo n.º 15
0
def combine_ratings():
    """Merge ratings from ``isbn2rating3.pickle`` into ``isbn2rating4.pickle``.

    Entries of ``isbn2rating4.pickle`` with an empty-string rating are
    dropped and the remaining ratings converted to ``float``. Every key of
    ``isbn2rating3.pickle`` is then normalised to a 13-character ISBN string
    (10-character keys go through ``pyisbn.convert``) and its rating is
    added as a ``float`` when that ISBN is not present yet and the rating
    is non-empty. The merged mapping is written back to
    ``isbn2rating4.pickle``. The mapping size is printed after each stage.
    """
    import pyisbn

    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only run this on pickles that were produced locally.
    with open('isbn2rating4.pickle', 'rb') as f:
        data = pickle.load(f)
    print(len(data))

    # Collect the keys first: a dict must not shrink while it is iterated.
    for key in [isbn for isbn in data if data[isbn] == '']:
        data.pop(key, None)
    for isbn in data:
        data[isbn] = float(data[isbn])
    print(len(data))

    with open('isbn2rating3.pickle', 'rb') as f:
        data2 = pickle.load(f)
    print(len(data))

    for orig_isbn in data2:
        isbn = orig_isbn
        if not isinstance(isbn, str):
            # Non-string keys are treated as integral numbers.
            isbn = str(int(isbn))
        if len(isbn) == 13:
            isbn13 = isbn
        elif len(isbn) == 10:
            try:
                isbn13 = pyisbn.convert(isbn)
            except Exception:
                # Skip keys pyisbn rejects, but let KeyboardInterrupt and
                # SystemExit through (a bare except would swallow them).
                continue
        else:
            continue
        if isbn13 not in data and len(str(data2[orig_isbn])) != 0:
            data[isbn13] = float(data2[orig_isbn])
    print(len(data))

    with open('isbn2rating4.pickle', 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
Ejemplo n.º 16
0
    def get_by_isbn(cls, isbn, create=True, defaults=None):
        """Fetch the book for ``isbn``, going through memcache first.

        :param isbn: ISBN string; a 10-character value is converted first
        :param create: when true the book is fetched with
            ``Book.get_or_insert``, otherwise with ``Book.get_by_key_name``
        :param defaults: extra fields for ``get_or_insert``; ``"isbn13"``
            is written into this dict when it is given and non-empty
        :raises ValueError: if the (converted) ISBN does not validate
        :return: the cached or fetched book; whatever the datastore call
            returned is passed on even when it is falsy
        """
        # NOTE: always use ISBN-13
        # this should also raise an exception if isbn is invalid
        if len(isbn) == 10:
            isbn = pyisbn.convert(isbn)

        if not pyisbn.validate(isbn):
            raise ValueError("Invalid ISBN: %s" % isbn)

        memkey = "book:%s" % isbn
        cached = memcache.get(memkey)
        if cached:
            return cached

        if create:
            fields = defaults or {}
            fields["isbn13"] = isbn
            book = Book.get_or_insert(memkey, **fields)
        else:
            book = Book.get_by_key_name(memkey)

        # Only a truthy result is written back to the cache.
        if book:
            memcache.set(book.memkey, book, Book.MEMCACHE_TTL)
        return book
Ejemplo n.º 17
0
def convert(isbn):
    """Return ``isbn`` unchanged if it is 13 characters long, else converted.

    Anything that is not exactly 13 characters long is handed to
    ``pyisbn.convert``.
    """
    return isbn if len(isbn) == 13 else pyisbn.convert(isbn)
isbns_per_rule = 100

# DO NOT change the value of any variables below this line!
start_string = "t.ISSN IN ('"
join_string = "','"
end_string = "')\n"

isbns = set()
rules = []

isbnfile = open(isbn_filename, 'r')
for line in isbnfile:
    isbn = line.strip()
    isbns.add(isbn)
    if len(isbn) == 10:
        isbns.add(pyisbn.convert(isbn))

isbnfile.close()

temp_list = []

for isbn in isbns:
    temp_list.append(isbn)

    # batch isbns into strings
    if len(temp_list) >= isbns_per_rule:
        rules.append(start_string + join_string.join(temp_list) + end_string)
        temp_list = []

# batch any remainder into a final string
if temp_list:
Ejemplo n.º 19
0
def test_convert_invalid():
    """Converting a 979-prefixed ISBN-13 must raise ``IsbnError``."""
    expected_message = ('Only ISBN-13s with 978 Bookland code can be converted '
                        'to ISBN-10.')
    with raises(IsbnError, match=expected_message):
        convert('9790000000001')
Ejemplo n.º 20
0
        title = line.strip().replace(isbn, '').strip()
        testisbns.append(isbn)
    print "Success!"

    isbns = []
    for testisbn in testisbns:
        isbns.append(str(testisbn.strip().replace('-', '')))
    print len(isbns)

    urlstup = []
    for isbn in isbns:
        isbnraw = isbn[:]
        if len(isbn) == 10: pass
        elif len(isbn) == 13:
            try:
                isbn = pyisbn.convert(isbn)
            except pyisbn.IsbnError as e:
                print "Invalid ISBN: ", isbn
                continue
        else:
            print "Length of ISBN not 13 or 10: ", isbn
            continue
        isbn_title_author = (isbnraw, isbn, title, author)
        urlstup.append(fullurl)
    print len(urls)

    urls = []
    for isbn in isbns:
        if len(isbn) == 10: pass
        elif len(isbn) == 13:
            try:
Ejemplo n.º 21
0
            soup = BeautifulSoup(review, 'html.parser')
            review = ''.join(soup.findAll(text=True))
            review = review.strip()
            if review == "": continue
            else: pass
            reviews_parsed.append(review)
        reviews_tup.append(reviews_parsed)
        reviews_dict = dict()
        reviews_dict["reviews"] = reviews_parsed
        reviews_dict["isbn"] = row[0]
        reviews_dict["avg_rating"] = float(row[2].split()[0])
        reviews_dicts.append(reviews_dict)

        reviews_dict = dict()
        reviews_dict["reviews"] = reviews_parsed
        reviews_dict["isbn"] = pyisbn.convert(row[0])
        reviews_dict["avg_rating"] = float(row[2].split()[0])
        reviews_dicts.append(reviews_dict)

print "total rows read {}".format(len(results1))
print "len reviews_tup {}".format(len(reviews_tup))
print "time took to process: {}".format(time.time() - start)

books = dict()
books["books"] = reviews_dicts
print type(books)

with open('isbn_avgrating_reviews.json', 'w') as f:
    for chunk in json.JSONEncoder().iterencode(books):
        f.write(chunk)
Ejemplo n.º 22
0
 def isbn_10(self):
     """Return ``self.isbn`` as is when it is 10 characters long.

     Any other length is handed to ``pyisbn.convert`` and the converted
     value is returned.
     """
     return self.isbn if len(self.isbn) == 10 else pyisbn.convert(self.isbn)
Ejemplo n.º 23
0
    def find_books(self, direct_search_only=False):
        """
        Look up book details for the configured title and ISBNs.

        :Title lookup:
        The result of ``self.lookup_by_title()`` is always included.

        :ISBN lookup:
        Each ISBN in ``self.isbns`` is searched on Google Books first and,
        if that yields nothing, on Amazon. Unless ``direct_search_only`` is
        set, an ISBN that is still unresolved is looked up through its
        related ISBNs and finally by the title found for it. Every result
        of an ISBN is tagged with that ISBN in both its 10- and
        13-character form.

        A failure while processing one ISBN is recorded and the remaining
        ISBNs are still processed; collected errors are passed to
        ``flash``.

        :param direct_search_only: skip the related-ISBN and title fallbacks
        :return: the distinct books found, sorted by title
        """
        found_books = []
        errors = []

        found_books += self.lookup_by_title()

        for isbn in self.isbns or ():
            try:
                isbn = isbn.strip()

                # Start from a clean list for every ISBN. Sharing one list
                # across ISBNs re-tagged earlier results with the current
                # ISBN and suppressed the fallback searches below.
                lookup_results = []

                # first search on google books
                lookup_results += self.search_google_books(isbn, None)

                # if not on google, search amazon
                if not lookup_results:
                    lookup_results += self.search_amazon(isbn)

                # if direct_search_only is not set, then try related
                # and title search
                if not direct_search_only and not lookup_results:
                    lookup_results += self.lookup_by_related_isbn(isbn)

                    if not lookup_results:
                        lookup_results = self.lookup_by_title(
                            self.lookup_title(isbn))

                # ensure that all results have the targeted isbn
                for book in lookup_results:
                    if len(isbn) == 13:
                        book.isbn13 = isbn
                        book.isbn10 = pyisbn.convert(isbn)
                    else:
                        book.isbn10 = isbn
                        book.isbn13 = pyisbn.convert(isbn)

                found_books += lookup_results
            except Exception as err:
                # Best effort: note the failure and go on with the next ISBN.
                errors.append(
                    time.strftime('%Y-%m-%d_%H:%M:%S - ') + str(err))
                continue

        found_books = sorted(set(found_books), key=lambda b: b.title)
        if errors:
            flash(errors)
        return found_books
Ejemplo n.º 24
0
    def convert_entry(self, entry):
        """Turn one exported ``entry`` into a tuple of book fields.

        Column names are resolved through ``self.headers``. An entry that
        has the priority column is treated as a wishlist item and shelved
        as ``to-read``; otherwise rating, review, dates and shelves come
        from the entry's stars, comment, status and tags. With
        ``self.only_isbn`` set, the descriptive fields are blanked.

        :param entry: mapping from column name to cell value
        :return: tuple ``(title, author, additional_authors, isbn10, isbn13,
            my_rating, publisher, binding, num_of_pages, year_published,
            date_read, date_added, bookshelves, my_review, private_notes)``
            with ``bookshelves`` joined by commas
        """
        # Resolve the column names used by this export, in a fixed order.
        (isbn_col, title_col, author_col, format_col, pages_col,
         publisher_col, pub_date_col, note_col, comment_title_col,
         comment_body_col, status_col, stars_col, priority_col,
         tags_col) = (
            self.headers[label] for label in (
                'ISBN', 'Title', 'Author', 'Format', 'Number of pages',
                'Publisher', 'Publication date', 'Private Note',
                'Comment title', 'Comment content', 'Status', 'Stars',
                'Priority', 'Tags'))

        title = entry.get(title_col)

        # The first comma-separated name is the author, the rest are extras.
        author = additional_authors = None
        if author_col in entry:
            names = [part.strip() for part in entry[author_col].split(',')]
            if names:
                author = names[0]
            if len(names) > 1:
                additional_authors = ', '.join(names[1:])

        isbn13 = entry.get(isbn_col)
        isbn10 = None
        if isbn13:
            # The first and the last character of the cell are dropped.
            isbn13 = isbn13[1:-1]
            try:
                isbn10 = pyisbn.convert(isbn13)

                # The cell actually held the 10-character form: swap.
                if len(isbn13) == 10 and len(isbn10) == 13:
                    isbn13, isbn10 = isbn10, isbn13
            except pyisbn.IsbnError:
                # ignore inconvertible ISBNs
                pass

        publisher = entry.get(publisher_col)
        binding = entry.get(format_col)
        num_of_pages = entry.get(pages_col)

        year_published = entry.get(pub_date_col)
        if year_published:
            # Same trimming as for the ISBN cell, then '-' becomes '/'.
            year_published = year_published[1:-1].replace('-', '/')

        private_notes = self._convert_linebreak(entry.get(note_col))

        if priority_col in entry:
            # wishlist
            bookshelves = ['to-read']
            my_rating = my_review = date_read = date_added = None
        else:
            # bookshelf
            my_rating = entry.get(stars_col)
            my_review = self._convert_comment(
                entry.get(comment_title_col), entry.get(comment_body_col))

            tags = entry.get(tags_col)
            status = entry.get(status_col)
            date_read, date_added, bookshelves = self._convert_status(
                status, tags)

            if not len(bookshelves):
                logging.warning('cannot parse %s: %s', title, status)

        if self.only_isbn:
            title = author = additional_authors = ''
            publisher = binding = num_of_pages = year_published = ''

        return (title, author, additional_authors, isbn10, isbn13, my_rating,
                publisher, binding, num_of_pages, year_published, date_read,
                date_added, ','.join(bookshelves), my_review, private_notes)
Ejemplo n.º 25
0
def test_convert(isbn):
    """Converting an ISBN twice must give back the original value."""
    round_tripped = convert(convert(isbn))
    assert round_tripped == isbn
Ejemplo n.º 26
0
def test_convert(isbn):
    """Converting an ISBN twice must give back its hyphen-free form."""
    round_tripped = convert(convert(isbn))
    expect(round_tripped) == isbn.replace('-', '')
Ejemplo n.º 27
0
    def post(request):
        """
        Receive an uploaded JSON file and create a Book for each new entry.

        Entries whose ``book_id`` already exists are skipped. For a new
        entry the publisher, the subjects and the creators are fetched by
        id and created when missing, then attached to the new book.

        :param request: request carrying the upload in ``FILES['json_file']``
        :return: ``{"status": "done"}`` once all entries were processed, or
            a 400 response when no file was uploaded
        :raises Exception: ``'book id: <id>'`` when importing an entry fails
        """

        json_file = request.FILES.get('json_file', None)

        if not json_file:
            return Response({"error": "No files"},
                            status=status.HTTP_400_BAD_REQUEST)

        # read the json file as utf-8; the context manager makes sure the
        # handle is closed again
        with open(json_file.temporary_file_path(), 'r', encoding='utf-8') as f:
            # convert json to python dictionary
            books = json.load(f)

        # the counter is only used to show progress
        for counter, book in enumerate(books):
            # show progress
            if counter % 10000 == 0:
                print(counter, len(books))

            try:

                try:  # if the book exists don't go through converting json
                    Book.objects.get(id=book['book_id'])
                except Book.DoesNotExist:
                    print("new book")
                    title = book['title']
                    book_id = book['book_id']
                    isbn = book['isbn']
                    image = book['image_link']
                    pdf = book['pdf_link']
                    page_count = book['pages']
                    edition = book['edition']
                    count = book['count']
                    lang = book['lang']
                    doe = book['doe']
                    place = book['place']
                    issue_date_str = book['issue_date']
                    volume = book['volume']

                    # clean isbn does not have '-' and is converted to isbn 13
                    isbn_clean = book['isbn'].replace('-', '')
                    # was len(isbn_clean == 10): len() of a bool raised
                    # TypeError for every new book
                    if len(isbn_clean) == 10:
                        try:
                            isbn_clean = pyisbn.convert(isbn_clean)
                        except Exception:
                            # keep the unconverted value if pyisbn rejects it
                            pass

                    try:  # issue date might be blank or in wrong format
                        date = issue_date_str.split('/')
                        date[0] = '13' + date[0]

                        issue_date = jdatetime.date(int(date[0]), int(date[1]),
                                                    int(date[2]))
                    except Exception:
                        # if no valid issue_date found, set that to None
                        issue_date = None

                    try:  # price might be blank or in the wrong format
                        price = int(book['price'])
                    except Exception:  # set None if no valid price
                        price = None

                    if book['publisher']:  # if book has publisher available
                        try:
                            # if publisher already in database

                            publisher = Publisher.objects.get(
                                id=book['publisher']['id'])

                        except Publisher.DoesNotExist:

                            # if publisher not in database, create it
                            publisher = Publisher.objects.create(
                                id=book['publisher']['id'],
                                name=book['publisher']['name'])
                    else:  # if no publisher available set it to None
                        publisher = None

                    # construct the book object
                    the_book = Book(title=title,
                                    publisher=publisher,
                                    id=book_id,
                                    isbn=isbn,
                                    issue_date=issue_date,
                                    price=price,
                                    image=image,
                                    pdf=pdf,
                                    page_count=page_count,
                                    edition=edition,
                                    count=count,
                                    lang=lang,
                                    place=place,
                                    doe=doe,
                                    volume=volume,
                                    isbn_clean=isbn_clean)

                    # saved once here so the relations below can be attached
                    the_book.save()

                    try:  # subjects may not be present

                        for subject in book['subjects']:
                            # get or create subject objects

                            try:
                                subject = Subject.objects.get(id=subject['id'])
                            except Subject.DoesNotExist:
                                subject = Subject.objects.create(
                                    id=subject['id'], name=subject['title'])

                            the_book.subjects.add(subject)
                    except Exception:
                        pass

                    try:  # creators may not be present
                        for creator in book['authors']:
                            # get or create, creators

                            try:
                                creator = Creator.objects.get(id=creator['id'])
                            except Creator.DoesNotExist:
                                creator = Creator.objects.create(
                                    id=creator['id'],
                                    name=creator['name'],
                                    type=creator['type'])
                            the_book.creators.add(creator)
                    except Exception:
                        pass

                    # save the book object into database
                    the_book.save()
            except Exception as exc:
                # if any uncaught exception occurs, raise it with the book id
                # that caused it; format() also copes with integer ids
                raise Exception(
                    'book id: {}'.format(book['book_id'])) from exc

        return Response({"status": "done"})
Ejemplo n.º 28
0
def _field_text(entry, label):
    """Return the text of the <td> that follows ``label`` in ``entry``, or None."""
    marker = entry.find(text=label)
    if marker is None:
        return None
    cell = marker.parent.findNext('td')
    return None if cell is None else cell.text


def _collect_books(container, required):
    """Build one dict per ``viewReading`` entry under ``container``."""
    books = []
    if container is None:
        return books
    for entry in container.findAll(class_='viewReading'):
        # Entries without a title label are skipped altogether.
        if entry.find(text="Title: ") is None:
            continue
        title = _field_text(entry, "Title: ")
        author = _field_text(entry, "Author: ")
        isbn10 = _field_text(entry, "ISBN: ")
        # A missing ISBN used to crash on None.isdigit(); such entries are
        # now skipped just like non-numeric ISBNs.
        if isbn10 is None or not isbn10.isdigit():
            continue
        isbn13 = pyisbn.convert(isbn10)

        thisbook = {}
        thisbook['title'] = title
        thisbook['author'] = author
        thisbook['isbn10'] = isbn10
        thisbook['isbn13'] = isbn13
        thisbook['labprice'] = get_labyrinth_price(entry)
        # NOTE(review): the image/edition helpers take no argument, so they
        # presumably rely on state left by get_amazon_price -- keep the
        # call order.
        thisbook['amazonprice'] = get_amazon_price(isbn13)
        thisbook['image'] = get_amazon_image()
        thisbook['edition'] = get_amazon_edition()
        thisbook['required'] = required
        books.append(thisbook)
    return books


def scrape(name):
    """Scrape the course table in ``page.txt`` and write rows to ``text5.txt``.

    For each table row from index 1329 onwards, the course designations,
    the course name and the required/recommended book lists (fetched via
    the row's reading-list link) are collected into a dict. A row whose
    designations equal those of the previously kept row is skipped. Each
    dict is printed and written to ``text5.txt`` as ``str(dict)`` followed
    by a comma, enclosed in ``[`` and ``]``.

    :param name: currently unused; the input path is hard-coded
    :return: list of the row dicts that were written
    """
    #page = open(name)
    with open('page.txt') as page:
        soup = BeautifulSoup(page)
    rows = soup.find('table').findAll('tr')

    start_row = 1329  # hard-coded first row to process
    finallist = []
    previous = []
    out = None
    try:
        #for each class
        for i in range(start_row, len(rows)):
            print(i)
            columns = rows[i].findAll('td')

            #make list of course designations i.e. MAE 305
            course_desig = []
            for c in columns[1].find('u').findAll(text=quote):
                c = re.sub(' +', '', c)
                c = re.sub('\n', '', c)
                if str(c) != '<br/>' and str(c) != "":
                    course_desig.append(c)
            if course_desig == previous:
                continue
            previous = course_desig

            currentrow = {}
            currentrow['coursedesig'] = course_desig
            #course name
            currentrow['coursename'] = (
                columns[2].contents[0].encode('utf-8').strip())

            #get page url for reading lists
            pageurl = columns[11].find('a')['href']
            text = get_books_page(pageurl)

            #required books first, then the recommended ones
            thiscoursesbooks = _collect_books(
                text.find(id='requiredList'), True)
            thiscoursesbooks += _collect_books(
                text.find(id='recommendedListContainer'), False)
            currentrow['booklist'] = thiscoursesbooks

            # The output file is opened lazily, on the first kept row.
            if out is None:
                out = open('text5.txt', 'r+')
                out.write("[\n")
            out.write(str(currentrow) + ',\n')
            print(currentrow)
            finallist.append(currentrow)

        # Only close the bracket if a row was written at all; the handle
        # was unbound here when no row had been kept.
        if out is not None:
            out.write("]")
    finally:
        if out is not None:
            out.close()
    return finallist
Ejemplo n.º 29
0
def update_to_goodreads(entries, cookies, disk_cache, limit, wait):
    """Update book entries to Goodreads.

    Every entry is looked up on Goodreads by ISBN, its edit form is fetched
    and the form is submitted through ``update_book``.  The outcome of each
    processed entry is recorded in ``disk_cache`` under the entry's ISBN-13:
    ``''`` for a successful update and ``'e'`` for any failure.

    :param entries: list of books; each entry must provide an ``'isbn13'`` key
    :param cookies: login cookie for Goodreads
    :param disk_cache: cache of updated books (mutated in place)
    :param limit: stop once this many entries were updated successfully;
        ``None`` disables the limit
    :param wait: kept for interface compatibility; not used by this function
    :return: tuple ``(success, error)`` holding the updated and the failed
        entries respectively
    """

    session = requests.Session()

    success = []
    error = []

    def record_failure(entry, pause=True):
        # Shared bookkeeping for every failure path below.
        error.append(entry)
        disk_cache[entry['isbn13']] = 'e'
        if pause:
            random_wait(2)

    for entry in entries:
        isbn13 = entry['isbn13']

        # Build the candidate identifiers from scratch for every entry so an
        # ISBN-10 computed for a previous book can never leak into this
        # lookup (and so a failed conversion on the first entry cannot raise
        # NameError).
        try:
            isbns = (pyisbn.convert(isbn13), isbn13)
        except Exception:
            # Best effort: not every ISBN-13 has an ISBN-10 counterpart, so
            # fall back to searching by ISBN-13 alone.
            logging.debug('No ISBN-10 available for %s', isbn13)
            isbns = (isbn13,)

        # NOTE(review): assumes check_exists accepts any number of candidate
        # ISBNs rather than exactly two -- confirm against its definition.
        resp = check_exists(session, isbns, cookies)
        if not resp:
            logging.warning('{} couldn\'t be found'.format(repr_book(entry)))
            record_failure(entry)
            continue

        url = get_edit_url(resp)
        if not url:
            logging.warning('{}\' url is not found'.format(repr_book(entry)))
            record_failure(entry)
            continue

        submit_url, form_data = get_form_data(session, cookies, url)
        if not form_data:
            logging.warning('{}\' form data is not found'.format(repr_book(
                entry)))
            record_failure(entry)
            continue

        # Do not cause any updates
        form_data['review[cog_explicit]'] = '0'
        for key in ('add_to_blog', 'add_update'):
            if key in form_data:
                form_data[key] = '0'

        # sanity check: the form is expected to carry exactly 10
        # 'readingSessionDatePicker' fields
        date_field_count = sum(
            1 for key in form_data if 'readingSessionDatePicker' in key)
        if date_field_count != 10:
            logging.warning('{}\' date is problematic'.format(repr_book(
                entry)))
            logging.warning(form_data)
            # No pause here, matching the original flow for this branch.
            record_failure(entry, pause=False)
            continue

        if update_book(entry, form_data, submit_url, session, cookies):
            success.append(entry)
            disk_cache[entry['isbn13']] = ''
        else:
            # The regular random_wait() below still runs for this entry.
            record_failure(entry, pause=False)

        if limit is not None and len(success) >= limit:
            break

        random_wait()

    return success, error
Ejemplo n.º 30
0
def test_convert_invalid():
    """An ISBN-13 outside the 978 Bookland range must be rejected."""
    expected_message = ('Only ISBN-13s with 978 Bookland code can be converted '
                        'to ISBN-10.')
    non_bookland_isbn13 = '0000000000000'
    with expect.raises(IsbnError, expected_message):
        convert(non_bookland_isbn13)
Ejemplo n.º 31
0
 def load(self,
          directory,
          index="crossref",
          doc_type="crossref",
          bulk_size=100000):
     """
     Load the ``json.xz`` files found under a directory into an ES index.

     ``directory`` is walked recursively with ``os.walk``.  Every file whose
     name ends in ``json.xz`` is opened as UTF-8 text and read line by line;
     each line is parsed as one JSON document, cleaned up and queued as an
     ``index`` action keyed by the document's ``DOI``.  Queued actions are
     handed to ``self.batch`` whenever ``bulk_size`` of them have accumulated.

     :param directory: The path for the directory with the data
     :param index: Name of the index (stored as ``_index`` of every action)
     :param doc_type: Name of the doc_type (stored as ``_type`` of every action)
     :param bulk_size: Number of queued actions that triggers ``self.batch``
     :return:
     """
     # Actions waiting to be flushed, and how many are currently queued.
     cache = list()
     counter = 0
     # total = len([name for name in os.listdir(directory) if os.path.join(directory, name) and name.endswith('json.xz')])
     # with tqdm(total=total) as pbar_o:
     for root, dir_names, file_names in os.walk(directory):
         for filename in file_names:
             if filename.endswith('json.xz'):
                 print("OPEN FILE")
                 # pbar_o.update()
                 with lzma.open(os.path.join(root, filename),
                                'rt',
                                encoding='utf-8') as f:
                     # One JSON document per line; an empty read ends the loop.
                     line = f.readline()
                     while line:
                         json_object = json.loads(line)
                         # json_object['oid'] = json_object['_id']['$oid']
                         # del json_object['_id']
                         # The DOI doubles as the document id in the index.
                         doc_id = json_object['DOI']
                         # convert isbn numbers
                         # Only entries that are 10 characters long once
                         # hyphens are removed are converted and kept; the
                         # rewritten list replaces the original one.
                         # NOTE(review): every other entry (e.g. values that
                         # are already ISBN-13) is dropped -- confirm intended.
                         # NOTE(review): the hyphenated value, not the
                         # stripped one, is passed to pyisbn.convert -- confirm
                         # it accepts that form.
                         if 'ISBN' in json_object.keys():
                             isbn_list = list()
                             for isbn in json_object['ISBN']:
                                 if len(isbn.replace('-', '')) == 10:
                                     isbn_list.append(pyisbn.convert(isbn))
                             json_object['ISBN'] = isbn_list
                         json_object = self.remove_affiliation(
                             json_object, 'author')
                         json_object = self.remove_affiliation(
                             json_object, 'editor')
                         json_object = self.remove_unused_fields(
                             json_object)
                         # Wrap the cleaned document in a bulk "index" action.
                         data = dict()
                         data['_op_type'] = 'index'
                         data['_index'] = index
                         data['_type'] = doc_type
                         data['_id'] = doc_id
                         data['_source'] = json_object
                         cache.append(data)
                         counter += 1
                         # Flush once bulk_size actions are queued, then start
                         # a fresh queue.
                         if counter >= bulk_size:
                             self.batch(cache)
                             cache = []
                             counter = 0
                         line = f.readline()
                 # after each file write down what keys were deleted.
                 # The file is opened in 'w' mode, so it is rewritten from
                 # scratch with the current contents of self.deleted_key.
                 # NOTE(review): self.deleted_key is presumably filled by
                 # remove_unused_fields -- confirm.
                 with open('logging/deleted_keys.txt',
                           'w',
                           encoding='utf-8') as file:
                     for key in self.deleted_key:
                         file.write(key + '\n')