예제 #1
0
    def execute(self):
        """Normalize the title (and possibly the authors) of one stored book.

        Loads the ``Book`` whose id is ``self.book_id``, passes its title
        through ``Retriever.cleanup_kilobytes`` and ``preprocess_title``, and
        asks ``Retriever.extract_authors`` for authors found in the title.
        When authors are returned, they replace the book's author relation
        and the remaining title text is used instead.  A line describing the
        change is printed whenever the resulting title differs from the
        stored one.  The book is then saved with the new title and
        ``credit`` set to 1.

        Returns:
            Always ``True``.
        """
        book = Book.objects.get(id=self.book_id)
        title = Retriever.cleanup_kilobytes(book.title)
        title = preprocess_title(title)

        new_authors, new_title = Retriever.extract_authors(title)
        if new_authors:
            # Snapshot the current authors *before* the relation is replaced;
            # otherwise the "before" side of the log would show the new ones.
            previous_authors = list(book.author.all())
            title = new_title
            authors = [Author.objects.get_or_create(name=author)[0]
                       for author in new_authors]
            book.author = authors
        else:
            authors = book.author.all()
            previous_authors = authors

        # The title may have been replaced by extract_authors, so it is
        # preprocessed once more.
        title = preprocess_title(title)
        if book.title != title:
            old_names = " % ".join([a.name.encode('utf-8') for a in previous_authors])
            new_names = " % ".join([a.name.encode('utf-8') for a in authors])
            # "===" marks entries whose authors were re-extracted from the title.
            marker = "=" * (3 if new_authors else 0)
            message = "%d %s : '%s' %s=> %s : '%s'" % (
                book.id,
                old_names,
                book.title.encode('utf-8'),
                marker,
                new_names,
                title.encode('utf-8'),
            )
            print(message)

        book.title = title
        book.credit = 1
        book.save()
        return True
예제 #2
0
def get_books_test():
    """Check the number of book links accepted on a lib.ru directory page.

    Downloads the ``STRUGACKIE`` directory page, runs
    ``Retriever.get_accept_books`` on it and asserts that exactly 99 entries
    come back.  Requires network access.
    """
    page_url = 'http://lib.ru/STRUGACKIE/'
    page_soup = download_soup(page_url)
    accepted = Retriever.get_accept_books(page_soup, page_url)
    assert len(accepted) == 99
예제 #3
0
def get_dirs_test():
    soup = download_soup('http://lib.ru/')
    all_tags = Retriever.get_accept_dirs(soup)
    for link,tag in all_tags:
        print link,tag.encode('utf8')
#    print
    keys = [tag[0] for tag in all_tags]
    keys.sort()
    #print len(keys)
    assert len(keys) == 64
예제 #4
0
def get_authors_title_test():
    """Check author/title extraction on the head of a known lib.ru text.

    Downloads the first 2048 bytes of the ``_Ascii.txt`` variant of the
    document, detects the encoding with ``UniversalDetector``, decodes the
    bytes and asserts the authors and title returned by
    ``Retriever.get_authors_and_title``.  Requires network access.
    """
    import urllib

    base_url = 'http://lib.ru/TXT/ruscience.txt'
    page = urllib.urlopen(base_url + '_Ascii.txt')
    try:
        # Only the beginning of the document is fetched.
        raw_head = page.read(2048)
    finally:
        # Release the connection even if the read fails.
        page.close()

    detector = UniversalDetector()
    detector.feed(raw_head)
    detector.close()
    encoding = detector.result['encoding']
    text = unicode(raw_head, encoding)

    authors, title = Retriever.get_authors_and_title(text)
    assert len(authors) == 1
    assert authors[0] == u'Дмитрий Толмацкий'
    assert title == u'Российская наука на пути из реанимации в морг'