def execute(self):
    """Normalize the stored title of one Book and persist the change.

    Loads the Book referenced by ``self.book_id``, strips kilobyte-size
    markers from its title, runs title preprocessing, and tries to split
    embedded author names out of the title.  When authors are found they
    replace the book's author relation.  If the normalized title differs
    from the stored one, a before/after diff line is printed and the book
    is saved with ``credit = 1``.

    Returns:
        True unconditionally.

    NOTE(review): reconstructed from a single collapsed source line — the
    final ``book.title = …; book.save()`` statements are assumed to sit
    inside the changed-title guard (save only on change); confirm against
    the original formatting.
    """
    book = Book.objects.get(id=self.book_id)
    # Strip "(123k)" style size annotations carried over from the listing page.
    title = Retriever.cleanup_kilobytes(book.title)
    title = preprocess_title(title)
    new_authors, new_title = Retriever.extract_authors(title)
    if new_authors:
        # Authors were embedded in the title: the remainder becomes the
        # title and the book's author set is rebuilt from the names.
        title = new_title
        authors = [Author.objects.get_or_create(name=author)[0]
                   for author in new_authors]
        # Direct M2M assignment — valid in pre-2.0 Django only.
        book.author = authors
    else:
        authors = book.author.all()
    # Second normalization pass: new_title from extract_authors has not
    # been through preprocess_title yet (redundant in the else branch).
    title = preprocess_title(title)
    if book.title != title:
        # Diff line: id, old authors : 'old title' ==> new authors : 'new
        # title'; the "===" marker appears only when authors were extracted.
        print "%d %s : '%s' %s=> %s : '%s'" % (
            book.id,
            " % ".join([a.name.encode('utf-8') for a in book.author.all()]),
            book.title.encode('utf-8'),
            "=" * (3 if new_authors else 0),
            " % ".join([a.name.encode('utf-8') for a in authors]),
            title.encode('utf-8'),
        )
        book.title = title
        book.credit = 1
        book.save()
    return True
def get_books_test():
    """Smoke-test Retriever.get_accept_books on the Strugatsky index page."""
    url = 'http://lib.ru/STRUGACKIE/'
    page_soup = download_soup(url)
    accepted = Retriever.get_accept_books(page_soup, url)
    # The author index page is expected to expose exactly 99 accepted books.
    assert len(accepted) == 99
def get_dirs_test(): soup = download_soup('http://lib.ru/') all_tags = Retriever.get_accept_dirs(soup) for link,tag in all_tags: print link,tag.encode('utf8') # print keys = [tag[0] for tag in all_tags] keys.sort() #print len(keys) assert len(keys) == 64
def get_authors_title_test():
    """Check author/title extraction against a known lib.ru text file.

    Downloads the head of the ASCII rendering of one article, detects its
    charset, decodes it, and asserts that Retriever.get_authors_and_title
    pulls out the expected single author and title.

    Fix: the urllib.urlopen handle was never closed (socket leak); the
    read is now wrapped in contextlib.closing so the connection is
    released even if the read raises.
    """
    import urllib
    from contextlib import closing
    url = 'http://lib.ru/TXT/ruscience.txt'
    # 2048 bytes is enough for charset detection and the author/title header.
    with closing(urllib.urlopen(url + '_Ascii.txt')) as page:
        text = page.read(2048)
    detector = UniversalDetector()
    detector.feed(text)
    detector.close()
    encoding = detector.result['encoding']
    text = unicode(text, encoding)
    authors, title = Retriever.get_authors_and_title(text)
    assert len(authors) == 1
    assert authors[0] == u'Дмитрий Толмацкий'
    assert title == u'Российская наука на пути из реанимации в морг'