def build_txt(self):
    """Render this object's XML source as plain text and store the result.

    The file at ``self.xml_file.path`` is passed through
    ``librarian.text.transform`` into an in-memory buffer. The buffer's
    content is then saved to ``self.txt_file`` under the name
    ``<slug>.txt`` and ``self.save()`` is called.
    """
    from StringIO import StringIO
    from django.core.files.base import ContentFile
    from librarian import text

    out = StringIO()
    xml_source = open(self.xml_file.path)
    try:
        text.transform(xml_source, out)
    finally:
        # Close the handle deterministically, also when transform raises,
        # instead of leaving it to the garbage collector.
        xml_source.close()

    self.txt_file.save('%s.txt' % self.slug, ContentFile(out.getvalue()))
    self.save()
def for_book(cls, book, length=3):
    """Build the continuation-count table for ``book`` and its children.

    The book's XML file is converted to raw text, decoded as UTF-8,
    stripped and lower-cased. For every character the table records how
    often it follows the preceding context.

    Args:
        book: object exposing ``xml_file.path`` and ``children.all()``.
        length: maximum number of characters kept as the context key.

    Returns:
        The result of folding ``cls.join_conts`` over ``cls.get(child)``
        for each child, starting from this book's own table. That table
        maps a context string to a dict of ``{next_character: count}``.
    """
    # count from this book only
    output = StringIO()
    f = open(book.xml_file.path)
    try:
        text.transform(f, output, False, ('raw-text',))
    finally:
        # Release the handle even when the transform fails.
        f.close()

    # Number of characters of the old context that survive each step.
    carried = length - 1

    conts = {}
    last_word = ''
    for letter in output.getvalue().decode('utf-8').strip().lower():
        mydict = conts.setdefault(last_word, {})
        mydict[letter] = mydict.get(letter, 0) + 1
        if carried > 0:
            last_word = last_word[-carried:] + letter
        else:
            # A slice starting at -0 would keep the *whole* context, so
            # with length=1 the key used to grow without bound. Keep
            # only the current character instead.
            last_word = letter

    # add children
    return reduce(cls.join_conts,
                  (cls.get(child) for child in book.children.all()),
                  conts)
def as_text(self, *args, **kwargs):
    """Return ``librarian.text.transform`` applied to this object.

    All extra positional and keyword arguments are forwarded unchanged
    after ``self``.
    """
    from librarian import text as text_format

    render = text_format.transform
    return render(self, *args, **kwargs)
""" from StringIO import StringIO from urllib2 import urlopen from zipfile import ZipFile from librarian.dcparser import BookInfo from librarian import text from lesmianator import Lesmianator XML_FILES = "http://www.wolnelektury.pl/media/packs/xml-all.zip" if __name__ == '__main__': poet = Lesmianator() xml_zip = ZipFile(StringIO(urlopen(XML_FILES).read())) for filename in xml_zip.namelist(): print filename info = BookInfo.from_file(xml_zip.open(filename)) if u'Wiersz' in info.genres: output = StringIO() text.transform(xml_zip.open(filename), output, False, ('raw-text',)) poet.add_text(output.getvalue()) poet.save()