Esempio n. 1
0
    def build_txt(self):
        """Generate the plain-text rendition of this object and store it.

        The XML file at ``self.xml_file.path`` is passed to
        ``librarian.text.transform`` together with an in-memory buffer; the
        buffer's contents are then saved to ``self.txt_file`` under the name
        ``<slug>.txt`` and the instance itself is saved.
        """
        from StringIO import StringIO
        from django.core.files.base import ContentFile
        from librarian import text

        out = StringIO()
        # Use a context manager so the XML file handle is released even when
        # the transform raises, instead of waiting for garbage collection.
        with open(self.xml_file.path) as xml_source:
            text.transform(xml_source, out)
        self.txt_file.save('%s.txt' % self.slug, ContentFile(out.getvalue()))
        self.save()
Esempio n. 2
0
 def for_book(cls, book, length=3):
     """Count letter continuations for ``book`` and merge in its children.

     The book's XML file is run through ``text.transform`` into an in-memory
     buffer. The resulting text is decoded as UTF-8, stripped and
     lower-cased, then scanned letter by letter: for every context (the
     most recently seen letters) the number of occurrences of each
     following letter is recorded.

     :param book: object exposing ``xml_file.path`` and ``children.all()``.
     :param length: maximum number of letters kept as the context key.
     :returns: the result of folding ``cls.join_conts`` over
         ``cls.get(child)`` for every child, starting from this book's own
         ``{context: {letter: count}}`` mapping.
     """
     # count from this book only
     output = StringIO()
     # ``with`` guarantees the file is closed even if the transform raises.
     with open(book.xml_file.path) as f:
         text.transform(f, output, False, ('raw-text',))

     # Number of letters carried over from the previous context.
     keep = length - 1
     conts = {}
     last_word = ''
     for letter in output.getvalue().decode('utf-8').strip().lower():
         mydict = conts.setdefault(last_word, {})
         mydict[letter] = mydict.get(letter, 0) + 1
         # A slice of ``[-0:]`` would keep the whole string, so with
         # length == 1 the context would grow without bound; carry nothing
         # over in that case. Every other length behaves as before.
         carried = last_word[-keep:] if keep else ''
         last_word = carried + letter
     # add children
     return reduce(cls.join_conts,
                   (cls.get(child) for child in book.children.all()),
                   conts)
Esempio n. 3
0
 def as_text(self, *args, **kwargs):
     """Render this object through ``librarian.text.transform``.

     Any positional and keyword arguments are forwarded unchanged, and
     whatever ``transform`` returns is returned to the caller.
     """
     # Imported lazily, at call time, as in the original implementation.
     from librarian import text as text_format
     rendered = text_format.transform(self, *args, **kwargs)
     return rendered
Esempio n. 4
0
 def as_text(self, *args, **kwargs):
     """Delegate to ``librarian.text.transform`` with ``self`` as the input.

     Extra positional and keyword arguments are passed straight through;
     the return value is the one produced by ``transform``.
     """
     # The import happens on each call rather than at module load.
     from librarian import text as text_backend
     result = text_backend.transform(self, *args, **kwargs)
     return result
Esempio n. 5
0
"""

from StringIO import StringIO
from urllib2 import urlopen
from zipfile import ZipFile

from librarian.dcparser import BookInfo
from librarian import text

from lesmianator import Lesmianator


XML_FILES = "http://www.wolnelektury.pl/media/packs/xml-all.zip"


if __name__ == '__main__':
    # Local import: ``closing`` gives deterministic cleanup for objects that
    # only expose ``close()``.
    from contextlib import closing

    poet = Lesmianator()

    # Download the whole archive into memory before opening it as a zip.
    with closing(urlopen(XML_FILES)) as response:
        archive_data = StringIO(response.read())

    with closing(ZipFile(archive_data)) as xml_zip:
        for filename in xml_zip.namelist():
            # Parenthesised form prints identically under Python 2.
            print(filename)
            # NOTE(review): assumes BookInfo.from_file reads the metadata
            # eagerly, so the member can be closed right after -- confirm.
            with closing(xml_zip.open(filename)) as xml_file:
                info = BookInfo.from_file(xml_file)

            # Only books whose genres include u'Wiersz' are fed to the poet.
            if u'Wiersz' in info.genres:
                output = StringIO()
                # Re-open the member for the transform, as the original did.
                with closing(xml_zip.open(filename)) as xml_file:
                    text.transform(xml_file, output, False, ('raw-text',))
                poet.add_text(output.getvalue())

    poet.save()