Example #1
def test_transform():
    temp = NamedTemporaryFile(delete=False)
    temp.close()
    WLDocument.from_file(
            get_fixture('text', 'asnyk_zbior.xml'),
            provider=DirDocProvider(get_fixture('text', ''))
        ).as_pdf(save_tex=temp.name)
    tex = open(temp.name).read().decode('utf-8')
    print tex

    # Check contributor list.
    editors = re.search(ur'\\def\\editors\{Opracowanie redakcyjne i przypisy: ([^}]*?)\.\s*\}', tex)
    assert_equal(editors.group(1), u"Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska")
Example #2
    def wldocument(self, parse_dublincore=True):
        from catalogue.import_utils import ORMDocProvider
        from librarian.parser import WLDocument

        return WLDocument.from_file(self.xml_file.path,
                provider=ORMDocProvider(self),
                parse_dublincore=parse_dublincore)
Example #3
def load_including_children(wldoc=None, provider=None, uri=None):
    """ Makes one big xml file with children inserted at end.
    
    Either wldoc or provider and URI must be provided.
    """

    if uri and provider:
        f = provider.by_uri(uri)
        text = f.read().decode('utf-8')
        f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)

    document = WLDocument.from_string(text,
                parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document
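
A minimal usage sketch for the function above (hedged: the 'books/' directory and the 'parent-book' URI are placeholders, not taken from the source). A provider rooted at a directory of WL XML files loads the root document by URI, and every child listed in its metadata is appended recursively:

    # Hypothetical usage; 'books/' and 'parent-book' are placeholder values.
    from librarian import DirDocProvider

    provider = DirDocProvider('books/')
    merged = load_including_children(provider=provider, uri='parent-book')
    # merged.edoc is the parent lxml tree with every child document appended,
    # ready to be serialized or handed to further processing.
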
Example #4
def test_transform():
    expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')

    html = WLDocument.from_file(
            get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
        ).as_html().get_string()

    assert_equal(html, file(expected_output_file_path).read())
Example #5
    def wldocument(self, publishable=True, changes=None, 
            parse_dublincore=True, strict=False):
        from catalogue.ebook_utils import RedakcjaDocProvider
        from librarian.parser import WLDocument

        return WLDocument.from_string(
                self.materialize(publishable=publishable, changes=changes),
                provider=RedakcjaDocProvider(publishable=publishable),
                parse_dublincore=parse_dublincore,
                strict=strict)
Example #6
    def wldocument(self, parse_dublincore=True, inherit=True):
        from catalogue.import_utils import ORMDocProvider
        from librarian.parser import WLDocument

        if inherit and self.parent:
            meta_fallbacks = self.parent.cover_info()
        else:
            meta_fallbacks = None

        return WLDocument.from_file(self.xml_file.path,
                provider=ORMDocProvider(self),
                parse_dublincore=parse_dublincore,
                meta_fallbacks=meta_fallbacks)
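
A hedged call-site sketch for the wldocument() helpers above ('book' stands for whichever catalogue model defines the method; it is not named in the source). The returned WLDocument feeds straight into the as_* converters shown in the other examples:

    # Hypothetical call site; 'book' is a catalogue model instance that
    # defines the wldocument() method above.
    doc = book.wldocument(parse_dublincore=True, inherit=True)
    epub_content = doc.as_epub(flags=['without_fonts']).get_file()
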
Example #7
    def prepare_file(cls, main_input, output_dir, verbose=False, overwrite=False):
        path, fname = os.path.realpath(main_input).rsplit('/', 1)
        provider = DirDocProvider(path)
        slug, ext = os.path.splitext(fname)

        if output_dir != '':
            makedirs(output_dir)
        outfile = os.path.join(output_dir, slug + '.' + cls.ext)
        if os.path.exists(outfile) and not overwrite:
            return

        doc = WLDocument.from_file(main_input, provider=provider)
        output_file = cls.transform(doc, cover=cls.cover, flags=cls.flags)
        doc.save_output_file(output_file, output_path=outfile)
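
For context, prepare_file above is a classmethod of a converter class that is expected to supply ext, cover, flags and a transform callable. A hypothetical invocation (the class name and paths are placeholders, not from the source) could look like:

    # Hypothetical invocation; Book2Epub stands for a converter class that
    # defines ext, cover, flags and transform as assumed by prepare_file above.
    Book2Epub.prepare_file('books/asnyk_zbior.xml', output_dir='out', overwrite=True)
    # Reads the XML through a DirDocProvider rooted at 'books/' and, unless
    # 'out/asnyk_zbior.epub' already exists, writes the converted book there.
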
Example #8
    def prepare_file(cls, main_input, output_dir, verbose=False):
        path, fname = os.path.realpath(main_input).rsplit('/', 1)
        provider = DirDocProvider(path)
        slug, ext = os.path.splitext(fname)

        if output_dir != '':
            try:
                os.makedirs(output_dir)
            except OSError:
                # the directory may already exist
                pass
        outfile = os.path.join(output_dir, slug + '.' + cls.ext)

        doc = WLDocument.from_file(main_input, provider=provider)
        output_file = cls.converter.transform(doc,
                cover=cls.cover, flags=cls.flags)
        doc.save_output_file(output_file, output_path=outfile)
Example #9
def test_transform():
    epub = WLDocument.from_file(
        get_fixture('text', 'asnyk_zbior.xml'),
        provider=DirDocProvider(get_fixture(
            'text', ''))).as_epub(flags=['without_fonts']).get_file()
    zipf = ZipFile(epub)

    # Check contributor list.
    last = zipf.open('OPS/last.html')
    tree = html.parse(last)
    editors_attribution = False
    for par in tree.findall("//p"):
        if par.text.startswith(u'Opracowanie redakcyjne i przypisy:'):
            editors_attribution = True
            assert_equal(
                par.text.rstrip(), u'Opracowanie redakcyjne i przypisy: '
                u'Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska.')
    assert_true(editors_attribution)
Example #10
    def prepare_file(cls,
                     main_input,
                     output_dir,
                     verbose=False,
                     overwrite=False):
        path, fname = os.path.realpath(main_input).rsplit('/', 1)
        provider = DirDocProvider(path)
        slug, ext = os.path.splitext(fname)

        if output_dir != '':
            makedirs(output_dir)
        outfile = os.path.join(output_dir, slug + '.' + cls.ext)
        if os.path.exists(outfile) and not overwrite:
            return

        doc = WLDocument.from_file(main_input, provider=provider)
        output_file = cls.transform(doc, cover=cls.cover, flags=cls.flags)
        doc.save_output_file(output_file, output_path=outfile)
Example #11
def test_transform():
    epub = WLDocument.from_file(
            get_fixture('text', 'asnyk_zbior.xml'),
            provider=DirDocProvider(get_fixture('text', ''))
        ).as_epub(flags=['without_fonts']).get_file()
    zipf = ZipFile(epub)

    # Check contributor list.
    last = zipf.open('OPS/last.html')
    tree = html.parse(last)
    editors_attribution = False
    for par in tree.findall("//p"):
        if par.text.startswith(u'Opracowanie redakcyjne i przypisy:'):
            editors_attribution = True
            assert_equal(par.text.rstrip(),
                u'Opracowanie redakcyjne i przypisy: '
                u'Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska.')
    assert_true(editors_attribution)
Example #12
def test_empty():
    assert not WLDocument.from_string(
            '<utwor />',
            parse_dublincore=False,
        ).as_html()
Example #13
def test_passing_parse_dublincore_to_transform():
    """Passing parse_dublincore=False to transform omits DublinCore parsing."""
    WLDocument.from_file(
            get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
            parse_dublincore=False,
        ).as_html()
Example #14
def test_no_dublincore():
    WLDocument.from_file(
            get_fixture('text', 'asnyk_miedzy_nami_nodc.xml')
        ).as_html()
Example #15
def test_transform_hyphenate():
    epub = WLDocument.from_file(get_fixture('text', 'asnyk_zbior.xml'),
                                provider=DirDocProvider(get_fixture(
                                    'text',
                                    ''))).as_epub(flags=['without_fonts'],
                                                  hyphenate=True).get_file()
Example #16
    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):

            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            #            separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)

                handle_text = [all_content]

                for start, text, end in walker(
                        header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets,
                                       header_index=position,
                                       header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [],
                            'themes': [],
                            'start_section': position,
                            'start_header': header.tag
                        }

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(
                                unicode.strip,
                                map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position -
                                       frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                        # Collect content.

                    if text is not None and handle_text is not []:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                        # in the end, add a section text.
                doc = add_part(snippets,
                               header_index=position,
                               header_type=header.tag,
                               text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
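
The walker helper above drives the indexing loop: it flattens an element subtree into (start_element, text, end_element) triples in document order, so the loop can react when a tag opens (begin, motyw, footnote tags), collect its text, and react again when the matching element closes. A standalone sketch of that traversal, under the assumption of a made-up sample paragraph:

    # Standalone sketch of the walker traversal; the sample XML is made up.
    from lxml import etree

    def walk(node, ignore_tags=()):
        if node.tag not in ignore_tags:
            yield node, None, None           # element opens
            if node.text is not None:
                yield None, node.text, None  # text directly inside it
            for child in node:
                for triple in walk(child):
                    yield triple
            yield None, None, node           # element closes
        if node.tail is not None:
            yield None, node.tail, None      # text following the element

    root = etree.fromstring('<akap>Ala <motyw id="m1">Kot</motyw> ma kota.</akap>')
    events = list(walk(root))
    # 'events' interleaves open/close markers with text fragments, which is what
    # index_content uses to route text into sections, fragments and footnotes.
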
Example #17
def test_no_dublincore():
    WLDocument.from_file(get_fixture('text',
                                     'asnyk_miedzy_nami_nodc.xml')).as_text()
def test_annotations():
    annotations = (

        ('<pe/>', (
            'pe',
            [],
            '[przypis edytorski]',
            '<p> [przypis edytorski]</p>'
            ),
            'Empty footnote'),

        ('<pr>Definiendum --- definiens.</pr>', (
            'pr',
            [],
            'Definiendum \u2014 definiens. [przypis redakcyjny]',
            '<p>Definiendum \u2014 definiens. [przypis redakcyjny]</p>'
            ),
            'Plain footnote.'),

        ('<pt><slowo_obce>Definiendum</slowo_obce> --- definiens.</pt>', (
            'pt',
            [],
            'Definiendum \u2014 definiens. [przypis tłumacza]',
            '<p><em class="foreign-word">Definiendum</em> \u2014 definiens. [przypis tłumacza]</p>'
            ),
            'Standard footnote.'),

        ('<pr>Definiendum (łac.) --- definiens.</pr>', (
            'pr',
            ['łac.'],
            'Definiendum (łac.) \u2014 definiens. [przypis redakcyjny]',
            '<p>Definiendum (łac.) \u2014 definiens. [przypis redakcyjny]</p>'
            ),
            'Plain footnote with qualifier'),

        ('<pe><slowo_obce>Definiendum</slowo_obce> (łac.) --- definiens.</pe>', (
            'pe',
            ['łac.'],
            'Definiendum (łac.) \u2014 definiens. [przypis edytorski]',
            '<p><em class="foreign-word">Definiendum</em> (łac.) \u2014 definiens. [przypis edytorski]</p>'
            ),
            'Standard footnote with qualifier.'),

        ('<pt> <slowo_obce>Definiendum</slowo_obce> (daw.) --- definiens.</pt>', (
            'pt',
            ['daw.'],
            'Definiendum (daw.) \u2014 definiens. [przypis tłumacza]',
            '<p> <em class="foreign-word">Definiendum</em> (daw.) \u2014 definiens. [przypis tłumacza]</p>'
            ),
            'Standard footnote with leading whitespace and qualifier.'),

        ('<pr>Definiendum (łac.) --- <slowo_obce>definiens</slowo_obce>.</pr>', (
            'pr',
            ['łac.'],
            'Definiendum (łac.) \u2014 definiens. [przypis redakcyjny]',
            '<p>Definiendum (łac.) \u2014 <em class="foreign-word">definiens</em>. [przypis redakcyjny]</p>'
            ),
            'Plain footnote with qualifier and some emphasis.'),

        ('<pe><slowo_obce>Definiendum</slowo_obce> (łac.) --- <slowo_obce>definiens</slowo_obce>.</pe>', (
            'pe',
            ['łac.'],
            'Definiendum (łac.) \u2014 definiens. [przypis edytorski]',
            '<p><em class="foreign-word">Definiendum</em> (łac.) \u2014 <em class="foreign-word">definiens</em>. [przypis edytorski]</p>'
            ),
            'Standard footnote with qualifier and some emphasis.'),

        ('<pe>Definiendum (łac.) --- definiens (some) --- more text.</pe>', (
            'pe',
            ['łac.'],
            'Definiendum (łac.) \u2014 definiens (some) \u2014 more text. [przypis edytorski]',
            '<p>Definiendum (łac.) \u2014 definiens (some) \u2014 more text. [przypis edytorski]</p>',
            ),
            'Footnote with a second parentheses and mdash.'),

        ('<pe><slowo_obce>gemajna</slowo_obce> (daw., z niem. <slowo_obce>gemein</slowo_obce>: zwykły) --- '
         'częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.</pe>', (
            'pe',
            ['daw.', 'niem.'],
            'gemajna (daw., z niem. gemein: zwykły) \u2014 częściej: gemajn, '
            'szeregowiec w wojsku polskim cudzoziemskiego autoramentu. [przypis edytorski]',
            '<p><em class="foreign-word">gemajna</em> (daw., z niem. <em class="foreign-word">gemein</em>: zwykły) '
            '\u2014 częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu. [przypis edytorski]</p>'
            ),
            'Footnote with multiple qualifiers and emphasis.'),

    )

    xml_src = '''<utwor><akap> %s </akap></utwor>''' % "".join(
        t[0] for t in annotations)
    html = WLDocument.from_bytes(
        xml_src.encode('utf-8'),
        parse_dublincore=False).as_html().get_file()
    res_annotations = list(extract_annotations(html))

    for i, (src, expected, name) in enumerate(annotations):
        yield _test_annotation, expected, res_annotations[i], name
Example #19
def test_transform():
    mobi = WLDocument.from_file(
            get_fixture('text', 'asnyk_zbior.xml'),
            provider=DirDocProvider(get_fixture('text', ''))
        ).as_mobi(converter_path='true').get_file()
def test_annotations():
    annotations = (

        ('<pe/>', (
            'pe',
            [], 
            '',
            '<p></p>'
            ),
            'Empty footnote'),

        ('<pr>Definiendum --- definiens.</pr>', (
            'pr',
            [], 
            'Definiendum \u2014 definiens.', 
            '<p>Definiendum \u2014 definiens.</p>'
            ),
            'Plain footnote.'),

        ('<pt><slowo_obce>Definiendum</slowo_obce> --- definiens.</pt>', (
            'pt',
            [], 
            'Definiendum \u2014 definiens.', 
            '<p><em class="foreign-word">Definiendum</em> \u2014 definiens.</p>'
            ),
            'Standard footnote.'),

        ('<pr>Definiendum (łac.) --- definiens.</pr>', (
            'pr',
            ['łac.'], 
            'Definiendum (łac.) \u2014 definiens.', 
            '<p>Definiendum (łac.) \u2014 definiens.</p>'
            ),
            'Plain footnote with qualifier'),

        ('<pe><slowo_obce>Definiendum</slowo_obce> (łac.) --- definiens.</pe>', (
            'pe',
            ['łac.'], 
            'Definiendum (łac.) \u2014 definiens.', 
            '<p><em class="foreign-word">Definiendum</em> (łac.) \u2014 definiens.</p>'
            ),
            'Standard footnote with qualifier.'),

        ('<pt> <slowo_obce>Definiendum</slowo_obce> (daw.) --- definiens.</pt>', (
            'pt',
            ['daw.'], 
            'Definiendum (daw.) \u2014 definiens.', 
            '<p> <em class="foreign-word">Definiendum</em> (daw.) \u2014 definiens.</p>'
            ),
            'Standard footnote with leading whitespace and qualifier.'),

        ('<pr>Definiendum (łac.) --- <slowo_obce>definiens</slowo_obce>.</pr>', (
            'pr',
            ['łac.'], 
            'Definiendum (łac.) \u2014 definiens.', 
            '<p>Definiendum (łac.) \u2014 <em class="foreign-word">definiens</em>.</p>'
            ),
            'Plain footnote with qualifier and some emphasis.'),

        ('<pe><slowo_obce>Definiendum</slowo_obce> (łac.) --- <slowo_obce>definiens</slowo_obce>.</pe>', (
            'pe',
            ['łac.'],
            'Definiendum (łac.) \u2014 definiens.',
            '<p><em class="foreign-word">Definiendum</em> (łac.) \u2014 <em class="foreign-word">definiens</em>.</p>'
            ),
            'Standard footnote with qualifier and some emphasis.'),

        ('<pe>Definiendum (łac.) --- definiens (some) --- more text.</pe>', (
            'pe',
            ['łac.'],
            'Definiendum (łac.) \u2014 definiens (some) \u2014 more text.',
            '<p>Definiendum (łac.) \u2014 definiens (some) \u2014 more text.</p>',
            ),
            'Footnote with a second parentheses and mdash.'),

        ('<pe><slowo_obce>gemajna</slowo_obce> (daw., z niem. <slowo_obce>gemein</slowo_obce>: zwykły) --- '
         'częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.</pe>', (
            'pe',
            ['daw.', 'niem.'],
            'gemajna (daw., z niem. gemein: zwykły) \u2014 częściej: gemajn, '
            'szeregowiec w wojsku polskim cudzoziemskiego autoramentu.',
            '<p><em class="foreign-word">gemajna</em> (daw., z niem. <em class="foreign-word">gemein</em>: zwykły) '
            '\u2014 częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.</p>'
            ),
            'Footnote with multiple qualifiers and emphasis.'),

    )

    xml_src = '''<utwor><akap> %s </akap></utwor>''' % "".join(
        t[0] for t in annotations)
    html = WLDocument.from_string(xml_src, parse_dublincore=False).as_html().get_file()
    res_annotations = list(extract_annotations(html))

    for i, (src, expected, name) in enumerate(annotations):
        yield _test_annotation, expected, res_annotations[i], name
Example #21
    def prepare(cls, input_filenames, output_dir="", verbose=False):
        from lxml import etree
        from librarian import DirDocProvider, ParseError
        from librarian.parser import WLDocument
        from copy import deepcopy
        import os
        import os.path

        xml = etree.fromstring(
            """<?xml version="1.0" encoding="utf-8"?>
            <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"></products>"""
        )
        product = etree.fromstring(
            """<product>
                <publisherProductId></publisherProductId>
                <title></title>
                <info></info>
                <description></description>
                <authors>
                    <author>
                        <names>Jan</names>
                        <lastName>Kowalski</lastName>
                    </author>
                </authors>
                <price>0.0</price>
                <language>PL</language>
            </product>"""
        )

        try:
            for main_input in input_filenames:
                if verbose:
                    print main_input
                path, fname = os.path.realpath(main_input).rsplit("/", 1)
                provider = DirDocProvider(path)
                slug, ext = os.path.splitext(fname)

                outfile_dir = os.path.join(output_dir, slug)
                os.makedirs(os.path.join(output_dir, slug))

                doc = WLDocument.from_file(main_input, provider=provider)
                info = doc.book_info

                product_elem = deepcopy(product)
                product_elem[0].text = cls.utf_trunc(slug, 100)
                product_elem[1].text = cls.utf_trunc(info.title, 255)
                product_elem[2].text = cls.utf_trunc(info.description, 255)
                product_elem[3].text = cls.utf_trunc(info.source_name, 3000)
                product_elem[4][0][0].text = cls.utf_trunc(u" ".join(info.author.first_names), 100)
                product_elem[4][0][1].text = cls.utf_trunc(info.author.last_name, 100)
                xml.append(product_elem)

                cover.VirtualoCover(info).save(os.path.join(outfile_dir, slug + ".jpg"))
                outfile = os.path.join(outfile_dir, "1.epub")
                outfile_sample = os.path.join(outfile_dir, "1.sample.epub")
                doc.save_output_file(doc.as_epub(), output_path=outfile)
                doc.save_output_file(doc.as_epub(doc, sample=25), output_path=outfile_sample)
                outfile = os.path.join(outfile_dir, "1.mobi")
                outfile_sample = os.path.join(outfile_dir, "1.sample.mobi")
                doc.save_output_file(doc.as_mobi(cover=cover.VirtualoCover), output_path=outfile)
                doc.save_output_file(doc.as_mobi(doc, cover=cover.VirtualoCover, sample=25), output_path=outfile_sample)
        except ParseError, e:
            print "%(file)s:%(name)s:%(message)s" % {
                "file": main_input,
                "name": e.__class__.__name__,
                "message": e.message,
            }
Example #22
def test_empty():
    assert not WLDocument.from_string(
        '<utwor />',
        parse_dublincore=False,
    ).as_html()
Example #23
def test_transform():
    WLDocument.from_file(
            get_fixture('text', 'asnyk_zbior.xml'),
            provider=DirDocProvider(get_fixture('text', ''))
        ).as_epub(flags=['without_fonts'])
Example #24
    def prepare(cls, input_filenames, output_dir='', verbose=False):
        from lxml import etree
        from librarian import DirDocProvider, ParseError
        from librarian.parser import WLDocument
        from copy import deepcopy
        import os.path

        xml = etree.fromstring("""<?xml version="1.0" encoding="utf-8"?>
            <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"></products>""")
        product = etree.fromstring("""<product>
                <publisherProductId></publisherProductId>
                <title></title>
                <info></info>
                <description></description>
                <authors>
                    <author>
                        <names>Jan</names>
                        <lastName>Kowalski</lastName>
                    </author>
                </authors>
                <price>0.0</price>
                <language>PL</language>
            </product>""")

        try:
            for main_input in input_filenames:
                if verbose:
                    print main_input
                path, fname = os.path.realpath(main_input).rsplit('/', 1)
                provider = DirDocProvider(path)
                slug, ext = os.path.splitext(fname)

                outfile_dir = os.path.join(output_dir, slug)
                makedirs(os.path.join(output_dir, slug))

                doc = WLDocument.from_file(main_input, provider=provider)
                info = doc.book_info

                product_elem = deepcopy(product)
                product_elem[0].text = cls.utf_trunc(slug, 100)
                product_elem[1].text = cls.utf_trunc(info.title, 255)
                product_elem[2].text = cls.utf_trunc(info.description, 255)
                product_elem[3].text = cls.utf_trunc(info.source_name, 3000)
                product_elem[4][0][0].text = cls.utf_trunc(u' '.join(info.author.first_names), 100)
                product_elem[4][0][1].text = cls.utf_trunc(info.author.last_name, 100)
                xml.append(product_elem)

                cover.VirtualoCover(info).save(os.path.join(outfile_dir, slug+'.jpg'))
                outfile = os.path.join(outfile_dir, '1.epub')
                outfile_sample = os.path.join(outfile_dir, '1.sample.epub')
                doc.save_output_file(doc.as_epub(), output_path=outfile)
                doc.save_output_file(doc.as_epub(doc, sample=25), output_path=outfile_sample)
                outfile = os.path.join(outfile_dir, '1.mobi')
                outfile_sample = os.path.join(outfile_dir, '1.sample.mobi')
                doc.save_output_file(doc.as_mobi(cover=cover.VirtualoCover), output_path=outfile)
                doc.save_output_file(
                    doc.as_mobi(doc, cover=cover.VirtualoCover, sample=25),
                    output_path=outfile_sample)
        except ParseError, e:
            print '%(file)s:%(name)s:%(message)s' % {
                'file': main_input,
                'name': e.__class__.__name__,
                'message': e.message
            }
Example #25
    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):

            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            #            separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                        # Collect content.

                    if text is not None and handle_text is not []:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                        # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
Example #26
    def run(cls):
        # Parse commandline arguments
        usage = """Usage: %%prog [options] SOURCE [SOURCE...]
        Convert SOURCE files to %s format.""" % cls.format_name

        parser = optparse.OptionParser(usage=usage)

        parser.add_option('-v', '--verbose', 
                action='store_true', dest='verbose', default=False,
                help='print status messages to stdout')
        parser.add_option('-d', '--make-dir',
                action='store_true', dest='make_dir', default=False,
                help='create a directory for author and put the output file in it')
        parser.add_option('-o', '--output-file',
                dest='output_file', metavar='FILE',
                help='specifies the output file')
        parser.add_option('-O', '--output-dir',
                dest='output_dir', metavar='DIR',
                help='specifies the directory for output')
        if cls.uses_cover:
            if cls.cover_optional:
                parser.add_option('-c', '--with-cover', 
                        action='store_true', dest='with_cover', default=False,
                        help='create default cover')
            parser.add_option('-C', '--image-cache',
                    dest='image_cache', metavar='URL',
                    help='prefix for image download cache' +
                        (' (implies --with-cover)' if cls.cover_optional else ''))
        for option in cls.parser_options + cls.transform_options + cls.transform_flags:
            option.add(parser)

        options, input_filenames = parser.parse_args()

        if len(input_filenames) < 1:
            parser.print_help()
            return 1

        # Prepare additional args for parser.
        parser_args = {}
        for option in cls.parser_options:
            parser_args[option.name()] = option.value(options)
        # Prepare additional args for transform method.
        transform_args = {}
        for option in cls.transform_options:
            transform_args[option.name()] = option.value(options)
        # Add flags to transform_args, if any.
        transform_flags = [flag.name() for flag in cls.transform_flags
                    if flag.value(options)]
        if transform_flags:
            transform_args['flags'] = transform_flags
        # Add cover support, if any.
        if cls.uses_cover:
            if options.image_cache:
                transform_args['cover'] = lambda x: WLCover(x, image_cache=options.image_cache)
            elif not cls.cover_optional or options.with_cover:
                transform_args['cover'] = WLCover


        # Do some real work
        try:
            for main_input in input_filenames:
                if options.verbose:
                    print main_input

                # Where to find input?
                if cls.uses_provider:
                    path, fname = os.path.realpath(main_input).rsplit('/', 1)
                    provider = DirDocProvider(path)
                else:
                    provider = None

                # Where to write output?
                if not (options.output_file or options.output_dir):
                    output_file = os.path.splitext(main_input)[0] + '.' + cls.ext
                else:
                    output_file = None

                # Do the transformation.
                doc = WLDocument.from_file(main_input, provider=provider, **parser_args)
                transform = cls.transform
                if transform is None:
                    transform = getattr(WLDocument, 'as_%s' % cls.ext)
                output = transform(doc, **transform_args)

                doc.save_output_file(output,
                    output_file, options.output_dir, options.make_dir, cls.ext)

        except ParseError, e:
            print '%(file)s:%(name)s:%(message)s' % {
                'file': main_input,
                'name': e.__class__.__name__,
                'message': e
            }
def test_annotations():
    annotations = (
        ("<pe/>", ("pe", [], "", "<p></p>"), "Empty footnote"),
        (
            "<pr>Definiendum --- definiens.</pr>",
            ("pr", [], "Definiendum \u2014 definiens.", "<p>Definiendum \u2014 definiens.</p>"),
            "Plain footnote.",
        ),
        (
            "<pt><slowo_obce>Definiendum</slowo_obce> --- definiens.</pt>",
            (
                "pt",
                [],
                "Definiendum \u2014 definiens.",
                '<p><em class="foreign-word">Definiendum</em> \u2014 definiens.</p>',
            ),
            "Standard footnote.",
        ),
        (
            "<pr>Definiendum (łac.) --- definiens.</pr>",
            ("pr", ["łac."], "Definiendum (łac.) \u2014 definiens.", "<p>Definiendum (łac.) \u2014 definiens.</p>"),
            "Plain footnote with qualifier",
        ),
        (
            "<pe><slowo_obce>Definiendum</slowo_obce> (łac.) --- definiens.</pe>",
            (
                "pe",
                ["łac."],
                "Definiendum (łac.) \u2014 definiens.",
                '<p><em class="foreign-word">Definiendum</em> (łac.) \u2014 definiens.</p>',
            ),
            "Standard footnote with qualifier.",
        ),
        (
            "<pt> <slowo_obce>Definiendum</slowo_obce> (daw.) --- definiens.</pt>",
            (
                "pt",
                ["daw."],
                "Definiendum (daw.) \u2014 definiens.",
                '<p> <em class="foreign-word">Definiendum</em> (daw.) \u2014 definiens.</p>',
            ),
            "Standard footnote with leading whitespace and qualifier.",
        ),
        (
            "<pr>Definiendum (łac.) --- <slowo_obce>definiens</slowo_obce>.</pr>",
            (
                "pr",
                ["łac."],
                "Definiendum (łac.) \u2014 definiens.",
                '<p>Definiendum (łac.) \u2014 <em class="foreign-word">definiens</em>.</p>',
            ),
            "Plain footnote with qualifier and some emphasis.",
        ),
        (
            "<pe><slowo_obce>Definiendum</slowo_obce> (łac.) --- <slowo_obce>definiens</slowo_obce>.</pe>",
            (
                "pe",
                ["łac."],
                "Definiendum (łac.) \u2014 definiens.",
                '<p><em class="foreign-word">Definiendum</em> (łac.) \u2014 <em class="foreign-word">definiens</em>.</p>',
            ),
            "Standard footnote with qualifier and some emphasis.",
        ),
        (
            "<pe>Definiendum (łac.) --- definiens (some) --- more text.</pe>",
            (
                "pe",
                ["łac."],
                "Definiendum (łac.) \u2014 definiens (some) \u2014 more text.",
                "<p>Definiendum (łac.) \u2014 definiens (some) \u2014 more text.</p>",
            ),
            "Footnote with a second parentheses and mdash.",
        ),
        (
            "<pe><slowo_obce>gemajna</slowo_obce> (daw., z niem. <slowo_obce>gemein</slowo_obce>: zwykły) --- częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.</pe>",
            (
                "pe",
                ["daw.", "niem."],
                "gemajna (daw., z niem. gemein: zwykły) \u2014 częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.",
                '<p><em class="foreign-word">gemajna</em> (daw., z niem. <em class="foreign-word">gemein</em>: zwykły) \u2014 częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.</p>',
            ),
            "Footnote with multiple and qualifiers and emphasis.",
        ),
    )

    xml_src = """<utwor><akap> %s </akap></utwor>""" % "".join(t[0] for t in annotations)
    html = WLDocument.from_string(xml_src, parse_dublincore=False).as_html().get_file()
    res_annotations = list(extract_annotations(html))

    for i, (src, expected, name) in enumerate(annotations):
        yield _test_annotation, expected, res_annotations[i], name
Example #28
    def run(cls):
        # Parse commandline arguments
        usage = """Usage: %%prog [options] SOURCE [SOURCE...]
        Convert SOURCE files to %s format.""" % cls.format_name

        parser = optparse.OptionParser(usage=usage)

        parser.add_option('-v',
                          '--verbose',
                          action='store_true',
                          dest='verbose',
                          default=False,
                          help='print status messages to stdout')
        parser.add_option(
            '-d',
            '--make-dir',
            action='store_true',
            dest='make_dir',
            default=False,
            help='create a directory for author and put the output file in it')
        parser.add_option('-o',
                          '--output-file',
                          dest='output_file',
                          metavar='FILE',
                          help='specifies the output file')
        parser.add_option('-O',
                          '--output-dir',
                          dest='output_dir',
                          metavar='DIR',
                          help='specifies the directory for output')
        if cls.uses_cover:
            if cls.cover_optional:
                parser.add_option('-c',
                                  '--with-cover',
                                  action='store_true',
                                  dest='with_cover',
                                  default=False,
                                  help='create default cover')
            parser.add_option(
                '-C',
                '--image-cache',
                dest='image_cache',
                metavar='URL',
                help='prefix for image download cache' +
                (' (implies --with-cover)' if cls.cover_optional else ''))
        for option in cls.parser_options + cls.transform_options + cls.transform_flags:
            option.add(parser)

        options, input_filenames = parser.parse_args()

        if len(input_filenames) < 1:
            parser.print_help()
            return 1

        # Prepare additional args for parser.
        parser_args = {}
        for option in cls.parser_options:
            parser_args[option.name()] = option.value(options)
        # Prepare additional args for transform method.
        transform_args = {}
        for option in cls.transform_options:
            transform_args[option.name()] = option.value(options)
        # Add flags to transform_args, if any.
        transform_flags = [
            flag.name() for flag in cls.transform_flags if flag.value(options)
        ]
        if transform_flags:
            transform_args['flags'] = transform_flags
        if options.verbose:
            transform_args['verbose'] = True
        # Add cover support, if any.
        if cls.uses_cover:
            if options.image_cache:

                def cover_class(*args, **kwargs):
                    return DefaultEbookCover(image_cache=options.image_cache,
                                             *args,
                                             **kwargs)

                transform_args['cover'] = cover_class
            elif not cls.cover_optional or options.with_cover:
                transform_args['cover'] = DefaultEbookCover

        # Do some real work
        try:
            for main_input in input_filenames:
                if options.verbose:
                    print main_input

                # Where to find input?
                if cls.uses_provider:
                    path, fname = os.path.realpath(main_input).rsplit('/', 1)
                    provider = DirDocProvider(path)
                else:
                    provider = None

                # Where to write output?
                if not (options.output_file or options.output_dir):
                    output_file = os.path.splitext(main_input)[0] + '.' + cls.ext
                else:
                    output_file = options.output_file

                # Do the transformation.
                doc = WLDocument.from_file(main_input,
                                           provider=provider,
                                           **parser_args)
                transform = cls.transform
                if transform is None:
                    transform = getattr(WLDocument, 'as_%s' % cls.ext)
                output = transform(doc, **transform_args)

                doc.save_output_file(output, output_file, options.output_dir,
                                     options.make_dir, cls.ext)

        except ParseError, e:
            print '%(file)s:%(name)s:%(message)s' % {
                'file': main_input,
                'name': e.__class__.__name__,
                'message': e
            }
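
The run() methods in Examples #26 and #28 expect their class to provide format_name, ext, the uses_provider/uses_cover/cover_optional switches, the option lists and an optional transform. A minimal hypothetical subclass wiring these together (the Book2Anything base name and module layout are assumptions, not taken from the source):

    # Hypothetical converter; Book2Anything stands for whatever base class
    # defines the run() classmethod shown above.
    class Book2Html(Book2Anything):
        format_name = 'HTML'
        ext = 'html'
        uses_provider = True
        uses_cover = False
        cover_optional = True
        transform = None            # run() then falls back to WLDocument.as_html
        parser_options = []
        transform_options = []
        transform_flags = []

    if __name__ == '__main__':
        Book2Html.run()
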
Example #29
def test_passing_parse_dublincore_to_transform():
    """Passing parse_dublincore=False to the constructor omits DublinCore parsing."""
    WLDocument.from_file(
        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
        parse_dublincore=False,
    ).as_text()
def test_annotations():
    annotations = (

        ('<pe/>', (
            'pe',
            None, 
            '',
            '<p></p>'
            ),
            'Empty footnote'),

        (
         '<pr>Definiendum --- definiens.</pr>', (
            'pr',
            None, 
            'Definiendum \u2014 definiens.', 
            '<p>Definiendum \u2014 definiens.</p>'
            ),
            'Plain footnote.'),

        ('<pt><slowo_obce>Definiendum</slowo_obce> --- definiens.</pt>', (
            'pt',
            None, 
            'Definiendum \u2014 definiens.', 
            '<p><em class="foreign-word">Definiendum</em> \u2014 definiens.</p>'
            ),
            'Standard footnote.'),

        ('<pr>Definiendum (łac.) --- definiens.</pr>', (
            'pr',
            'łac.', 
            'Definiendum (łac.) \u2014 definiens.', 
            '<p>Definiendum (łac.) \u2014 definiens.</p>'
            ),
            'Plain footnote with qualifier'),

        ('<pe><slowo_obce>Definiendum</slowo_obce> (łac.) --- definiens.</pe>', (
            'pe',
            'łac.', 
            'Definiendum (łac.) \u2014 definiens.', 
            '<p><em class="foreign-word">Definiendum</em> (łac.) \u2014 definiens.</p>'
            ),
            'Standard footnote with qualifier.'),

        ('<pt> <slowo_obce>Definiendum</slowo_obce> (daw.) --- definiens.</pt>', (
            'pt',
            'daw.', 
            'Definiendum (daw.) \u2014 definiens.', 
            '<p> <em class="foreign-word">Definiendum</em> (daw.) \u2014 definiens.</p>'
            ),
            'Standard footnote with leading whitespace and qualifier.'),

        ('<pr>Definiendum (łac.) --- <slowo_obce>definiens</slowo_obce>.</pr>', (
            'pr',
            'łac.', 
            'Definiendum (łac.) \u2014 definiens.', 
            '<p>Definiendum (łac.) \u2014 <em class="foreign-word">definiens</em>.</p>'
            ),
            'Plain footnote with qualifier and some emphasis.'),

        ('<pe><slowo_obce>Definiendum</slowo_obce> (łac.) --- <slowo_obce>definiens</slowo_obce>.</pe>', (
            'pe',
            'łac.', 
            'Definiendum (łac.) \u2014 definiens.', 
            '<p><em class="foreign-word">Definiendum</em> (łac.) \u2014 <em class="foreign-word">definiens</em>.</p>'
            ),
            'Standard footnote with qualifier and some emphasis.'),

        ('<pe>Definiendum (łac.) --- definiens (some) --- more text.</pe>', (
            'pe',
            'łac.',
            'Definiendum (łac.) \u2014 definiens (some) \u2014 more text.',
            '<p>Definiendum (łac.) \u2014 definiens (some) \u2014 more text.</p>',
            ),
            'Footnote with a second parentheses and mdash.'),

    )

    xml_src = '''<utwor><akap> %s </akap></utwor>''' % "".join(
        t[0] for t in annotations)
    html = WLDocument.from_string(xml_src, parse_dublincore=False).as_html().get_file()
    res_annotations = list(extract_annotations(html))

    for i, (src, expected, name) in enumerate(annotations):
        yield _test_annotation, expected, res_annotations[i], name