def test_transform():
    """PDF transform writes the editors into the saved TeX preamble."""
    # delete=False + close(): as_pdf needs to write to the path itself.
    temp = NamedTemporaryFile(delete=False)
    temp.close()
    WLDocument.from_file(
        get_fixture('text', 'asnyk_zbior.xml'),
        provider=DirDocProvider(get_fixture('text', ''))
    ).as_pdf(save_tex=temp.name)
    # Context manager instead of a leaked file handle; binary read +
    # explicit decode keeps the Python 2 semantics.  The stray debug
    # `print tex` has been removed.
    with open(temp.name, 'rb') as tex_file:
        tex = tex_file.read().decode('utf-8')

    # Check contributor list.
    editors = re.search(
        r'\\def\\editors\{Opracowanie redakcyjne i przypisy: ([^}]*?)\.\s*\}',
        tex)
    assert_equal(editors.group(1),
                 u"Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska")
def test_transform():
    """as_pdf with save_tex records the contributor list in the TeX."""
    temp = NamedTemporaryFile(delete=False)
    temp.close()
    WLDocument.from_file(
        get_fixture('text', 'asnyk_zbior.xml'),
        provider=DirDocProvider(get_fixture('text', ''))
    ).as_pdf(save_tex=temp.name)
    # Close the TeX file deterministically instead of leaking the handle.
    with open(temp.name, 'rb') as tex_file:
        tex = tex_file.read().decode('utf-8')

    # Check contributor list.
    editors = re.search(
        r'\\def\\editors\{Opracowanie redakcyjne i przypisy: ([^}]*?)\.\s*\}',
        tex)
    assert_equal(editors.group(1),
                 u"Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska")
def wldocument(self, parse_dublincore=True):
    """Parse this object's XML file into a librarian WLDocument."""
    from catalogue.import_utils import ORMDocProvider
    from librarian.parser import WLDocument

    provider = ORMDocProvider(self)
    return WLDocument.from_file(
        self.xml_file.path,
        provider=provider,
        parse_dublincore=parse_dublincore,
    )
def test_transform():
    """HTML output matches the stored expected fixture."""
    expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')

    html = WLDocument.from_file(
        get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
    ).as_html().get_string()

    # open() in a context manager instead of the deprecated file()
    # builtin, which also leaked the file handle.
    with open(expected_output_file_path) as f:
        assert_equal(html, f.read())
def test_transform():
    """HTML bytes match the stored expected fixture byte for byte."""
    expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')

    html = WLDocument.from_file(
        get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
    ).as_html().get_bytes()

    # Close the fixture file deterministically instead of leaking it.
    with open(expected_output_file_path, 'rb') as f:
        assert_equal(html, f.read())
def test_transform():
    """Plain-text output matches the stored expected fixture."""
    expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.txt')

    text = WLDocument.from_file(
        get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
    ).as_text().get_string()

    # open() in a context manager instead of the deprecated file()
    # builtin, which also leaked the file handle.
    with open(expected_output_file_path) as f:
        assert_equal(text, f.read())
def test_transform_raw():
    """Plain-text output with the raw-text flag matches its fixture."""
    expected_output_file_path = get_fixture(
        'text', 'asnyk_miedzy_nami_expected_raw.txt')

    text = WLDocument.from_file(
        get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
    ).as_text(flags=['raw-text']).get_bytes()

    # Close the fixture file deterministically instead of leaking it.
    with open(expected_output_file_path, 'rb') as f:
        assert_equal(text, f.read())
def wldocument(self, parse_dublincore=True, inherit=True):
    """Build a librarian WLDocument for this book.

    When ``inherit`` is true and the book has a parent, the parent's
    cover info is supplied as metadata fallbacks.
    """
    from catalogue.import_utils import ORMDocProvider
    from librarian.parser import WLDocument

    meta_fallbacks = None
    if inherit and self.parent:
        meta_fallbacks = self.parent.cover_info()

    return WLDocument.from_file(
        self.xml_file.path,
        provider=ORMDocProvider(self),
        parse_dublincore=parse_dublincore,
        meta_fallbacks=meta_fallbacks,
    )
def prepare_file(cls, main_input, output_dir, verbose=False, overwrite=False):
    """Convert a single input file to this class's output format.

    Skips the conversion when the output file already exists, unless
    ``overwrite`` is set.
    """
    # os.path.split is separator-agnostic, unlike rsplit('/', 1), so
    # this also works for Windows-style paths.
    path, fname = os.path.split(os.path.realpath(main_input))
    provider = DirDocProvider(path)
    slug, ext = os.path.splitext(fname)

    if output_dir != '':
        makedirs(output_dir)
    outfile = os.path.join(output_dir, slug + '.' + cls.ext)
    if os.path.exists(outfile) and not overwrite:
        # Output is already there; nothing to do.
        return

    doc = WLDocument.from_file(main_input, provider=provider)
    output_file = cls.transform(doc, cover=cls.cover, flags=cls.flags)
    doc.save_output_file(output_file, output_path=outfile)
def prepare_file(cls, main_input, output_dir, verbose=False):
    """Convert a single input file to this class's output format
    using ``cls.converter``."""
    path, fname = os.path.realpath(main_input).rsplit('/', 1)
    provider = DirDocProvider(path)
    slug, ext = os.path.splitext(fname)

    if output_dir != '':
        # Only tolerate "directory already exists"; the previous bare
        # `except: pass` silently swallowed every error here, including
        # permission problems and KeyboardInterrupt.
        try:
            os.makedirs(output_dir)
        except OSError:
            if not os.path.isdir(output_dir):
                raise
    outfile = os.path.join(output_dir, slug + '.' + cls.ext)

    doc = WLDocument.from_file(main_input, provider=provider)
    output_file = cls.converter.transform(doc, cover=cls.cover, flags=cls.flags)
    doc.save_output_file(output_file, output_path=outfile)
def test_transform():
    """EPUB output credits the editors on the last page."""
    epub = WLDocument.from_file(
        get_fixture('text', 'asnyk_zbior.xml'),
        provider=DirDocProvider(get_fixture('text', ''))
    ).as_epub(flags=['without_fonts']).get_file()
    zipf = ZipFile(epub)

    # Check contributor list.
    last = zipf.open('OPS/last.html')
    tree = html.parse(last)
    editors_attribution = False
    for par in tree.findall("//p"):
        # Guard: <p> elements without direct text have par.text == None,
        # which used to raise AttributeError on .startswith.
        if par.text and par.text.startswith(
                u'Opracowanie redakcyjne i przypisy:'):
            editors_attribution = True
            assert_equal(
                par.text.rstrip(),
                u'Opracowanie redakcyjne i przypisy: '
                u'Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska.')
    assert_true(editors_attribution)
def test_transform():
    """EPUB output credits the editors on the last page."""
    epub = WLDocument.from_file(
        get_fixture('text', 'asnyk_zbior.xml'),
        provider=DirDocProvider(get_fixture('text', ''))
    ).as_epub(flags=['without_fonts']).get_file()
    zipf = ZipFile(epub)

    # Check contributor list.
    last = zipf.open('OPS/last.html')
    tree = html.parse(last)
    editors_attribution = False
    for par in tree.findall("//p"):
        # Guard: <p> elements without direct text have par.text == None,
        # which used to raise AttributeError on .startswith.
        if par.text and par.text.startswith(
                u'Opracowanie redakcyjne i przypisy:'):
            editors_attribution = True
            assert_equal(
                par.text.rstrip(),
                u'Opracowanie redakcyjne i przypisy: '
                u'Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska.')
    assert_true(editors_attribution)
def test_transform():
    """Smoke test: MOBI conversion runs with a stubbed converter binary."""
    document = WLDocument.from_file(
        get_fixture('text', 'asnyk_zbior.xml'),
        provider=DirDocProvider(get_fixture('text', '')),
    )
    # 'true' is a no-op executable standing in for the real converter.
    mobi = document.as_mobi(converter_path='true').get_file()
def test_transform_hyphenate():
    """Smoke test: EPUB conversion succeeds with hyphenation enabled."""
    document = WLDocument.from_file(
        get_fixture('text', 'asnyk_zbior.xml'),
        provider=DirDocProvider(get_fixture('text', '')),
    )
    epub = document.as_epub(
        flags=['without_fonts'], hyphenate=True).get_file()
def test_passing_parse_dublincore_to_transform():
    """Passing parse_dublincore=False to transform omits DublinCore parsing."""
    document = WLDocument.from_file(
        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
        parse_dublincore=False,
    )
    document.as_html()
def test_no_dublincore():
    """A document without DublinCore metadata still renders to HTML."""
    document = WLDocument.from_file(
        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'))
    document.as_html()
def prepare(cls, input_filenames, output_dir='', verbose=False): from lxml import etree from librarian import DirDocProvider, ParseError from librarian.parser import WLDocument from copy import deepcopy import os.path xml = etree.fromstring("""<?xml version="1.0" encoding="utf-8"?> <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"></products>""") product = etree.fromstring("""<product> <publisherProductId></publisherProductId> <title></title> <info></info> <description></description> <authors> <author> <names>Jan</names> <lastName>Kowalski</lastName> </author> </authors> <price>0.0</price> <language>PL</language> </product>""") try: for main_input in input_filenames: if verbose: print main_input path, fname = os.path.realpath(main_input).rsplit('/', 1) provider = DirDocProvider(path) slug, ext = os.path.splitext(fname) outfile_dir = os.path.join(output_dir, slug) makedirs(os.path.join(output_dir, slug)) doc = WLDocument.from_file(main_input, provider=provider) info = doc.book_info product_elem = deepcopy(product) product_elem[0].text = cls.utf_trunc(slug, 100) product_elem[1].text = cls.utf_trunc(info.title, 255) product_elem[2].text = cls.utf_trunc(info.description, 255) product_elem[3].text = cls.utf_trunc(info.source_name, 3000) product_elem[4][0][0].text = cls.utf_trunc(u' '.join(info.author.first_names), 100) product_elem[4][0][1].text = cls.utf_trunc(info.author.last_name, 100) xml.append(product_elem) cover.VirtualoCover(info).save(os.path.join(outfile_dir, slug+'.jpg')) outfile = os.path.join(outfile_dir, '1.epub') outfile_sample = os.path.join(outfile_dir, '1.sample.epub') doc.save_output_file(doc.as_epub(), output_path=outfile) doc.save_output_file(doc.as_epub(doc, sample=25), output_path=outfile_sample) outfile = os.path.join(outfile_dir, '1.mobi') outfile_sample = os.path.join(outfile_dir, '1.sample.mobi') doc.save_output_file(doc.as_mobi(cover=cover.VirtualoCover), output_path=outfile) doc.save_output_file( doc.as_mobi(doc, 
cover=cover.VirtualoCover, sample=25), output_path=outfile_sample) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': main_input, 'name': e.__class__.__name__, 'message': e.message }
def index_content(self, book, book_fields={}):
    """
    Walks the book XML and extract content from it.
    Adds parts for each header tag and for each fragment.

    NOTE(review): ``book_fields={}`` is a mutable default argument;
    it is only read here, never mutated, so it happens to be safe.
    """
    wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
    root = wld.edoc.getroot()
    master = self.get_master(root)
    if master is None:
        # No master element => nothing indexable in this document.
        return []

    def walker(node, ignore_tags=[]):
        # Depth-first event stream over `node`, yielding triples:
        #   (node, None, None)  - element start (skipped for ignored tags)
        #   (None, text, None)  - a text or tail chunk
        #   (None, None, node)  - element end
        if node.tag not in ignore_tags:
            yield node, None, None
        if node.text is not None:
            yield None, node.text, None
        for child in list(node):
            for b, t, e in walker(child):
                yield b, t, e
        yield None, None, node
        if node.tail is not None:
            yield None, node.tail, None
        return

    def fix_format(text):
        # separator = [u" ", u"\t", u".", u";", u","]
        if isinstance(text, list):
            # need to join it first
            # NOTE(review): this filters the enclosing `content` list
            # captured from the loop below, not the `text` argument —
            # presumably intentional, but verify against callers.
            text = filter(lambda s: s is not None, content)
            text = u' '.join(text)
            # for i in range(len(text)):
            #     if i > 0:
            #         if text[i][0] not in separator\
            #             and text[i - 1][-1] not in separator:
            #             text.insert(i, u" ")
        # Strip verse-continuation slashes at end of lines.
        return re.sub("(?m)/$", "", text)

    def add_part(snippets, **fields):
        # Build one index document for a part (header/fragment/footnote)
        # of the current book, wiring in its snippet position.
        doc = self.create_book_doc(book)
        for n, v in book_fields.items():
            doc[n] = v

        doc['header_index'] = fields["header_index"]
        doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
        doc['header_type'] = fields['header_type']

        doc['text'] = fields['text']

        # snippets
        snip_pos = snippets.add(fields["text"])

        doc['snippets_position'] = snip_pos[0]
        doc['snippets_length'] = snip_pos[1]
        if snippets.revision:
            doc["snippets_revision"] = snippets.revision

        if 'fragment_anchor' in fields:
            doc["fragment_anchor"] = fields['fragment_anchor']

        if 'themes' in fields:
            doc['themes'] = fields['themes']
        # Unique id combining header position, span and anchor.
        doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                     doc['header_span'],
                                     doc.get('fragment_anchor', ''))
        return doc

    def give_me_utf8(s):
        # NOTE(review): defined but not used anywhere in this method.
        if isinstance(s, unicode):
            return s.encode('utf-8')
        else:
            return s

    fragments = {}
    snippets = Snippets(book.id).open('w')
    try:
        # Pair each top-level header with its index (py2 enumerate).
        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            # section content
            content = []
            footnote = []

            def all_content(text):
                # Default text sink: feed every open fragment and the
                # section-level `content` buffer.
                for frag in fragments.values():
                    frag['text'].append(text)
                content.append(text)
            handle_text = [all_content]

            for start, text, end in walker(
                    header, ignore_tags=self.ignore_content_tags):
                # handle footnotes
                if start is not None and start.tag in self.footnote_tags:
                    footnote = []

                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                # NOTE(review): `footnote is not []` is an identity
                # comparison with a fresh list and is therefore always
                # True; the branch is effectively gated on the tag
                # check alone (which keeps the handle_text stack
                # balanced, so do not "fix" it to a truthiness test).
                elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                    handle_text.pop()
                    doc = add_part(snippets, header_index=position,
                                   header_type=header.tag,
                                   text=u''.join(footnote),
                                   is_footnote=True)
                    self.index.add(doc)
                    footnote = []

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {
                        'text': [], 'themes': [],
                        'start_section': position,
                        'start_header': header.tag
                    }

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    # Suppress text collection inside <motyw>.
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(
                            unicode.strip,
                            map(unicode, (start.text.split(','))))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.
                    del fragments[fid]

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   fragment_anchor=fid,
                                   text=fix_format(frag['text']),
                                   themes=frag['themes'])
                    self.index.add(doc)

                # Collect content.
                # NOTE(review): `handle_text is not []` is likewise
                # always True; handle_text always holds at least
                # all_content here.
                if text is not None and handle_text is not []:
                    hdl = handle_text[-1]
                    if hdl is not None:
                        hdl(text)

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position,
                           header_type=header.tag,
                           text=fix_format(content))

            self.index.add(doc)
    finally:
        snippets.close()
def run(cls):
    """Command-line entry point: parse options, then convert each
    SOURCE file to this class's output format."""
    # Parse commandline arguments
    usage = """Usage: %%prog [options] SOURCE [SOURCE...]
Convert SOURCE files to %s format.""" % cls.format_name

    parser = optparse.OptionParser(usage=usage)

    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='print status messages to stdout')
    parser.add_option(
        '-d', '--make-dir', action='store_true', dest='make_dir',
        default=False,
        help='create a directory for author and put the output file in it')
    parser.add_option('-o', '--output-file', dest='output_file',
                      metavar='FILE', help='specifies the output file')
    parser.add_option('-O', '--output-dir', dest='output_dir',
                      metavar='DIR',
                      help='specifies the directory for output')
    # Cover options only exist for formats that support covers.
    if cls.uses_cover:
        if cls.cover_optional:
            parser.add_option('-c', '--with-cover', action='store_true',
                              dest='with_cover', default=False,
                              help='create default cover')
        parser.add_option(
            '-C', '--image-cache', dest='image_cache', metavar='URL',
            help='prefix for image download cache' +
            (' (implies --with-cover)' if cls.cover_optional else ''))
    # Let subclasses contribute their own options and flags.
    for option in cls.parser_options + cls.transform_options + cls.transform_flags:
        option.add(parser)

    options, input_filenames = parser.parse_args()

    if len(input_filenames) < 1:
        parser.print_help()
        return 1

    # Prepare additional args for parser.
    parser_args = {}
    for option in cls.parser_options:
        parser_args[option.name()] = option.value(options)
    # Prepare additional args for transform method.
    transform_args = {}
    for option in cls.transform_options:
        transform_args[option.name()] = option.value(options)
    # Add flags to transform_args, if any.
    transform_flags = [
        flag.name() for flag in cls.transform_flags if flag.value(options)
    ]
    if transform_flags:
        transform_args['flags'] = transform_flags
    if options.verbose:
        transform_args['verbose'] = True
    # Add cover support, if any.
    if cls.uses_cover:
        if options.image_cache:
            # Factory closure binding the cache prefix into the cover.
            def cover_class(*args, **kwargs):
                return DefaultEbookCover(
                    image_cache=options.image_cache, *args, **kwargs)
            transform_args['cover'] = cover_class
        elif not cls.cover_optional or options.with_cover:
            transform_args['cover'] = DefaultEbookCover

    # Do some real work
    try:
        for main_input in input_filenames:
            if options.verbose:
                print main_input
            # Where to find input?
            if cls.uses_provider:
                path, fname = os.path.realpath(main_input).rsplit('/', 1)
                provider = DirDocProvider(path)
            else:
                provider = None
            # Where to write output?
            if not (options.output_file or options.output_dir):
                output_file = os.path.splitext(main_input)[0] + '.' + cls.ext
            else:
                output_file = options.output_file
            # Do the transformation.
            doc = WLDocument.from_file(main_input, provider=provider,
                                       **parser_args)
            transform = cls.transform
            if transform is None:
                # Default transform: WLDocument.as_<ext>.
                transform = getattr(WLDocument, 'as_%s' % cls.ext)
            output = transform(doc, **transform_args)
            doc.save_output_file(output, output_file, options.output_dir,
                                 options.make_dir, cls.ext)
    except ParseError, e:
        print '%(file)s:%(name)s:%(message)s' % {
            'file': main_input,
            'name': e.__class__.__name__,
            'message': e
        }
def test_no_dublincore():
    """A document without DublinCore metadata still renders to text."""
    document = WLDocument.from_file(
        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'))
    document.as_text()
def index_content(self, book, book_fields={}):
    """
    Walks the book XML and extract content from it.
    Adds parts for each header tag and for each fragment.

    NOTE(review): ``book_fields={}`` is a mutable default argument;
    it is only read here, never mutated, so it happens to be safe.
    """
    wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
    root = wld.edoc.getroot()
    master = self.get_master(root)
    if master is None:
        # No master element => nothing indexable in this document.
        return []

    def walker(node, ignore_tags=[]):
        # Depth-first event stream over `node`, yielding triples:
        #   (node, None, None)  - element start (skipped for ignored tags)
        #   (None, text, None)  - a text or tail chunk
        #   (None, None, node)  - element end
        if node.tag not in ignore_tags:
            yield node, None, None
        if node.text is not None:
            yield None, node.text, None
        for child in list(node):
            for b, t, e in walker(child):
                yield b, t, e
        yield None, None, node
        if node.tail is not None:
            yield None, node.tail, None
        return

    def fix_format(text):
        # separator = [u" ", u"\t", u".", u";", u","]
        if isinstance(text, list):
            # need to join it first
            # NOTE(review): this filters the enclosing `content` list
            # captured from the loop below, not the `text` argument —
            # presumably intentional, but verify against callers.
            text = filter(lambda s: s is not None, content)
            text = u' '.join(text)
            # for i in range(len(text)):
            #     if i > 0:
            #         if text[i][0] not in separator\
            #             and text[i - 1][-1] not in separator:
            #             text.insert(i, u" ")
        # Strip verse-continuation slashes at end of lines.
        return re.sub("(?m)/$", "", text)

    def add_part(snippets, **fields):
        # Build one index document for a part (header/fragment/footnote)
        # of the current book, wiring in its snippet position.
        doc = self.create_book_doc(book)
        for n, v in book_fields.items():
            doc[n] = v

        doc['header_index'] = fields["header_index"]
        doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
        doc['header_type'] = fields['header_type']

        doc['text'] = fields['text']

        # snippets
        snip_pos = snippets.add(fields["text"])

        doc['snippets_position'] = snip_pos[0]
        doc['snippets_length'] = snip_pos[1]
        if snippets.revision:
            doc["snippets_revision"] = snippets.revision

        if 'fragment_anchor' in fields:
            doc["fragment_anchor"] = fields['fragment_anchor']

        if 'themes' in fields:
            doc['themes'] = fields['themes']
        # Unique id combining header position, span and anchor.
        doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                     doc['header_span'],
                                     doc.get('fragment_anchor', ''))
        return doc

    def give_me_utf8(s):
        # NOTE(review): defined but not used anywhere in this method.
        if isinstance(s, unicode):
            return s.encode('utf-8')
        else:
            return s

    fragments = {}
    snippets = Snippets(book.id).open('w')
    try:
        # Pair each top-level header with its index (py2 enumerate).
        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            # section content
            content = []
            footnote = []

            def all_content(text):
                # Default text sink: feed every open fragment and the
                # section-level `content` buffer.
                for frag in fragments.values():
                    frag['text'].append(text)
                content.append(text)
            handle_text = [all_content]

            for start, text, end in walker(header,
                                           ignore_tags=self.ignore_content_tags):
                # handle footnotes
                if start is not None and start.tag in self.footnote_tags:
                    footnote = []

                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                # NOTE(review): `footnote is not []` is an identity
                # comparison with a fresh list and is therefore always
                # True; the branch is effectively gated on the tag
                # check alone (which keeps the handle_text stack
                # balanced, so do not "fix" it to a truthiness test).
                elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                    handle_text.pop()
                    doc = add_part(snippets, header_index=position,
                                   header_type=header.tag,
                                   text=u''.join(footnote),
                                   is_footnote=True)
                    self.index.add(doc)
                    footnote = []

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {'text': [], 'themes': [],
                                      'start_section': position,
                                      'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    # Suppress text collection inside <motyw>.
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(
                            unicode.strip,
                            map(unicode, (start.text.split(','))))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.
                    del fragments[fid]

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   fragment_anchor=fid,
                                   text=fix_format(frag['text']),
                                   themes=frag['themes'])
                    self.index.add(doc)

                # Collect content.
                # NOTE(review): `handle_text is not []` is likewise
                # always True; handle_text always holds at least
                # all_content here.
                if text is not None and handle_text is not []:
                    hdl = handle_text[-1]
                    if hdl is not None:
                        hdl(text)

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position,
                           header_type=header.tag,
                           text=fix_format(content))

            self.index.add(doc)
    finally:
        snippets.close()
def run(cls):
    """Command-line entry point: parse options, then convert each
    SOURCE file to this class's output format."""
    # Parse commandline arguments
    usage = """Usage: %%prog [options] SOURCE [SOURCE...]
Convert SOURCE files to %s format.""" % cls.format_name

    parser = optparse.OptionParser(usage=usage)

    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='print status messages to stdout')
    parser.add_option('-d', '--make-dir', action='store_true',
                      dest='make_dir', default=False,
                      help='create a directory for author and put the output file in it')
    parser.add_option('-o', '--output-file', dest='output_file',
                      metavar='FILE', help='specifies the output file')
    parser.add_option('-O', '--output-dir', dest='output_dir',
                      metavar='DIR',
                      help='specifies the directory for output')
    # Cover options only exist for formats that support covers.
    if cls.uses_cover:
        if cls.cover_optional:
            parser.add_option('-c', '--with-cover', action='store_true',
                              dest='with_cover', default=False,
                              help='create default cover')
        parser.add_option('-C', '--image-cache', dest='image_cache',
                          metavar='URL',
                          help='prefix for image download cache' +
                          (' (implies --with-cover)' if cls.cover_optional else ''))
    # Let subclasses contribute their own options and flags.
    for option in cls.parser_options + cls.transform_options + cls.transform_flags:
        option.add(parser)

    options, input_filenames = parser.parse_args()

    if len(input_filenames) < 1:
        parser.print_help()
        return(1)

    # Prepare additional args for parser.
    parser_args = {}
    for option in cls.parser_options:
        parser_args[option.name()] = option.value(options)
    # Prepare additional args for transform method.
    transform_args = {}
    for option in cls.transform_options:
        transform_args[option.name()] = option.value(options)
    # Add flags to transform_args, if any.
    transform_flags = [flag.name() for flag in cls.transform_flags
                       if flag.value(options)]
    if transform_flags:
        transform_args['flags'] = transform_flags
    # Add cover support, if any.
    if cls.uses_cover:
        if options.image_cache:
            # Lambda binds the cache prefix into the WLCover factory.
            transform_args['cover'] = lambda x: WLCover(x, image_cache = options.image_cache)
        elif not cls.cover_optional or options.with_cover:
            transform_args['cover'] = WLCover

    # Do some real work
    try:
        for main_input in input_filenames:
            if options.verbose:
                print main_input
            # Where to find input?
            if cls.uses_provider:
                path, fname = os.path.realpath(main_input).rsplit('/', 1)
                provider = DirDocProvider(path)
            else:
                provider = None
            # Where to write output?
            if not (options.output_file or options.output_dir):
                output_file = os.path.splitext(main_input)[0] + '.' + cls.ext
            else:
                # NOTE(review): explicit options take over; save_output_file
                # receives output_file=None plus the dir/make_dir options.
                output_file = None
            # Do the transformation.
            doc = WLDocument.from_file(main_input, provider=provider,
                                       **parser_args)
            transform = cls.transform
            if transform is None:
                # Default transform: WLDocument.as_<ext>.
                transform = getattr(WLDocument, 'as_%s' % cls.ext)
            output = transform(doc, **transform_args)
            doc.save_output_file(output, output_file, options.output_dir,
                                 options.make_dir, cls.ext)
    except ParseError, e:
        print '%(file)s:%(name)s:%(message)s' % {
            'file': main_input,
            'name': e.__class__.__name__,
            'message': e
        }
def prepare(cls, input_filenames, output_dir="", verbose=False): from lxml import etree from librarian import DirDocProvider, ParseError from librarian.parser import WLDocument from copy import deepcopy import os import os.path xml = etree.fromstring( """<?xml version="1.0" encoding="utf-8"?> <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"></products>""" ) product = etree.fromstring( """<product> <publisherProductId></publisherProductId> <title></title> <info></info> <description></description> <authors> <author> <names>Jan</names> <lastName>Kowalski</lastName> </author> </authors> <price>0.0</price> <language>PL</language> </product>""" ) try: for main_input in input_filenames: if verbose: print main_input path, fname = os.path.realpath(main_input).rsplit("/", 1) provider = DirDocProvider(path) slug, ext = os.path.splitext(fname) outfile_dir = os.path.join(output_dir, slug) os.makedirs(os.path.join(output_dir, slug)) doc = WLDocument.from_file(main_input, provider=provider) info = doc.book_info product_elem = deepcopy(product) product_elem[0].text = cls.utf_trunc(slug, 100) product_elem[1].text = cls.utf_trunc(info.title, 255) product_elem[2].text = cls.utf_trunc(info.description, 255) product_elem[3].text = cls.utf_trunc(info.source_name, 3000) product_elem[4][0][0].text = cls.utf_trunc(u" ".join(info.author.first_names), 100) product_elem[4][0][1].text = cls.utf_trunc(info.author.last_name, 100) xml.append(product_elem) cover.VirtualoCover(info).save(os.path.join(outfile_dir, slug + ".jpg")) outfile = os.path.join(outfile_dir, "1.epub") outfile_sample = os.path.join(outfile_dir, "1.sample.epub") doc.save_output_file(doc.as_epub(), output_path=outfile) doc.save_output_file(doc.as_epub(doc, sample=25), output_path=outfile_sample) outfile = os.path.join(outfile_dir, "1.mobi") outfile_sample = os.path.join(outfile_dir, "1.sample.mobi") doc.save_output_file(doc.as_mobi(cover=cover.VirtualoCover), output_path=outfile) 
doc.save_output_file(doc.as_mobi(doc, cover=cover.VirtualoCover, sample=25), output_path=outfile_sample) except ParseError, e: print "%(file)s:%(name)s:%(message)s" % { "file": main_input, "name": e.__class__.__name__, "message": e.message, }
def test_passing_parse_dublincore_to_transform():
    """Passing parse_dublincore=False to the constructor omits DublinCore parsing."""
    document = WLDocument.from_file(
        get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
        parse_dublincore=False,
    )
    document.as_text()
def test_transform():
    """Smoke test: EPUB conversion of a multi-part book succeeds."""
    document = WLDocument.from_file(
        get_fixture('text', 'asnyk_zbior.xml'),
        provider=DirDocProvider(get_fixture('text', '')),
    )
    document.as_epub(flags=['without_fonts'])