Esempio n. 1
0
    def parse_book(self, base_dir):
        """Parse a whole book stored under ``base_dir`` into a single tree.

        The index is saved as ``parsed_index.tsv`` inside ``base_dir``,
        checked with ``self._check_index`` and then walked entry by entry.
        Each entry is a ``(num, title, ident)`` triple:

        * ``len(num)`` is the nesting depth of the section,
        * ``title`` is the heading text, ``'~'`` standing for "no title",
        * ``ident`` is the number of the file ``part/<ident on 4 digits>``
          which holds the text of the section.

        Returns the root ``book`` leaf, with every section attached to it.
        """
        # assumes parse_index returns the iterable of (num, title, ident)
        # entries consumed below -- TODO confirm against parse_index
        index = self.parse_index(base_dir)

        (base_dir / "parsed_index.tsv").save(index)

        self._check_index(index)

        o_book = oaktree.Leaf('book')
        prev_node = o_book

        for num, title, ident in index:
            depth = len(num)

            txt = (base_dir / "part" / f'{ident:04d}').read_text()
            mcp = self.expand_shortcut(txt)
            o_section = self.parse_section(mcp)
            o_title = oaktree.Leaf('title').add_text(
                '' if title == '~' else title)

            # re-attach the children so that the title comes first
            sub_lst = o_section.sub
            o_section.sub = list()
            o_section.attach(o_title, *sub_lst)

            # presumably ancestor_lst[depth - 1] is the node one level above
            # the new section, found from the previously attached section
            curr_node = prev_node.ancestor_lst[depth - 1]
            curr_node.attach(o_section)

            prev_node = o_section

        return o_book
Esempio n. 2
0
    def parse_alinea(self, txt, tag='alinea'):
        """Parse one line of text into a leaf holding text and inline atoms.

        txt: the line to parse, it should not contain any line feed
        tag: tag of the created leaf; something else than 'alinea' can be
            given, most of the time it will be 'li'

        When ``alinea_ident_rec`` finds an ident in the line, it is stored as
        an integer on the leaf and the line is cut where the ident starts.
        The rest is split around the atom markers found by ``atom_line_rec``:
        non-blank text chunks are added as text, each marker is replaced by
        the node built by ``self.parse_atom``.
        """
        o_alinea = oaktree.Leaf(tag)

        ident_res = alinea_ident_rec.search(txt)
        if ident_res is not None:
            # the ident is parsed, then removed from the text
            o_alinea.ident = int(ident_res.group('ident'))
            txt = txt[:ident_res.start()]
        txt = txt.rstrip()

        # the first level atoms have already been parsed, only their markers
        # remain in the text
        cursor = 0
        for atom_res in atom_line_rec.finditer(txt):
            before = txt[cursor:atom_res.start()]
            if before.strip():
                o_alinea.add_text(before)
            o_alinea.attach(self.parse_atom(atom_res, False))
            cursor = atom_res.end()

        # whatever follows the last atom marker
        tail = txt[cursor:]
        if tail.strip():
            o_alinea.add_text(tail)

        return o_alinea
Esempio n. 3
0
    def _get_section(self, *pos, **nam):
        """Return one section of a book, as produced by ``self.proxy``.

        Named parameters:
            b: key of the book, it must belong to ``self.shelf.book_set``
            s: number of the section, anything accepted by ``int()``

        Raises:
            cherrypy.HTTPError(400): ``b`` or ``s`` is missing, or ``s`` can
                not be converted to an integer.
            cherrypy.HTTPError(404): the book is not on the shelf.
        """
        if 'b' not in nam or 's' not in nam:
            raise cherrypy.HTTPError(400)

        if nam['b'] not in self.shelf.book_set:
            raise cherrypy.HTTPError(404)

        # a malformed section number is a client error, not a server failure
        try:
            section_n = int(nam['s'])
        except (TypeError, ValueError):
            raise cherrypy.HTTPError(400) from None

        return self.proxy._get_section(nam['b'], section_n)
Esempio n. 4
0
    def _prep_section(self, book_key, ident):
        """Render one section of a book as an HTML5 fragment and cache it.

        book_key: name of the book directory inside ``self.repo_dir``
        ident: number of the section, read from ``part/<ident on 4 digits>``

        The parsed tree is dumped to ``.tmp/<ident>.parsed.bkt`` and the
        composed fragment is written to ``.cache/part/<ident>``, both inside
        the book directory.

        Returns the result of saving the fragment without a target path
        (presumably the HTML text -- verify against Html5Proxy.save).
        """
        base_dir = self.repo_dir / book_key
        part_name = f'{ident:04d}'

        parser = marccup.parser.generic.GenericParser()
        composer = marccup.composer.html5.Html5Composer()

        o_parsed = parser.parse((base_dir / 'part' / part_name).read_text())

        # keep a readable dump of the parsed tree for debugging
        oaktree.proxy.braket.BraketProxy().save(
            o_parsed, Path(base_dir / ".tmp" / f'{part_name}.parsed.bkt'))

        # the composer fills a throw-away container, its first child is the
        # fragment which is saved and returned
        o_container = oaktree.Leaf('tmp')
        composer.compose(o_parsed, o_container)
        o_fragment = o_container.sub[0]

        html_proxy = oaktree.proxy.html5.Html5Proxy(indent='', fragment=True)
        html_proxy.save(o_fragment,
                        Path(base_dir / ".cache" / "part" / part_name))

        return html_proxy.save(o_fragment)
Esempio n. 5
0
    def parse_paragraph(self, paragraph_txt):
        """Parse one paragraph and return the matching node.

        paragraph_txt consists in, either:

            * a single atom, with possibly an ident: the whole text matches
              ``atom_block_rec`` and the node comes from ``self.parse_atom``
            * a bullet list: every line matches ``bullet_list_rec`` and the
              node comes from ``self.parse_list``
            * some alineas: a ``paragraph`` leaf is returned, with one child
              per line built by ``self.parse_alinea``

        In the last case, when the last line matches ``paragraph_ident_rec``
        it is not parsed as an alinea: its ident is stored, as an integer, on
        the paragraph itself.

        Note that a text without any line is handled by the bullet list
        branch, since no line fails the bullet check.
        """
        atom_block_res = atom_block_rec.match(paragraph_txt)
        if atom_block_res is not None:
            # the paragraph consists in a sole atom, with possibly an ident
            return self.parse_atom(atom_block_res, True)

        alinea_lst = paragraph_txt.splitlines()

        # only bullets or numbered items ? then it is a list
        if all(bullet_list_rec.match(alinea_txt) for alinea_txt in alinea_lst):
            return self.parse_list(paragraph_txt)

        # at least one normal alinea inside: this is a plain paragraph
        o_block = oaktree.Leaf('paragraph')

        # if the last line is a paragraph ident, parse it and drop it
        ident_res = paragraph_ident_rec.match(alinea_lst[-1])
        if ident_res is not None:
            o_block.ident = int(ident_res.group('ident'))
            del alinea_lst[-1]

        for alinea_txt in alinea_lst:
            o_block.attach(self.parse_alinea(alinea_txt))

        return o_block
Esempio n. 6
0
    def parse_list(self, txt):
        """Parse bullet lines into nested ``ul`` / ``ol`` leaves.

        txt: one bullet item per line; ``bullet_list_rec`` must find in each
            line the groups 'tabs' (its length is the indentation level),
            'marker' ('*' for ``ul``, '#' for ``ol``) and 'line' (the text of
            the item, parsed by ``self.parse_alinea`` with the tag 'li').

        A deeper indentation opens a new list, attached to the previous item.
        A shallower one goes back to the enclosing list of that level.

        Returns the outermost list leaf, or None when txt holds no line.

        Raises:
            ValueError: a line is not a bullet item, or its marker is
                neither '*' nor '#'.
        """
        prev_indent = -1
        o_root = None

        for line in txt.splitlines():
            res = bullet_list_rec.search(line)
            if res is None:
                raise ValueError(f"not a bullet list item: {line!r}")
            indent = len(res.group('tabs'))

            if indent < prev_indent:
                # reduced indentation level: each level spans two nodes in the
                # tree (the list and the item holding it), hence the factor 2
                # (presumably parent_n(k) returns the k-th ancestor)
                o_list = o_list.parent_n(2 * (prev_indent - indent))
            elif indent > prev_indent:
                # new indentation level, a new ol/ul group is created; jumps
                # of more than one level are accepted too, which catches up
                # the indentation when the first item is itself indented
                # (which is not very standard)
                marker = res.group('marker')
                if marker == '*':
                    o_list = oaktree.Leaf('ul')
                elif marker == '#':
                    o_list = oaktree.Leaf('ol')
                else:
                    raise ValueError(f"unknown bullet marker: {marker!r}")

                if o_root is None:
                    o_root = o_list
                else:
                    # nested lists hang below the previous item
                    o_alinea.attach(o_list)
            # else: same level, the current list is simply extended

            o_alinea = self.parse_alinea(res.group('line'), 'li')
            o_list.attach(o_alinea)

            prev_indent = indent

        return o_root
Esempio n. 7
0
	def parse_document(self, root_dir) :
		"""Build the tree of a whole document from its table of content.

		root_dir: directory holding ``__doc__.tsv`` and the ``<section>.bkt``
			files. Each row of the table is an ``(indent, section)`` pair,
			both converted to integers: ``indent`` is the depth of the
			section (1 for the sections directly below the document) and
			``section`` is the name of the file to parse.

		Every section is grown below the last section seen one level above,
		then filled by ``self.parse_section``.

		Returns the root ``doc`` leaf.

		Raises:
			ValueError: no section one level above has been seen yet.
		"""
		o_doc = oaktree.Leaf("doc")

		# last node created at each depth, the depth 0 being the document;
		# a mapping is used so that the depth is not bounded
		last_at_depth = {0 : o_doc}

		for indent, section in (root_dir / "__doc__.tsv").load() :
			indent, section = int(indent), int(section)

			o_parent = last_at_depth.get(indent - 1)
			if o_parent is None :
				raise ValueError(
					f"section {section}: depth {indent} has no parent section at depth {indent - 1}")

			o_section = o_parent.grow('section')
			last_at_depth[indent] = o_section

			txt = (root_dir / f"{section}.bkt").read_text()
			self.parse_section(o_section, txt)

		return o_doc
Esempio n. 8
0
    def parse_atom(self, atom_res, is_block):
        """Build the node of a protected atom from the match of its marker.

        atom_res: match object whose 'atom_n' group is the key of the atom in
            ``self.atom_map``; its 'ident' group is read only when is_block
            is true
        is_block: True when the atom makes a paragraph on its own

        A 'table' atom is delegated to ``self._parse_atom_table``, a 'math'
        atom becomes a ``math`` leaf holding its stripped first content item,
        any other atom is parsed as an alinea bearing the tag of the atom.
        """
        atom = self.atom_map[int(atom_res.group('atom_n'))]
        tag = atom.tag

        if tag == "table":
            o_block = self._parse_atom_table(atom.content)
        elif tag == "math":
            # only block level formulas carry the 'block' flag
            leaf_nam = {'flag': {'block'}} if is_block else {}
            o_block = oaktree.Leaf('math', **leaf_nam)
            o_block.add_text(atom.content[0].strip())
        else:
            o_block = self.parse_alinea('|'.join(atom.content), tag)
            # NOTE(review): the flag is set even when is_block is False,
            # unlike the math case -- confirm this is intended
            o_block.flag.add('block')

        if is_block:
            ident = atom_res.group('ident')
            if ident is not None:
                o_block.ident = int(ident.strip())

        return o_block
Esempio n. 9
0
    def _parse_atom_table(self, txt):
        """Build a ``table`` leaf from the content of a table atom.

        txt: the content items of the atom, they are joined back with '|'
            before being parsed

        Rows are delimited by ``table_split_rec`` when it is found somewhere
        in the text, otherwise every non-blank line is a row. The cells of a
        row are separated by '|'. A cell can start with a span clue matched
        by ``table_span_rec`` (stored as the "rspan" / "cspan" attributes of
        the cell), then with a '=' which flags the cell as 'header'. What
        remains is parsed with ``self.parse`` and becomes the cell content.

        Returns the ``table`` leaf, made of ``table_row`` leaves which hold
        ``table_cell`` leaves.
        """
        o_table = oaktree.Leaf('table')

        # a bit of cleaning, such as the row separator is really the sole
        # marker on its line
        txt = '\n'.join(
            (line.strip() if table_split_rec.match(line) is not None else line)
            for line in '|'.join(txt).splitlines())

        txt = self.protect_atom(txt)

        if table_split_rec.search(txt) is not None:
            row_lst = table_split_rec.split(txt)
        else:
            row_lst = [line for line in txt.splitlines() if line.strip()]

        for row in row_lst:
            o_row = o_table.grow('table_row')
            for cell in row.split('|'):

                cell = cell.strip()

                # the cell is created first, so that the span clues below are
                # recorded on this cell and not on the previous one
                o_cell = o_row.grow('table_cell')

                # look for row or col span clues
                table_span_res = table_span_rec.match(cell)
                if table_span_res is not None:
                    row_n = table_span_res.group('row_n')
                    if row_n is not None:
                        o_cell.nam["rspan"] = row_n
                    col_n = table_span_res.group('col_n')
                    if col_n is not None:
                        o_cell.nam["cspan"] = col_n
                    cell = cell[table_span_res.end():]

                # look for header clue
                is_header = cell.startswith('=')
                if is_header:
                    cell = cell[1:].lstrip()

                o_content = self.parse(cell, True)
                o_cell.attach(*o_content.sub)

                if is_header:
                    o_cell.flag.add('header')

        return o_table
Esempio n. 10
0
    def _parse_section(self, txt, tag='section'):
        """Parse a section, i.e. a part of text which contains many paragraphs.

        txt: raw text of the section
        tag: tag of the returned leaf

        The higher level atoms are protected and the lines cleaned up, then
        the text is cut at each empty line and every chunk is handed to
        ``self.parse_paragraph``. The raw text and a dump of the tree are
        passed to ``self.dbg``.
        """
        o_section = oaktree.Leaf(tag)

        # protect higher level atoms, then cleanup
        cleaned_txt = self.clean_lines(self.protect_atom(txt))

        for paragraph_txt in cleaned_txt.split('\n\n'):
            o_section.attach(self.parse_paragraph(paragraph_txt))

        tree_dump = BraketProxy().save(o_section.root)
        self.dbg('GenericParser.parse_section.bkt', txt, tree_dump)

        return o_section
Esempio n. 11
0
    def mcp_to_html(self, mcp_txt, debug_dir=None):
        """Convert a marccup text into an HTML5 fragment.

        mcp_txt: the marccup source of one section
        debug_dir: optional directory, also handed to the parser; when given
            the parsed tree is saved in it as ``4_parsed.bkt`` and the HTML,
            with a line feed between adjacent tags, as ``5_composed.html``

        Returns the HTML text of the fragment.
        """
        parser = marccup.MarccupParser(debug_dir)
        o_section = parser.parse_section(mcp_txt)

        # the composer fills a throw-away container, whose first child is the
        # fragment to serialise
        o_container = oaktree.Leaf('tmp')
        spext.composer.html5.Html5Composer__base__().compose(
            o_section, o_container)

        html_proxy = oaktree.proxy.html5.Html5Proxy(indent='', fragment=True)
        html_txt = html_proxy.save(o_container.sub[0])

        if not debug_dir:
            return html_txt

        oaktree.proxy.braket.BraketProxy().save(
            o_section, debug_dir / '4_parsed.bkt')
        readable_html = html_txt.replace('><', '>\n<')
        (debug_dir / "5_composed.html").write_text(readable_html)

        return html_txt
Esempio n. 12
0
    def parse_page(self, txt):
        """Parse a page made of titled sections into a ``page`` tree.

        txt: text of the page; the lines matching ``title_rec`` are titles,
            the length of their 'depth' group gives the level of the section

        The lines found before the first title make an introductory section,
        parsed without title and attached to the page itself. Every title
        opens a section which runs until the next title, or until the end of
        the page.

        Returns the ``page`` leaf.

        Raises:
            ValueError: a title is more than one level deeper than the
                previous one.
        """
        o_page = oaktree.Leaf('page')

        prev_depth = 0
        prev_node = o_page

        prev_res = None
        stack = list()

        for line in txt.splitlines():
            title_res = title_rec.match(line)
            if title_res is None:
                # push the line on the stack
                stack.append(line)
                continue

            # a new title closes the section which was being gathered
            prev_node, prev_depth = self._attach_page_section(
                stack, prev_res, prev_node, prev_depth)

            prev_res = title_res
            stack = [
                line,
            ]

        # the lines gathered since the last title (or the whole page when it
        # has no title at all) make the last section
        if stack:
            self._attach_page_section(stack, prev_res, prev_node, prev_depth)

        return o_page

    def _attach_page_section(self, stack, prev_res, prev_node, prev_depth):
        """Parse the gathered lines as a section and attach it to the tree.

        Returns the updated ``(prev_node, prev_depth)`` pair.
        """
        section_txt = '\n'.join(stack)

        if prev_res is None:
            # no title previously found, this must be some introductory text
            prev_node.attach(self.parse_section(section_txt))
            return prev_node, prev_depth

        o_section = self.parse_section(section_txt, prev_res)
        depth = len(prev_res.group('depth'))

        if prev_depth + 1 < depth:
            raise ValueError(
                f"title depth jumps from {prev_depth} to {depth}")

        # presumably ancestor_lst[depth - 1] is the node one level above the
        # new section, found from the previously attached section
        prev_node.ancestor_lst[depth - 1].attach(o_section)

        return o_section, depth
Esempio n. 13
0
#!/usr/bin/env python3

import sys

from cc_pathlib import Path

import oaktree
import marccup

# Parse one section of the document and compose it to tmp/result.html.
# The name of the section (its file name in "document", without the ".bkt"
# suffix) is the first command line argument.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} SECTION_NAME")

section_name = sys.argv[1]
section_pth = Path("document") / f"{section_name}.bkt"
section_txt = section_pth.read_text()

# the parser fills the given leaf in place
u = marccup.Parser(debug_dir=Path("tmp"))
p = oaktree.Leaf("section")
u.parse_section(p, section_txt)

v = marccup.Composer(p, Path("tmp/result.html"))


Esempio n. 14
0
        n = '\n' if self.indent else ''

        w(i + '<' + ' '.join(s) + ('>' if node.sub else ' />') + n)

        for k in node.sub:
            if isinstance(k, oaktree.Leaf):
                self.compose(k, w, depth + 1)
            else:
                w(str(k) + n)

        if node.sub:
            w(f'{i}</{t}>{n}')


if __name__ == '__main__':
    # Small manual check: build a tiny tree mixing attributes, flags, text
    # and nested nodes, then print what Html5Proxy makes of it.
    import oaktree

    animal_nam = {'vache': "meuh", 'canard': "coincoin"}
    o_root = oaktree.Leaf('tutu', nam=animal_nam, style="animal")

    o_child = o_root.grow('toto', flag="eurhm")
    o_child.add_text("bizarre")
    o_child.grow('tata')
    o_child.add_text("vouzavé dit bizarre")

    print(Html5Proxy().save(o_root))
Esempio n. 15
0
#!/usr/bin/env python3

from cc_pathlib import Path

import oaktree

from oaktree.proxy.braket import BraketProxy

# Build a two-branch tree, the second branch holding two text items, and
# save it in the braket format as test.bkt.
o_root = oaktree.Leaf("one")
o_root.grow("one-one")

o_second = o_root.grow("one-two")
for text in ("first line", "second line"):
    o_second.add_text(text)

BraketProxy().save(o_root, Path("test.bkt"))