Ejemplo n.º 1
0
    def __init__(self):
        """Create the Markdown parser and the registry of metadata parsers.

        Every class returned by ``get_all_classes`` for ``meta_parsers.py``
        is instantiated once and stored in ``self._meta_parsers`` under the
        instance's ``flag`` attribute.
        """
        self._markdown_parser = MarkdownParser()

        # Instantiate lazily so each object is created right before its flag
        # is read; a later object with the same flag replaces an earlier one.
        instances = (
            cls() for cls in get_all_classes(["meta_parsers.py"], MetaDataParser)
        )
        self._meta_parsers = {parser.flag: parser for parser in instances}

        # Starts out empty.
        self._file_path = ""
Ejemplo n.º 2
0
    def add_document(self, writer, file_path, config):
        """Read a Markdown file and add it to the index through ``writer``.

        Args:
            writer: Object whose ``add_document`` method receives the indexed
                fields as keyword arguments.
            file_path: Path of the file to read; its content is decoded as
                UTF-8.
            config: Passed through unchanged to ``MarkdownParser.parse``.
        """
        # File name field: the path with each of . / \ _ - turned into a space.
        spaced = file_path
        for separator in (".", "/", "\\", "_", "-"):
            spaced = spaced.replace(separator, " ")
        file_name = str(spaced)

        # read file content
        with codecs.open(file_path, 'r', encoding='utf-8') as source:
            content = source.read()
        path = str(file_path)

        # parse markdown fields
        markdown = MarkdownParser()
        markdown.parse(content, config)

        modtime = os.path.getmtime(path)
        message = (
            "adding to index: path: %s size:%d tags:'%s' headlines:'%s' modtime=%d"
            % (path, len(content), markdown.tags, markdown.headlines, modtime)
        )
        print(message)

        # Field names are the keyword arguments expected by the writer.
        fields = {
            "path": path,
            "filename": file_name,
            "headlines": markdown.headlines,
            "tags": markdown.tags,
            "content": content,
            "doubleemphasiswords": markdown.doubleemphasiswords,
            "emphasiswords": markdown.emphasiswords,
            "time": modtime,
        }
        writer.add_document(**fields)
Ejemplo n.º 3
0
class Parser(object):
    """
    Parser for blog page files.

    A page file is read as a block of ``Key: value`` metadata lines, a blank
    line, and then the Markdown body.
    """
    def __init__(self):
        """Create the Markdown parser and the registry of metadata parsers."""
        self._markdown_parser = MarkdownParser()

        # One instance per discovered class, keyed by the instance's flag.
        self._meta_parsers = {}
        for c in get_all_classes(["meta_parsers.py"], MetaDataParser):
            obj = c()
            self._meta_parsers[obj.flag] = obj

        # Path of the file handed to parse(); used in error messages.
        self._file_path = ""

    def _split_meta_and_content(self, text):
        """Split raw file text at its first blank line.

        CRLF line endings are normalised to LF before splitting.

        Returns:
            A ``(metas, content)`` tuple, or ``False`` when the text contains
            no blank line.
        """
        tmp = re.match(r"([\s\S]*?)\n\n([\s\S]*)", text.replace("\r\n", "\n"))
        if not tmp:
            return False
        return tmp.groups()

    def _meta_parse(self, metas):
        """Turn the metadata block into a dict of parsed values.

        Each non-blank line is split at its first colon. The key is passed
        through ``convert_to_underline`` and must name a registered metadata
        parser, whose ``parse`` method converts the value. Registered parsers
        that report ``is_necessary()`` and were not supplied fall back to
        their ``default``; a ``None`` default is an error.

        Args:
            metas: Metadata lines separated by newlines.

        Returns:
            Dict of parsed metadata plus a ``"file"`` entry holding the
            current file path.

        Raises:
            RuntimeError: (via ``_error``) for a line without a colon, an
                unknown key, or a missing necessary entry without default.
        """
        tmp = {}
        for meta in metas.splitlines():
            if not meta.strip():
                # A stray empty line carries no metadata.
                continue
            # Normalise the full-width colon (U+FF1A) to an ASCII colon;
            # replacing ":" with itself would do nothing.
            matched = re.match(r"(.*?):\s*(.*)", meta.replace("\uff1a", ":"))
            if matched is None:
                # Without this check a line lacking a colon would fail with
                # an AttributeError on None instead of a readable message.
                self._error("Malformed meta line '%s' !" % meta)
            key, value = matched.groups()
            key = convert_to_underline(key)
            if key not in self._meta_parsers:
                self._error("Can not find the parser '%s' !" % key)
            tmp[key] = self._meta_parsers[key].parse(value)
        for meta_name, meta_obj in self._meta_parsers.items():
            if meta_obj.is_necessary() and meta_name not in tmp:
                if meta_obj.default is None:
                    self._error("Meta '%s' is necessary !" % meta_name)
                else:
                    tmp[meta_name] = meta_obj.default
        tmp["file"] = self._file_path
        return tmp

    def parse(self, file_path):
        """Parse one page file located under ``config["content_path"]``.

        The name of the directory containing the file (second-to-last
        ``/``-separated component of ``file_path``) is appended to the
        metadata as ``Category``.

        Args:
            file_path: Path relative to the content directory, using ``/``
                as separator.

        Returns:
            Dict with ``"metadata"`` (see ``_meta_parse``) and ``"content"``
            (the result of the Markdown parser for the body).

        Raises:
            RuntimeError: (via ``_error``) when the file has no blank line
                separating metadata from content.
            IndexError: when ``file_path`` contains no ``/``.
        """
        logger.info("Parsing start: %s" % file_path)

        self._file_path = file_path
        with open("%s/%s" % (config["content_path"], file_path)) as f:
            text = f.read()

        result = self._split_meta_and_content(text)
        if not result:
            self._error("Article does not have meta and content !")
        metas, content = result
        metas += "\n%s:%s" % ("Category", file_path.split("/")[-2])
        return {
            "metadata": self._meta_parse(metas),
            "content": self._markdown_parser.parse(content)
        }

    def _error(self, message):
        """Log ``message`` together with the current file and abort.

        Raises:
            RuntimeError: always, carrying the logged text. A bare ``raise``
                here has no active exception to re-raise, so the exception
                is constructed explicitly.
        """
        line = "%s\nFile: %s" % (message, self._file_path)
        logger.error(line)
        raise RuntimeError(line)
Ejemplo n.º 4
0
    def __init__(self):
        """Set up the Markdown parser and the metadata-parser lookup table.

        ``self._meta_parsers`` maps the ``flag`` attribute of each object to
        the object itself; one object is created per class returned by
        ``get_all_classes`` for ``meta_parsers.py``.
        """
        self._markdown_parser = MarkdownParser()

        # The generator keeps creation and flag lookup interleaved per class.
        created = (
            cls() for cls in get_all_classes(["meta_parsers.py"], MetaDataParser)
        )
        self._meta_parsers = {item.flag: item for item in created}

        # Starts out empty.
        self._file_path = ""
Ejemplo n.º 5
0
class TestMarkdownParserCombined(unittest.TestCase):
    """Render one document mixing several Markdown constructs at once."""

    def setUp(self):
        self.md_parser = MarkdownParser()

    def test_all_types(self):
        # Input: header, paragraphs with inline markup, fenced code, rule,
        # nested lists, a table and an image.
        markdown_source = dedent('''\
          # Header

          This should be a **paragraph**, with some *italics*, as well.

          What if you wanted to [go to the *store*?](https://www.youtube.com/watch?v=iRZ2Sh5-XuM)

          ```css
          body {
            color: black;
          }
          ```

          ---

          That should have shown some CSS regarding the `body` and changing the text `color`.

          ## List of things this can do so far

          1. Blocks
            * Headers
            * Paragraphs
            * Stuff
          1. Formatting
            - Strong
            - Code
              1. Inline
              5. Block
          1. Data

          Table | Header | Row
          --- | --- | ---
          Cell 1 | Cell2 | Cell 3 || with pipes

          ![Cat](http://1.bp.blogspot.com/-Flgz-X52Sa8/T-xaP9vmUZI/AAAAAAAABBg/B8pL7lpfd8w/s1600/newsitemoet.jpeg)''')
        # Expected output: one HTML block per line, in document order.
        expected_html = dedent('''\
          <h1>Header</h1>
          <p>This should be a <strong>paragraph</strong>, with some <em>italics</em>, as well.</p>
          <p>What if you wanted to <a href="https://www.youtube.com/watch?v=iRZ2Sh5-XuM">go to the <em>store</em>?</a></p>
          <pre data-code-lang="css">body {
            color: black;
          }
          </pre>
          <hr>
          <p>That should have shown some CSS regarding the <code>body</code> and changing the text <code>color</code>.</p>
          <h2>List of things this can do so far</h2>
          <ol><li>Blocks<ul><li>Headers</li><li>Paragraphs</li><li>Stuff</li></ul></li><li>Formatting<ul><li>Strong</li><li>Code<ol><li>Inline</li><li>Block</li></ol></li></ul></li><li>Data</li></ol>
          <table><thead><tr><th scope="col">Table</th><th scope="col">Header</th><th scope="col">Row</th></tr></thead><tbody><tr><td>Cell 1</td><td>Cell2</td><td>Cell 3 || with pipes</td></tr></tbody></table>
          <img src="http://1.bp.blogspot.com/-Flgz-X52Sa8/T-xaP9vmUZI/AAAAAAAABBg/B8pL7lpfd8w/s1600/newsitemoet.jpeg" alt="Cat" title="Cat">''')

        rendered = self.md_parser.parse(markdown_source)
        self.assertEqual(expected_html, rendered)
Ejemplo n.º 6
0
class TestMarkdownParserWhitespace(unittest.TestCase):
    """Line-break, paragraph and HTML-escaping expectations."""

    def setUp(self):
        self.md_parser = MarkdownParser()

    def test_adjacent_lines(self):
        # Consecutive lines are expected in one paragraph, joined by <br>.
        source = dedent('''\
            Hello
            World
            Test''')
        rendered = self.md_parser.parse(source)
        self.assertEqual('<p>Hello<br>World<br>Test</p>', rendered)

    def test_separated_lines(self):
        # Lines separated by blank lines are expected as separate paragraphs.
        expected = dedent('''\
            <p>Hello</p>
            <p>World</p>
            <p>Test</p>''')
        source = dedent('''\
            Hello

            World

            Test''')
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_html_escaping(self):
        # &, < and > in the input are expected as HTML entities.
        rendered = self.md_parser.parse('Me & Bobby McGee <>')
        self.assertEqual('<p>Me &amp; Bobby McGee &lt;&gt;</p>', rendered)
Ejemplo n.º 7
0
    def add_document(self, writer, file_path, config):
        """Read a Markdown file and add it to the index through ``writer``.

        Written to run on both Python 2 and Python 3: a byte-string path is
        decoded as UTF-8, a text path is used unchanged, and the progress
        line is printed with a single-argument ``print(...)`` call, which
        produces the same output under either version.

        Args:
            writer: Object whose ``add_document`` method receives the indexed
                fields as keyword arguments.
            file_path: Path of the file to read, as bytes (UTF-8) or text.
            config: Passed through unchanged to ``MarkdownParser.parse``.
        """
        # Decode once; calling unicode() on an already-decoded path raises
        # TypeError on Python 2 and the name does not exist on Python 3.
        if isinstance(file_path, bytes):
            path = file_path.decode("utf-8")
        else:
            path = file_path

        # File name field: the path with each of . / \ _ - turned into a space.
        file_name = path
        for separator in (".", "/", "\\", "_", "-"):
            file_name = file_name.replace(separator, " ")

        # read file content
        with codecs.open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # parse markdown fields
        parser = MarkdownParser()
        parser.parse(content, config)

        modtime = os.path.getmtime(path)
        print("adding to index: path: %s size:%d tags:'%s' headlines:'%s' modtime=%d" % (
            path, len(content), parser.tags, parser.headlines, modtime))
        writer.add_document(
            path=path,
            filename=file_name,
            headlines=parser.headlines,
            tags=parser.tags,
            content=content,
            doubleemphasiswords=parser.doubleemphasiswords,
            emphasiswords=parser.emphasiswords,
            time=modtime,
        )
Ejemplo n.º 8
0
 def setUp(self):
     """Store a new ``MarkdownParser`` on the instance as ``md_parser``."""
     parser = MarkdownParser()
     self.md_parser = parser
Ejemplo n.º 9
0
class TestMarkdownParserSingleLines(unittest.TestCase):
    """Single-line Markdown inputs and the HTML expected for each."""

    def setUp(self):
        self.md_parser = MarkdownParser()

    def test_header(self):
        # Six hashes and a space: level-six heading.
        rendered = self.md_parser.parse('###### Tiny header')
        self.assertEqual('<h6>Tiny header</h6>', rendered)

    def test_header_no_spaces(self):
        # No space between the hashes and the text.
        rendered = self.md_parser.parse('###Slush')
        self.assertEqual('<h3>Slush</h3>', rendered)

    def test_hash_not_at_start(self):
        # A hash in the middle of a line must not start a heading there.
        rendered = self.md_parser.parse('Sm # ush')
        self.assertNotEqual('Sm <h1>ush</h1>', rendered)

    def test_bold(self):
        # Both the asterisk and the underscore spelling give <strong>.
        expected = '<p><strong>The whole line</strong></p>'
        for source in ('**The whole line**', '__The whole line__'):
            self.assertEqual(expected, self.md_parser.parse(source))

    def test_bold_inline(self):
        expected = '<p>Somewhere in the <strong>middle</strong> of the line</p>'
        sources = (
            'Somewhere in the **middle** of the line',
            'Somewhere in the __middle__ of the line',
        )
        for source in sources:
            self.assertEqual(expected, self.md_parser.parse(source))

    def test_italic(self):
        # Both the asterisk and the underscore spelling give <em>.
        expected = '<p><em>The whole line</em></p>'
        for source in ('*The whole line*', '_The whole line_'):
            self.assertEqual(expected, self.md_parser.parse(source))

    def test_italic_inline(self):
        expected = '<p>Somewhere in the <em>middle</em> of the line</p>'
        sources = (
            'Somewhere in the *middle* of the line',
            'Somewhere in the _middle_ of the line',
        )
        for source in sources:
            self.assertEqual(expected, self.md_parser.parse(source))

    def test_strikethrough(self):
        rendered = self.md_parser.parse('~~The whole line~~')
        self.assertEqual('<p><s>The whole line</s></p>', rendered)

    def test_strikethrough_inline(self):
        rendered = self.md_parser.parse('Somewhere in the ~~middle~~ of the line')
        self.assertEqual('<p>Somewhere in the <s>middle</s> of the line</p>',
                         rendered)

    def test_code(self):
        rendered = self.md_parser.parse('`The whole line`')
        self.assertEqual('<p><code>The whole line</code></p>', rendered)

    def test_code_inline(self):
        rendered = self.md_parser.parse('Somewhere in the `middle` of the line')
        self.assertEqual('<p>Somewhere in the <code>middle</code> of the line</p>',
                         rendered)

    def test_code_inline_triple_backticks(self):
        # Markup between triple backticks is expected to stay literal.
        source = 'Somewhere in the ```middle **yeah** `the` middle``` of the line'
        expected = '<p>Somewhere in the <code>middle **yeah** `the` middle</code> of the line</p>'
        self.assertEqual(expected, self.md_parser.parse(source))

    def test_code_inline_no_inner_parse(self):
        # Link syntax inside inline code is expected to stay literal.
        rendered = self.md_parser.parse('`[middle](href.com)`')
        self.assertEqual('<p><code>[middle](href.com)</code></p>', rendered)

    def test_link(self):
        rendered = self.md_parser.parse('[The whole line](example.com)')
        self.assertEqual('<p><a href="example.com">The whole line</a></p>',
                         rendered)

    def test_link_inline(self):
        source = 'Somewhere in the [middle](http://www.example.com) of the line'
        expected = '<p>Somewhere in the <a href="http://www.example.com">middle</a> of the line</p>'
        self.assertEqual(expected, self.md_parser.parse(source))

    def test_link_simple(self):
        # Angle-bracket form: the target doubles as the link text.
        rendered = self.md_parser.parse('<example.com>')
        self.assertEqual('<p><a href="example.com">example.com</a></p>',
                         rendered)

    def test_link_simple_inline(self):
        # (expected HTML, Markdown input) pairs, checked in this order.
        cases = (
            ('<p>Somewhere in the <a href="http://www.example.com">http://www.example.com</a> of the line</p>',
             'Somewhere in the <http://www.example.com> of the line'),
            ('<p><a href="cousin_folder/faraway.html">cousin_folder/faraway.html</a></p>',
             '<cousin_folder/faraway.html>'),
        )
        for expected, source in cases:
            self.assertEqual(expected, self.md_parser.parse(source))

    def test_image(self):
        # Won't do inline images, only as whole line
        source = '![The whole line](https://duckduckgo.com/assets/logo_homepage.alt.v108.svg)'
        expected = '<img src="https://duckduckgo.com/assets/logo_homepage.alt.v108.svg" alt="The whole line" title="The whole line">'
        self.assertEqual(expected, self.md_parser.parse(source))
Ejemplo n.º 10
0
class TestMarkdownParserBlocks(unittest.TestCase):
    """Block-level Markdown inputs and the HTML expected for each."""

    def setUp(self):
        self.md_parser = MarkdownParser()

    def test_horizontal_rule(self):
        # Each of these lines is expected to render as a rule.
        rules = ('---', '___', '===', '***', '---------------      ')
        for source in rules:
            self.assertEqual('<hr>', self.md_parser.parse(source))
        # A line mixing different rule characters must not be a rule.
        mixed = self.md_parser.parse('====****----____   ')
        self.assertNotEqual('<hr>', mixed)

    def test_code_block(self):
        # Fenced block without language; link syntax inside stays literal.
        source = dedent('''\
            ```
            This is some code
            that should be wrapped
            into one chunk
            [This shouldn't be parsed](google.com)
            ```''')
        expected = dedent('''\
            <pre>This is some code
            that should be wrapped
            into one chunk
            [This shouldn't be parsed](google.com)
            </pre>''')
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_code_block_lang(self):
        # The language after the fence is expected in data-code-lang.
        source = dedent('''\
            ```css
            This is some code
            that should be wrapped
            into one chunk
            [This shouldn't be parsed](google.com)
            ```''')
        expected = dedent('''\
            <pre data-code-lang="css">This is some code
            that should be wrapped
            into one chunk
            [This shouldn't be parsed](google.com)
            </pre>''')
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_code_block_indent(self):
        # Lines indented by four spaces form a code block.
        source = dedent('''\
                This is some code
                that should be wrapped
                into one chunk
            reference''')
        expected = dedent('''\
            <pre>This is some code
            that should be wrapped
            into one chunk
            </pre>
            <p>reference</p>''')
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_code_block_whitespace(self):
        # Indentation inside a fenced block is expected to be kept.
        source = dedent('''\
            ```
            This
              should indent
                    however we want it to
             a
            ```''')
        expected = dedent('''\
            <pre>This
              should indent
                    however we want it to
             a
            </pre>''')
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_code_block_indent_whitespace(self):
        # Indentation beyond the four-space prefix is expected to be kept.
        source = dedent('''\
                This
                  should indent
                        however we want it to
                 a
            reference''')
        expected = dedent('''\
            <pre>This
              should indent
                    however we want it to
             a
            </pre>
            <p>reference</p>''')
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_blockquote(self):
        rendered = self.md_parser.parse('> This is a whole line.')
        self.assertEqual('<blockquote><p>This is a whole line.</p></blockquote>',
                         rendered)

    def test_blockquote_multiline(self):
        # Two quoted lines are expected to merge into one paragraph.
        source = dedent('''\
            > This is a quote
            > that spans two lines''')
        expected = '<blockquote><p>This is a quote that spans two lines</p></blockquote>'
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_unordered_list_asterisk(self):
        source = dedent('''\
            * List
            * Item
            ''')
        expected = '<ul><li>List</li><li>Item</li></ul>'
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_unordered_list_hyphen(self):
        source = dedent('''\
            - List
            - Item
            ''')
        expected = '<ul><li>List</li><li>Item</li></ul>'
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_unordered_list_indent(self):
        # An indented item is expected as a list nested in the previous item.
        source = dedent('''\
            - List
            - Item
              - Sub
            ''')
        expected = (
            '<ul><li>List</li><li>Item'
            '<ul><li>Sub</li>'
            '</ul></li></ul>'
        )
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_unordered_list_indent_deep(self):
        # Two levels of nesting, then back to the top level.
        source = dedent('''\
            - List
              - Item
                - Sub
            - Back
            ''')
        expected = (
            '<ul><li>List'
            '<ul><li>Item'
            '<ul><li>Sub</li></ul>'
            '</li></ul>'
            '</li><li>Back</li></ul>'
        )
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_unordered_list_indent_mixed(self):
        # Asterisk and hyphen bullets mixed within the same lists.
        source = dedent('''\
            * List
            - Item
              - Sub
              * Item
                - One more
            * And back
            ''')
        expected = (
            '<ul><li>List</li><li>Item'
            '<ul><li>Sub</li><li>Item'
            '<ul><li>One more</li></ul></li>'
            '</ul></li>'
            '<li>And back</li></ul>'
        )
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_ordered_list(self):
        source = dedent('''\
            1. List
            2. Item
            ''')
        expected = '<ol><li>List</li><li>Item</li></ol>'
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_ordered_list_mixed(self):
        # The numbers written in the source do not appear in the output.
        source = dedent('''\
            1. List
            1. Item
            54. Out of order
            ''')
        expected = '<ol><li>List</li><li>Item</li><li>Out of order</li></ol>'
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_ordered_list_indent(self):
        source = dedent('''\
            1. some
            2. stuff with
            3. lots of
              1. c
              1. h
              1. a
              1. r
                1. s
            ''')
        expected = (
            '<ol><li>some</li><li>stuff with</li><li>lots of'
            '<ol><li>c</li><li>h</li><li>a</li><li>r'
            '<ol><li>s</li></ol>'
            '</li></ol>'
            '</li></ol>'
        )
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_unordered_and_ordered_lists(self):
        # Ordered and unordered lists nested inside each other.
        source = dedent('''\
            1. some
            202. stuff with
            13. lots of
              * c
              * h
                - a
                  1. r
              - s
            1. number continuation
            ''')
        expected = (
            '<ol><li>some</li><li>stuff with</li><li>lots of'
            '<ul><li>c</li><li>h'
            '<ul><li>a'
            '<ol><li>r</li>'
            '</ol></li>'
            '</ul></li><li>s</li></ul>'
            '</li><li>number continuation</li></ol>'
        )
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_checkboxes(self):
        # Unlike the other tests, this input starts with a newline.
        source = dedent('''
            - [ ] Unchecked
            - [x] Checked
              - [X] Checked, too!
            ''')
        expected = '<ul><li><input type="checkbox"> Unchecked</li><li><input type="checkbox" checked> Checked<ul><li><input type="checkbox" checked> Checked, too!</li></ul></li></ul>'
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)

    def test_tables(self):
        # Both body rows end in a trailing space that is part of the input.
        source = dedent('''\
            this | is a | header
            --- | --- | ---
            1 | pipe test||| | and text 
            2 | pipes | are cool! 
            ''')
        expected = (
            '<table>'
            '<thead><tr><th scope="col">this</th><th scope="col">is a</th><th scope="col">header</th></tr></thead>'
            '<tbody><tr><td>1</td><td>pipe test|||</td><td>and text</td></tr>'
            '<tr><td>2</td><td>pipes</td><td>are cool!</td></tr></tbody>'
            '</table>'
        )
        rendered = self.md_parser.parse(source)
        self.assertEqual(expected, rendered)
Ejemplo n.º 11
0
class Parser(object):
    """
    Parser for blog page files.

    A page file is read as a block of ``Key: value`` metadata lines, a blank
    line, and then the Markdown body.
    """

    def __init__(self):
        """Create the Markdown parser and the registry of metadata parsers."""
        self._markdown_parser = MarkdownParser()

        # One instance per discovered class, keyed by the instance's flag.
        self._meta_parsers = {}
        for c in get_all_classes(["meta_parsers.py"], MetaDataParser):
            obj = c()
            self._meta_parsers[obj.flag] = obj

        # Path of the file handed to parse(); used in error messages.
        self._file_path = ""

    def _split_meta_and_content(self, text):
        """Split raw file text at its first blank line.

        CRLF line endings are normalised to LF before splitting.

        Returns:
            A ``(metas, content)`` tuple, or ``False`` when the text contains
            no blank line.
        """
        tmp = re.match(
            r"([\s\S]*?)\n\n([\s\S]*)",
            text.replace("\r\n", "\n")
        )
        if not tmp:
            return False
        return tmp.groups()

    def _meta_parse(self, metas):
        """Turn the metadata block into a dict of parsed values.

        Each non-blank line is split at its first colon. The key is passed
        through ``convert_to_underline`` and must name a registered metadata
        parser, whose ``parse`` method converts the value. A missing
        ``authors`` entry is filled from ``config["default_authors"]``; any
        other registered parser that reports ``is_necessary()`` and was not
        supplied is an error.

        Args:
            metas: Metadata lines separated by newlines.

        Returns:
            Dict of parsed metadata plus a ``"file"`` entry holding the
            current file path.

        Raises:
            RuntimeError: (via ``_error``) for a line without a colon, an
                unknown key, or a missing necessary entry.
        """
        tmp = {}
        for meta in metas.splitlines():
            if not meta.strip():
                # A stray empty line carries no metadata.
                continue
            # Normalise the full-width colon (U+FF1A) to an ASCII colon;
            # replacing ":" with itself would do nothing.
            matched = re.match(r"(.*?):\s*(.*)", meta.replace("\uff1a", ":"))
            if matched is None:
                # Without this check a line lacking a colon would fail with
                # an AttributeError on None instead of a readable message.
                self._error("Malformed meta line '%s' !" % meta)
            key, value = matched.groups()
            key = convert_to_underline(key)
            if key not in self._meta_parsers:
                self._error("Can not find the parser '%s' !" % key)
            tmp[key] = self._meta_parsers[key].parse(value)
        if "authors" not in tmp:
            tmp["authors"] = config["default_authors"]
        for meta_name, meta_obj in self._meta_parsers.items():
            if meta_obj.is_necessary() and meta_name not in tmp:
                self._error("Meta '%s' is necessary !" % meta_name)
        tmp["file"] = self._file_path
        return tmp

    def parse(self, file_path):
        """Parse one page file located under ``config["content_path"]``.

        The name of the directory containing the file (second-to-last
        ``/``-separated component of ``file_path``) is appended to the
        metadata as ``Category``.

        Args:
            file_path: Path relative to the content directory, using ``/``
                as separator.

        Returns:
            Dict with ``"metadata"`` (see ``_meta_parse``) and ``"content"``
            (the result of the Markdown parser for the body).

        Raises:
            RuntimeError: (via ``_error``) when the file has no blank line
                separating metadata from content.
            IndexError: when ``file_path`` contains no ``/``.
        """
        logger.info("Parsing start: %s" % file_path)

        self._file_path = file_path
        with open("%s/%s" % (config["content_path"], file_path)) as f:
            text = f.read()

        result = self._split_meta_and_content(text)
        if not result:
            self._error("Article does not have meta and content !")
        metas, content = result
        metas += "\n%s:%s" % ("Category", file_path.split("/")[-2])
        return {
            "metadata": self._meta_parse(metas),
            "content": self._markdown_parser.parse(content)
        }

    def _error(self, message):
        """Log ``message`` together with the current file and abort.

        Raises:
            RuntimeError: always, carrying the logged text. A bare ``raise``
                here has no active exception to re-raise, so the exception
                is constructed explicitly.
        """
        line = "%s\nFile: %s" % (message, self._file_path)
        logger.error(line)
        raise RuntimeError(line)