Beispiel #1
0
def makeparsers(force=False):
    """Regenerate the preprocessor and wikitext parser modules when needed.

    Each grammar file is read and handed to ``pijnu.makeParser`` (with
    ``outputPath="mediawiki_parser"``) when ``force`` is true or when
    ``inputNewer(inputFile, outputFile)`` returns a true value.

    Args:
        force: When true, both grammars are processed unconditionally.
    """
    from pijnu import makeParser
    import os

    output_dir = "mediawiki_parser"
    # (grammar file, generated module name) pairs, handled in this order.
    targets = (
        ("preprocessor.pijnu", "preprocessorParser.py"),
        ("mediawiki.pijnu", "wikitextParser.py"),
    )
    for inputFile, module_name in targets:
        outputFile = os.path.join(output_dir, module_name)
        # NOTE(review): inputNewer is defined elsewhere; presumably it
        # compares modification times of the two paths -- confirm.
        if force or inputNewer(inputFile, outputFile):
            # open() works on Python 2 and 3 (file() is Python 2 only) and
            # the with block closes the handle once the text has been read.
            with open(inputFile) as grammar_file:
                grammar = grammar_file.read()
            makeParser(grammar, outputPath=output_dir)
Beispiel #2
0
def parseTest():
    """Hand the inline ``numbersTransform`` grammar to ``makeParser``.

    The grammar text carries its own ``toReal`` function in its ``<toolset>``
    section. Whatever ``makeParser`` returns is discarded.
    """
    grammar_text = """\
numbersTransform
<toolset>
def toReal(node):
	node.value = float(node.value)
<definition>
SEP			: ' '						: drop
DOT			: '.'
digit		: [0..9]
integer		: digit+					: join
real		: integer DOT integer?		: join
number		: real / integer			: toReal
addedNum	: SEP number				: liftNode
numbers		: number (addedNum)*		: extract
"""
    makeParser(grammar_text)
def parseTest():
    """Feed the embedded ``numbersTransform`` grammar to ``makeParser``.

    The grammar is written inline (tab-aligned columns) and defines a
    ``toReal`` helper in its ``<toolset>`` section. The result of the
    ``makeParser`` call is not kept.
    """
    inline_grammar = """\
numbersTransform
<toolset>
def toReal(node):
	node.value = float(node.value)
<definition>
SEP			: ' '						: drop
DOT			: '.'
digit		: [0..9]
integer		: digit+					: join
real		: integer DOT integer?		: join
number		: real / integer			: toReal
addedNum	: SEP number				: liftNode
numbers		: number (addedNum)*		: extract
"""
    makeParser(inline_grammar)
Beispiel #4
0
    def test_choice_regex(self):
        """Make sure the Choice-optimize-as-Regexp works"""
        numbers_transform_grammar = """\
test_choice_regex
<toolset>
def to_int(node):
    node.value = int(node.value)
<definition>
    SEP            : ' '                        : drop
    digit          : [0..4] / [5..9]            : liftNode
    integer        : digit+                     : join
    number         : integer                    : to_int
    addedNumber    : SEP number                 : liftNode
    numbers        : number (addedNumber)*      : extract
"""
        make_parser = makeParser(numbers_transform_grammar)
        parser = make_parser()
        source = "12 3 5"
        result = "[number:'12'  number:'3'  number:'5']"
        self.assertEquals(unicode(parser.parseTest(source).value), result)
Beispiel #5
0
    def test_choice_regex(self):
        """Make sure the Choice-optimize-as-Regexp works"""
        numbers_transform_grammar = """\
test_choice_regex
<toolset>
def to_int(node):
    node.value = int(node.value)
<definition>
    SEP            : ' '                        : drop
    digit          : [0..4] / [5..9]            : liftNode
    integer        : digit+                     : join
    number         : integer                    : to_int
    addedNumber    : SEP number                 : liftNode
    numbers        : number (addedNumber)*      : extract
"""
        make_parser = makeParser(numbers_transform_grammar)
        parser = make_parser()
        source = "12 3 5"
        result = "[number:'12'  number:'3'  number:'5']"
        self.assertEquals(py3compat.text_type(parser.parseTest(source).value), result)
Beispiel #6
0
    def test_custom_toolset(self):
        """Make sure we can use a custom toolset inside the grammar."""
        numbers_transform_grammar = """\
test_custom_toolset_numbers_transform
<toolset>
def to_real(node):
    node.value = float(node.value)
<definition>
    SEP        : ' '                   : drop
    DOT        : '.'
    digit      : [0..9]
    integer    : digit+                : join
    real       : integer DOT integer?  : join
    number     : real / integer        : to_real
    addedNum   : SEP number            : liftNode
    numbers    : number (addedNum)*    : extract
"""
        make_parser = makeParser(numbers_transform_grammar)
        parser = make_parser()
        source = "1 3.141592 5"
        result = "[integer:'1.0'  real:'3.141592'  integer:'5.0']"
        self.assertEquals(unicode(parser.parseTest(source).value), result)
Beispiel #7
0
    def test_custom_toolset(self):
        """Make sure we can use a custom toolset inside the grammar."""
        numbers_transform_grammar = """\
test_custom_toolset_numbers_transform
<toolset>
def to_real(node):
    node.value = float(node.value)
<definition>
    SEP        : ' '                   : drop
    DOT        : '.'
    digit      : [0..9]
    integer    : digit+                : join
    real       : integer DOT integer?  : join
    number     : real / integer        : to_real
    addedNum   : SEP number            : liftNode
    numbers    : number (addedNum)*    : extract
"""
        make_parser = makeParser(numbers_transform_grammar)
        parser = make_parser()
        source = "1 3.141592 5"
        result = "[integer:'1.0'  real:'3.141592'  integer:'5.0']"
        self.assertEquals(unicode(parser.parseTest(source).value), result)
Beispiel #8
0
    def test_external_toolset(self):
        """Make sure we can pass a custom toolset from the make_parser call."""
        def to_real(node):
            node.value = float(node.value)

        numbers_transform_grammar = """\
test_external_toolset_numbers_transform
<definition>
    SEP        : ' '                   : drop
    DOT        : '.'
    digit      : [0..9]
    integer    : digit+                : join
    real       : integer DOT integer?  : join
    number     : real / integer        : to_real
    addedNum   : SEP number            : liftNode
    numbers    : number (addedNum)*    : extract
"""
        make_parser = makeParser(numbers_transform_grammar)
        parser = make_parser({'to_real': to_real})
        source = "1 3.141592 5"
        result = "[integer:'1.0'  real:'3.141592'  integer:'5.0']"
        self.assertEquals(py3compat.text_type(parser.parseTest(source).value),
                          result)
Beispiel #9
0
    def test_external_toolset(self):
        """Make sure we can pass a custom toolset from the make_parser call."""

        def to_real(node):
            node.value = float(node.value)

        numbers_transform_grammar = """\
test_external_toolset_numbers_transform
<definition>
    SEP        : ' '                   : drop
    DOT        : '.'
    digit      : [0..9]
    integer    : digit+                : join
    real       : integer DOT integer?  : join
    number     : real / integer        : to_real
    addedNum   : SEP number            : liftNode
    numbers    : number (addedNum)*    : extract
"""
        make_parser = makeParser(numbers_transform_grammar)
        parser = make_parser({"to_real": to_real})
        source = "1 3.141592 5"
        result = "[integer:'1.0'  real:'3.141592'  integer:'5.0']"
        self.assertEquals(py3compat.text_type(parser.parseTest(source).value), result)
Beispiel #10
0
# -*- coding: utf8 -*-

from __future__ import print_function
import time
import codecs

print("*** Parsing to HTML ***")

# Taken before the grammars are processed.
start_time = time.time()

# get the parser
from pijnu import makeParser

# Read each grammar inside a with block so its file handle is closed as soon
# as the text is in memory, instead of being left open.
with open("preprocessor.pijnu") as grammar_file:
    preprocessorGrammar = grammar_file.read()
makeParser(preprocessorGrammar)

with open("mediawiki.pijnu") as grammar_file:
    mediawikiGrammar = grammar_file.read()
makeParser(mediawikiGrammar)

# Tag and attribute names assigned here for use later in the script.
allowed_tags = ['p', 'span', 'b', 'i', 'small', 'center']
allowed_autoclose_tags = ['br']
allowed_parameters = ['class', 'style', 'name', 'id', 'scope']
interwiki = {
    'ar': 'http://ar.wikipedia.org/wiki/',
    'az': 'http://az.wikipedia.org/wiki/',
    'br': 'http://br.wikipedia.org/wiki/',
    'ca': 'http://ca.wikipedia.org/wiki/',
    'cs': 'http://cs.wikipedia.org/wiki/',
    'da': 'http://da.wikipedia.org/wiki/',
    'de': 'http://de.wikipedia.org/wiki/',
    'en': 'http://en.wikipedia.org/wiki/',
    'eo': 'http://eo.wikipedia.org/wiki/',
Beispiel #11
0
# -*- coding: utf8 -*-

import time
import codecs

# Call form of print: with a single string argument it prints the same text
# on Python 2 and is also valid syntax on Python 3.
print("*** Parsing to HTML ***")

# Taken before the grammars are processed.
start_time = time.time()

# get the parser
from pijnu import makeParser

# open() exists on both Python 2 and 3 (file() is Python 2 only); the with
# block closes each handle once the grammar text has been read.
with open("preprocessor.pijnu") as grammar_file:
    preprocessorGrammar = grammar_file.read()
makeParser(preprocessorGrammar)

with open("mediawiki.pijnu") as grammar_file:
    mediawikiGrammar = grammar_file.read()
makeParser(mediawikiGrammar)

# Tag and attribute names assigned here for use later in the script.
allowed_tags = ['p', 'span', 'b', 'i', 'small', 'center']
allowed_autoclose_tags = ['br']
allowed_parameters = ['class', 'style', 'name', 'id', 'scope']
# Language prefix -> base URL of the matching Wikipedia.
interwiki = {
    'ar': 'http://ar.wikipedia.org/wiki/',
    'az': 'http://az.wikipedia.org/wiki/',
    'br': 'http://br.wikipedia.org/wiki/',
    'ca': 'http://ca.wikipedia.org/wiki/',
    'cs': 'http://cs.wikipedia.org/wiki/',
    'da': 'http://da.wikipedia.org/wiki/',
    'de': 'http://de.wikipedia.org/wiki/',
    'en': 'http://en.wikipedia.org/wiki/',
    'eo': 'http://eo.wikipedia.org/wiki/',
    'es': 'http://es.wikipedia.org/wiki/',
    'fr': 'http://fr.wikipedia.org/wiki/',
}
Beispiel #12
0
def setup_module():
    """Pass the preprocessor and MediaWiki grammars to ``makeParser``.

    ``preprocessor.pijnu`` and ``mediawiki.pijnu`` are read, in that order,
    from the current working directory.
    """
    # NOTE(review): the name matches the module-level setup hook convention
    # of pytest/nose -- confirm that is how this function is invoked.
    for grammar_path in ("preprocessor.pijnu", "mediawiki.pijnu"):
        # open() instead of the Python 2-only file(); the with block closes
        # the handle after reading.
        with open(grammar_path) as grammar_file:
            grammar = grammar_file.read()
        makeParser(grammar)
def setup_module():
    """Read both grammar files and hand each one to ``makeParser``.

    The files ``preprocessor.pijnu`` and ``mediawiki.pijnu`` are opened
    relative to the current working directory and processed in that order.
    """
    # open() replaces the Python 2-only file() builtin, and each handle is
    # closed by its with block rather than left to the garbage collector.
    with open("preprocessor.pijnu") as grammar_file:
        preprocessorGrammar = grammar_file.read()
    makeParser(preprocessorGrammar)

    with open("mediawiki.pijnu") as grammar_file:
        mediawikiGrammar = grammar_file.read()
    makeParser(mediawikiGrammar)
def convertFromWikiToHTML(infile):
    """Parse the wiki text stored in ``infile`` and return the result as UTF-8.

    The two grammars under ``mediaWikiDependences/`` are handed to
    ``makeParser``; the file content is then run through the preprocessor
    parser and the parser returned by ``make_html_parser``.

    Args:
        infile: Path of the file to convert; it is decoded as UTF-8.

    Returns:
        ``tree.leaves()`` of the parsed document, encoded as UTF-8.
    """
    # get the parser
    # open() works on Python 2 and 3 (file() is Python 2 only); the with
    # blocks close the grammar files once they have been read.
    with open("mediaWikiDependences/preprocessor.pijnu") as grammar_file:
        preprocessorGrammar = grammar_file.read()
    makeParser(preprocessorGrammar)

    with open("mediaWikiDependences/mediawiki.pijnu") as grammar_file:
        mediawikiGrammar = grammar_file.read()
    makeParser(mediawikiGrammar)

    allowed_tags = ['p', 'span', 'b', 'i', 'small', 'center', 'ref', 'gallery']
    allowed_autoclose_tags = ['br', 'hr']
    allowed_parameters = ['class', 'style', 'name', 'id', 'scope']
    interwiki = {'ar': 'http://ar.wikipedia.org/wiki/',
                 'az': 'http://az.wikipedia.org/wiki/',
                 'zh': 'http://zh.wikipedia.org/wiki/',
                 'it': 'http://it.wikipedia.org/wiki/',
                 'jp': 'http://jp.wikipedia.org/wiki/',
                 'ja': 'http://ja.wikipedia.org/wiki/',
                 'jv': 'http://jv.wikipedia.org/wiki/',
                 'pt': 'http://pt.wikipedia.org/wiki/',
                 'br': 'http://br.wikipedia.org/wiki/',
                 'ca': 'http://ca.wikipedia.org/wiki/',
                 'cs': 'http://cs.wikipedia.org/wiki/',
                 'da': 'http://da.wikipedia.org/wiki/',
                 'de': 'http://de.wikipedia.org/wiki/',
                 'en': 'http://en.wikipedia.org/wiki/',
                 'eo': 'http://eo.wikipedia.org/wiki/',
                 'es': 'http://es.wikipedia.org/wiki/',
                 'simple': 'http://simple.wikipedia.org/wiki/',
                 'sco': 'http://sco.wikipedia.org/wiki/',
                 'ro': 'http://ro.wikipedia.org/wiki/',
                 'pl': 'http://pl.wikipedia.org/wiki/',
                 'kk': 'http://kk.wikipedia.org/wiki/',
                 'fr': 'http://fr.wikipedia.org/wiki/'}

    namespaces = {'Template':   10,
                  u'Catégorie': 14,
                  'Category':   14,
                  'File':        6,
                  'Fichier':     6,
                  'Image':       6}
    # NOTE: the lines starting with '#' below are inside the triple-quoted
    # string, so they are template text, not Python comments.
    templates = {'listen': u"""{| style="text-align:center; background: #f9f9f9; color: #000;font-size:90%; line-height:1.1em; float:right;clear:right; margin:1em 1.5em 1em 1em; width:300px; border: 1px solid #aaa; padding: 0.1em;" cellspacing="7"
#! class="media audio" style="background-color:#ccf; line-height:3.1em" | Fichier audio
#|-
#|<span style="height:20px; width:100%; padding:4pt; padding-left:0.3em; line-height:2em;" cellspacing="0">'''[[Media:{{{filename|{{{nomfichier|{{{2|}}}}}}}}}|{{{title|{{{titre|{{{1|}}}}}}}}}]]''' ''([[:Fichier:{{{filename|{{{nomfichier|{{{2|}}}}}}}}}|info]])''<br /><small>{{{suitetexte|{{{description|}}}}}}</small>
#<center>[[Fichier:{{{filename|{{{nomfichier|{{{2|}}}}}}}}}|noicon]]</center></span><br /><span style="height:20px; width:100%; padding-left:0.3em;" cellspacing="0"><span title="Des difficultés pour écouter le fichier ?">[[Image:Circle question mark.png|14px|link=Aide:Écouter des sons ogg|Des difficultés  pour  écouter le fichier ?]] ''[[Aide:Écouter des sons ogg|Des problèmes pour écouter le fichier ?]]''</span>
#|}
#""",'3e': '3<sup>e</sup>'}

    # These imports are kept after the makeParser calls above, as in the
    # original ordering.
    # NOTE(review): presumably they depend on modules makeParser generates
    # -- confirm before moving them.
    from preprocessor import make_parser
    preprocessor = make_parser(templates)

    from html import make_html_parser
    parser = make_html_parser(allowed_tags, allowed_autoclose_tags, allowed_parameters, interwiki, namespaces)

    # import the source in a utf-8 string; the with block closes the file
    import codecs
    with codecs.open(infile, "r", "utf-8") as fileObj:
        source = fileObj.read()

    # The last line of the file will not be parsed correctly if
    # there is no newline at the end of file, so, we add one.
    # endswith() also copes with an empty file, where source[-1] would
    # raise IndexError.
    if not source.endswith('\n'):
        source += '\n'

    preprocessed_text = preprocessor.parse(source)
    tree = parser.parse(preprocessed_text.leaves())

    output = tree.leaves()

    return output.encode('UTF-8')
Beispiel #15
0
"""
Cloning a Choice pattern raises a TypeError exception.

It might also happen with other types of patterns.
"""
# The docstring sits above the __future__ import so that it is the module's
# first statement and therefore its real docstring; a docstring is one of the
# few things allowed to precede a future statement.
from __future__ import print_function

from pijnu import makeParser

# Grammar used to reproduce the problem described in the docstring; it is
# passed to makeParser unchanged.
numbers_transform_grammar = """
test_custom_toolset_numbers_transform
<toolset>
def to_real(node):
    node.value = float(node.value)
<definition>
    SEP        : ' '                   : drop
    DOT        : '.'
    digit      : [0..9]
    integer    : digit+                : join
    integer2   : integer
    real       : integer DOT integer?  : join
    number     : real / integer        : to_real
    number2    : number
    addedNum   : SEP number            : liftNode
    numbers    : number (addedNum)*    : extract
"""
make_parser = makeParser(numbers_transform_grammar)
parser = make_parser()
print(parser.parseTest('1 2 3'))
Beispiel #16
0
"""
Reading the grammar as UTF-8 raises an exception
"""

from pijnu import makeParser
import codecs
# Read the grammar as UTF-8 text; the with block closes the file once the
# content is in memory.
with codecs.open("bug3.pijnu", "r", "utf-8") as fileObj:
    grammar = fileObj.read()
make_parser = makeParser(grammar)
parser = make_parser()
# Call form of print: prints the same text on Python 2 and is also valid
# syntax on Python 3.
print(parser.parseTest('1 2 3'))
Beispiel #17
0
def generateParserModule(grammer):
    """Pass ``grammer`` to ``pijnu.makeParser``.

    The value returned by ``pijnu.makeParser`` is discarded, so this
    function itself returns None.

    Args:
        grammer: Forwarded unchanged as the only argument of
            ``pijnu.makeParser`` (presumably the grammar text -- confirm
            against callers).
    """
    pijnu.makeParser(grammer)
def generateParserModule(grammer):
    """Forward ``grammer`` to ``pijnu.makeParser`` and return None.

    The result of the ``pijnu.makeParser`` call is not kept.

    Args:
        grammer: Handed to ``pijnu.makeParser`` as its single argument
            (presumably the grammar text -- confirm against callers).
    """
    pijnu.makeParser(grammer)