def make_parsers():
    """Build a wikitext preprocessor and an HTML parser with empty settings.

    Both factories are called ``make_parser`` in their respective
    ``mediawiki_parser`` modules, so each is imported under its own alias.

    Returns:
        tuple: ``(preprocessor, parser)``, in that order.
    """
    from mediawiki_parser.preprocessor import make_parser as build_preprocessor
    preprocessor = build_preprocessor({})  # no templates registered

    from mediawiki_parser.html import make_parser as build_html_parser
    parser = build_html_parser(
        [],  # allowed tags
        [],  # allowed self-closing tags
        [],  # allowed attributes
        {},  # interwiki
        {},  # namespaces
        internal_link_prefix=WIKI_BASE_URI,
    )

    return preprocessor, parser
Beispiel #2
0
    def _grammar(self, templates):
        """Return the parser built by ``preprocessor.make_parser``.

        templates -- Passed through unchanged as the only argument to
                     ``preprocessor.make_parser``

        """
        return preprocessor.make_parser(templates)
Beispiel #3
0
    def _grammar(self, templates):
        """Return the parser built by ``preprocessor.make_parser``.

        templates -- Passed through unchanged as the only argument to
                     ``preprocessor.make_parser``

        """
        return preprocessor.make_parser(templates)
Beispiel #4
0
def mediawiki(pad):
    """Render the wikitext stored in ``pad.content`` and return the markup.

    ``pad.content`` is decoded as UTF-8, run through the preprocessor and
    then through the HTML parser, both built with empty settings.  The
    ``<body>`` and ``</body>`` tags are stripped from the parser's output
    before it is returned.
    """
    preprocessor = make_parser({})
    html_parser = make_html_parser([], [], [], {}, {})

    wikitext = pad.content.decode("Utf-8")
    tree = preprocessor.parse(wikitext)
    rendered = html_parser.parse(tree.leaves()).value

    # Drop the wrapping body element, keeping only its contents.
    for tag in ("<body>", "</body>"):
        rendered = rendered.replace(tag, "")
    return rendered
Beispiel #5
0
def testit(content):
    """Parse ``content`` in test mode and store the leaves in global ``foo``.

    The preprocessor is built without templates; the HTML parser allows only
    the ``PRE`` tag.  Nothing is returned: the result of ``.leaves()`` on the
    HTML parse is assigned to the module-level name ``foo``.
    """
    global foo

    preprocess = preprocessor.make_parser({})
    parser = html.make_parser(["PRE"], [], [], {}, {})

    # Debugging aids, left disabled:
    #parser._setTopPattern('wikitext')
    #parser = raw.make_parser()
    #import pdb; pdb.set_trace()
    #Pattern.TRACE=True

    foo = parser.parseTest(preprocess.parseTest(content)).leaves()
Beispiel #6
0
 def _preprocessor(self, templates):
     """Return the parser that ``preprocessor.make_parser`` builds for ``templates``."""
     return preprocessor.make_parser(templates)
Beispiel #7
0
 def _preprocessor(self, templates):
     """Return the parser that ``preprocessor.make_parser`` builds for ``templates``."""
     return preprocessor.make_parser(templates)
Beispiel #8
0
# Module-level configuration for the HTML parser and the preprocessor.
# NOTE(review): ``allowed_tags`` is used below but is not defined in this
# part of the file -- confirm it is assigned earlier.

# Passed as the second argument to html.make_parser.
allowed_autoclose_tags = ['br', 'hr']
# Passed as the third argument to html.make_parser.
allowed_parameters = ['class', 'style', 'name', 'id', 'scope']
# Interwiki prefix -> base URL.
interwiki = {
    'en': 'http://en.wikipedia.org/wiki/',
    'fr': 'http://fr.wikipedia.org/wiki/'
}
# Namespace name -> integer; presumably MediaWiki namespace numbers
# (TODO confirm).  Several names map to the same number.
namespaces = {
    'Template': 10,
    u'Catégorie': 14,
    'Category': 14,
    'File': 6,
    'Image': 6
}
# HTML parser built from the settings above.
parser = html.make_parser(allowed_tags, allowed_autoclose_tags,
                          allowed_parameters, interwiki, namespaces)
# Preprocessor built with an empty templates dictionary.
preprocessor_parser = preprocessor.make_parser({})
# Parsed HTML fragment holding the "siteSub" and "contentSub" divs.
siteSubElem = lxml.html.fromstring(
    '<div class="siteSub">From Fakipedia, the fake Wikipedia</div><div class="contentSub"/>'
)


def preprocess(source):
    source = source.replace("\n ", "\n") \
                  .replace(" \n", "\n") \
                  .replace("= ", "=") \
                  .replace(" =", "=") \
                  .replace("@ ", " ") \
                  .replace(" @", " ") \
                  .strip()
    source_split = source.split("\n")
    # fixing title
Beispiel #9
0
            if line == "":
                break
            while line[0] == " ":
                line = line[1:]
            if line == "</page>\n":
                temp_page += line
                n += 1
                pages.append(temp_page)
                # w = Wikipedia(temp_page)
            elif line == "<page>\n":
                temp_page = line
            else:
                temp_page += line
    return pages


# 3176788 pages
if __name__ == "__main__":
    # Demo run: read the pages of the "zhwiki" dump, convert each one to a
    # dictionary and preprocess the wikitext of the third page.
    fn = "zhwiki"
    data = chunky_read(fn)

    dics = []
    for item in data:
        # Serialise the converted XML to JSON text and parse it straight
        # back (presumably to end up with plain dicts -- confirm).
        dics.append(loads(dumps(bf.data(fromstring(item)))))

    d = dics[2]
    text = d["page"]["revision"]["text"]["$"]

    templates = {}
    preprocessor = make_parser(templates)
    output = preprocessor.parse(text)
Beispiel #10
0
def parse_data_to_markup(source, dest, format_='yaml',
                         template='standard_entry.md.jinja'):
    """Parse a source data file and write the rendered markup to ``dest``.

    For every format except ``wiki`` the file is loaded into a dictionary,
    the source path is stored under its ``source`` key, and the dictionary
    is rendered through a Jinja template loaded from ``tools/templates``.
    For ``wiki`` the file is run through the mediawiki preprocessor and HTML
    parser and the resulting markup is written directly, without Jinja.

    Args:
        source (string): Path of the file to read and parse.
        dest (file): File-like object to write the rendered output to.

    Kwargs:
        format_ (string): Format of the source file: one of ``yaml``,
            ``hjson``, ``cfg``, ``plist`` or ``wiki``. Default is ``yaml``.
        template (string): Name of the Jinja template to render. Ignored
            for the ``wiki`` format.

    Raises:
        RuntimeError: If ``format_`` is not one of the supported formats.
    """
    if format_ == 'yaml':
        with open(source, 'r') as f:
            # safe_load rather than a bare yaml.load(f): the latter can
            # construct arbitrary Python objects from the file, and
            # PyYAML >= 6 rejects it because Loader became mandatory.
            data = yaml.safe_load(f)
    elif format_ == 'hjson':
        import hjson
        with open(source, 'r') as f:
            data = hjson.load(f)
    elif format_ == 'cfg':
        # config parser needs the most... massaging
        config = ConfigParser.RawConfigParser()
        config.read(source)

        # Values in the [trip] section carry literal "\n" sequences that
        # stand for real line breaks.
        data = {key: value.replace('\\n', '\n')
                for key, value in config.items('trip')}

        # A real list, not a lazy map object, so the template can iterate
        # over the guests more than once.
        data['guest_list'] = [value for _, value in config.items('guests')]
    elif format_ == 'plist':
        import plistlib
        if hasattr(plistlib, 'load'):
            # readPlist was removed in Python 3.9; load needs a binary file.
            with open(source, 'rb') as f:
                data = plistlib.load(f)
        else:
            data = plistlib.readPlist(source)
    elif format_ == 'wiki':
        from mediawiki_parser.html import make_parser as make_parser_html
        from mediawiki_parser.preprocessor import make_parser
        preprocessor = make_parser({})

        parser = make_parser_html([], [], [], {}, {})

        with open(source, 'r') as f:
            preprocessed_text = preprocessor.parse(f.read())

        output = parser.parse(preprocessed_text.leaves())

        # Wiki input is written as-is; no Jinja template is involved.
        dest.write(output.value)

        return
    else:
        raise RuntimeError("No usable format given to data parser!")

    loader = jinja2.FileSystemLoader('tools/templates')
    env = jinja2.Environment(loader=loader)

    jinja_template = env.get_template(template)

    # Expose the originating path to the template.
    data['source'] = source

    dest.write(jinja_template.render(**data))