Example #1
0
def parseString(title=None, raw=None, wikidb=None, revision=None,
                lang=None, magicwords=None, expandTemplates=True):
    """Parse the article ``title`` from raw mediawiki text.

    :param title: title of the article; required.
    :param raw: wiki markup to parse.  When ``None`` the text is fetched
        with ``wikidb.normalize_and_get_page(title, 0)``.
    :param wikidb: optional wiki database.  When truthy it is used for
        template expansion and, if it provides them, ``get_siteinfo()``
        and ``getSource()``.
    :param revision: forwarded to ``wikidb.getSource`` and to the
        postprocessors.
    :param lang: language; when ``None`` and a wikidb is given, the
        language of the source is used.
    :param magicwords: magic words; when ``None`` and a wikidb is given,
        taken from the siteinfo or, failing that, from the source.
    :param expandTemplates: expand templates before parsing (only done
        when a wikidb is given).
    :returns: the result of ``compat.parse_txt`` with ``caption`` set and
        all postprocessors from ``mwlib.old_uparser`` applied.
    :raises AssertionError: if ``title`` is missing, or if ``raw`` is not
        given and the text cannot be obtained from ``wikidb``.
    """

    uniquifier = None
    siteinfo = None
    assert title is not None, 'no title given'

    if raw is None:
        # Without a wikidb there is nowhere to fetch the text from; fail
        # with a clear message instead of an AttributeError on None.
        assert wikidb is not None, \
            "cannot get article %r: no raw text and no wikidb given" % (title,)
        page = wikidb.normalize_and_get_page(title, 0)
        raw = page.rawtext if page else None

        assert raw is not None, "cannot get article %r" % (title,)

    input_raw = raw
    te = None
    if wikidb:
        if expandTemplates:
            te = expander.Expander(raw, pagename=title, wikidb=wikidb)
            input_raw = te.expandTemplates(True)
            # the parser must reuse the expander's uniquifier
            uniquifier = te.uniquifier
        if hasattr(wikidb, 'get_siteinfo'):
            siteinfo = wikidb.get_siteinfo()

        src = None
        if hasattr(wikidb, 'getSource'):
            src = wikidb.getSource(title, revision=revision)
            assert not isinstance(src, dict)

        if not src:
            src = metabook.source()

        if lang is None:
            lang = src.language

        if magicwords is None:
            # siteinfo takes precedence over the source's magic words
            if siteinfo is not None and 'magicwords' in siteinfo:
                magicwords = siteinfo['magicwords']
            else:
                magicwords = src.get('magicwords')

    if siteinfo is None:
        nshandler = nshandling.get_nshandler_for_lang(lang)
    else:
        nshandler = nshandling.nshandler(siteinfo)

    a = compat.parse_txt(input_raw, title=title, wikidb=wikidb,
                         nshandler=nshandler, lang=lang,
                         magicwords=magicwords, uniquifier=uniquifier,
                         expander=te)

    # a display title set during template expansion overrides the title
    a.caption = title
    if te and te.magic_displaytitle:
        a.caption = te.magic_displaytitle

    from mwlib.old_uparser import postprocessors
    for x in postprocessors:
        x(a, title=title, revision=revision, wikidb=wikidb, lang=lang)

    return a
Example #2
0
def parse_txt(txt, xopts=None, **kwargs):
    """Tokenize ``txt`` and run the combined parser chain over the tokens.

    Keyword arguments become parser options: they build a new ``XBunch``
    when ``xopts`` is ``None`` and are merged into ``xopts`` otherwise.
    Options ``expander``, ``nshandler`` and ``uniquifier`` that are
    ``None`` are replaced by defaults; ``imagemod`` is always set.

    Returns the tokens after the combined parser has been applied.
    """
    if xopts is not None:
        xopts.__dict__.update(**kwargs)
    else:
        xopts = XBunch(**kwargs)

    if xopts.expander is None:
        from mwlib.expander import Expander, DictDB
        xopts.expander = Expander("", "pagename", wikidb=DictDB())

    if xopts.nshandler is None:
        language = xopts.lang or 'en'
        xopts.nshandler = nshandling.get_nshandler_for_lang(language)

    xopts.imagemod = util.ImageMod(xopts.magicwords)

    uniquifier = xopts.uniquifier
    if uniquifier is None:
        # no uniquifier supplied: create one and let it replace the tags
        uniquifier = uniq.Uniquifier()
        txt = uniquifier.replace_tags(txt)
        xopts.uniquifier = uniquifier

    tokens = tokenize(txt, uniquifier=uniquifier)

    # (tag name, priority, extra keyword arguments), in registration order
    td2_tags = (
        ("code", 10, {}),
        ("span", 20, {}),
        ("li", 25, dict(blocknode=True, nested=False)),
        ("dl", 28, dict(blocknode=True)),
        ("dt", 26, dict(blocknode=True, nested=False)),
        ("dd", 26, dict(blocknode=True, nested=True)),
    )
    td1_tags = (
        ("blockquote", 5, {}),
        ("references", 15, {}),
        ("p", 30, dict(blocknode=True, nested=False)),
        ("ul", 35, dict(blocknode=True)),
        ("ol", 40, dict(blocknode=True)),
        ("center", 45, dict(blocknode=True)),
    )

    td2 = tagparser()
    for tagname, prio, options in td2_tags:
        td2.add(tagname, prio, **options)

    td1 = tagparser()
    for tagname, prio, options in td1_tags:
        td1.add(tagname, prio, **options)

    # headings h1..h6, each with its level as priority
    td_parse_h = tagparser()
    for level in range(1, 7):
        td_parse_h.add("h%s" % level, level)

    chain = [
        fixlitags,
        mark_style_tags,
        parse_singlequote,
        parse_preformatted,
        td2,
        parse_paragraphs,
        td1,
        parse_lines,
        parse_div,
        parse_links,
        parse_urls,
        parse_inputbox,
        td_parse_h,
        parse_sections,
        remove_table_garbage,
        fix_tables,
        parse_tables,
        parse_uniq,
        fix_named_url_double_brackets,
        fix_break_between_pre,
    ]

    run_all = combined_parser(chain)
    run_all(tokens, xopts)
    return tokens
Example #3
0
class nuwiki(object):
    """Wiki data read from a directory of JSON files and ``*.db`` stores."""

    def __init__(self, path, allow_pickle=False):
        """Load the wiki data stored below ``path``.

        :param path: directory containing the data; stored as an absolute
            path in ``self.path``.  The sub directory ``images/safe`` is
            created if it does not exist.
        :param allow_pickle: forwarded to every ``DumbJsonDB`` opened here.
        :raises OSError: if ``images/safe`` cannot be created for a reason
            other than it already existing.
        """
        import errno

        self.path = os.path.abspath(path)
        d = os.path.join(self.path, "images", "safe")
        if not os.path.exists(d):
            try:
                os.makedirs(d)
            except OSError as exc:
                # the directory may have been created between the
                # exists() check and makedirs(); that is not an error
                if exc.errno != errno.EEXIST:
                    raise

        self.excluded = set(x.get("title") for x in self._loadjson("excluded.json", []))

        self.revisions = {}
        self._read_revisions()

        fn = os.path.join(self.path, 'authors.db')
        if not os.path.exists(fn):
            self.authors = None
            log.warn('no authors present. parsing revision info instead')
        else:
            self.authors = DumbJsonDB(fn, allow_pickle=allow_pickle)

        # html: fall back to parsed_html.json when there is no html.db
        fn = os.path.join(self.path, 'html.db')
        if not os.path.exists(fn):
            self.html = self.extractHTML(self._loadjson("parsed_html.json", {}))
            # NOTE(review): message looks copied from the authors branch
            log.warn('no html present. parsing revision info instead')
        else:
            self.html = DumbJsonDB(fn, allow_pickle=allow_pickle)

        # imageinfo: fall back to imageinfo.json when there is no imageinfo.db
        fn = os.path.join(self.path, 'imageinfo.db')
        if not os.path.exists(fn):
            self.imageinfo = self._loadjson("imageinfo.json", {})
            # NOTE(review): message says pickle, the data comes from JSON
            log.warn('loading imageinfo from pickle')
        else:
            self.imageinfo = DumbJsonDB(fn, allow_pickle=allow_pickle)

        self.redirects = self._loadjson("redirects.json", {})
        self.siteinfo = self._loadjson("siteinfo.json", {})
        self.nshandler = nshandling.nshandler(self.siteinfo)
        self.en_nshandler = nshandling.get_nshandler_for_lang('en')
        self.nfo = self._loadjson("nfo.json", {})

        self.set_make_print_template()
Example #4
0
    def __init__(self, path, allow_pickle=False):
        """Load the wiki data stored below ``path``.

        ``path`` is made absolute and kept in ``self.path``; the sub
        directory ``images/safe`` is created when missing.
        ``allow_pickle`` is handed to every ``DumbJsonDB`` opened here.
        """
        self.path = os.path.abspath(path)

        safe_dir = os.path.join(self.path, "images", "safe")
        if not os.path.exists(safe_dir):
            try:
                os.makedirs(safe_dir)
            except OSError as exc:
                # errno 17: file exists -- anything else is a real error
                if exc.errno != 17:
                    raise

        excluded_entries = self._loadjson("excluded.json", [])
        self.excluded = set(entry.get("title") for entry in excluded_entries)

        self.revisions = {}
        self._read_revisions()

        # authors: no fallback besides None
        authors_db = os.path.join(self.path, 'authors.db')
        if os.path.exists(authors_db):
            self.authors = DumbJsonDB(authors_db, allow_pickle=allow_pickle)
        else:
            self.authors = None
            log.warn('no authors present. parsing revision info instead')

        # html: fall back to parsed_html.json
        html_db = os.path.join(self.path, 'html.db')
        if os.path.exists(html_db):
            self.html = DumbJsonDB(html_db, allow_pickle=allow_pickle)
        else:
            parsed_html = self._loadjson("parsed_html.json", {})
            self.html = self.extractHTML(parsed_html)
            log.warn('no html present. parsing revision info instead')

        # imageinfo: fall back to imageinfo.json
        imageinfo_db = os.path.join(self.path, 'imageinfo.db')
        if os.path.exists(imageinfo_db):
            self.imageinfo = DumbJsonDB(imageinfo_db, allow_pickle=allow_pickle)
        else:
            self.imageinfo = self._loadjson("imageinfo.json", {})
            log.warn('loading imageinfo from pickle')

        self.redirects = self._loadjson("redirects.json", {})

        siteinfo = self._loadjson("siteinfo.json", {})
        self.siteinfo = siteinfo
        self.nshandler = nshandling.nshandler(siteinfo)
        self.en_nshandler = nshandling.get_nshandler_for_lang('en')

        self.nfo = self._loadjson("nfo.json", {})

        self.set_make_print_template()
Example #5
0
def empty():
    """Return a new ``core.XBunch`` with ``nshandler`` set for language 'de'."""
    bunch = core.XBunch()
    de_handler = nshandling.get_nshandler_for_lang('de')
    bunch.nshandler = de_handler
    return bunch
Example #6
0
def empty():
    """Build an otherwise empty ``core.XBunch`` carrying the 'de' nshandler."""
    options = core.XBunch()
    options.nshandler = nshandling.get_nshandler_for_lang('de')
    return options
Example #7
0
File: fetch.py Project: ingob/mwlib
 def _get_nshandler(self):
     if self._nshandler is not None:
         return self._nshandler
     return nshandling.get_nshandler_for_lang('en') # FIXME
Example #8
0
def parseString(title=None,
                raw=None,
                wikidb=None,
                revision=None,
                lang=None,
                magicwords=None,
                expandTemplates=True):
    """Parse the article ``title`` from raw mediawiki text.

    :param title: title of the article; required.
    :param raw: wiki markup to parse.  When ``None`` the text is fetched
        with ``wikidb.normalize_and_get_page(title, 0)``.
    :param wikidb: optional wiki database.  When truthy it is used for
        template expansion and, if it provides them, ``get_siteinfo()``
        and ``getSource()``.
    :param revision: forwarded to ``wikidb.getSource`` and to the
        postprocessors.
    :param lang: language; when ``None`` and a wikidb is given, the
        language of the source is used.
    :param magicwords: magic words; when ``None`` and a wikidb is given,
        taken from the siteinfo or, failing that, from the source.
    :param expandTemplates: expand templates before parsing (only done
        when a wikidb is given).
    :returns: the result of ``compat.parse_txt`` with ``caption`` set and
        all postprocessors from ``mwlib.old_uparser`` applied.
    :raises AssertionError: if ``title`` is missing, or if ``raw`` is not
        given and the text cannot be obtained from ``wikidb``.
    """

    uniquifier = None
    siteinfo = None
    assert title is not None, 'no title given'
    if raw is None:
        # Without a wikidb there is nowhere to fetch the text from; fail
        # with a clear message instead of an AttributeError on None.
        assert wikidb is not None, (
            "cannot get article %r: no raw text and no wikidb given" %
            (title, ))
        page = wikidb.normalize_and_get_page(title, 0)
        if page:
            raw = page.rawtext
        else:
            raw = None

        assert raw is not None, "cannot get article %r" % (title, )

    # named ``wikitext`` so the builtin ``input`` is not shadowed
    wikitext = raw
    te = None
    if wikidb:
        if expandTemplates:
            te = expander.Expander(raw, pagename=title, wikidb=wikidb)
            wikitext = te.expandTemplates(True)
            # the parser must reuse the expander's uniquifier
            uniquifier = te.uniquifier
        if hasattr(wikidb, 'get_siteinfo'):
            siteinfo = wikidb.get_siteinfo()

        src = None
        if hasattr(wikidb, 'getSource'):
            src = wikidb.getSource(title, revision=revision)
            assert not isinstance(src, dict)

        if not src:
            src = metabook.source()

        if lang is None:
            lang = src.language
        if magicwords is None:
            # siteinfo takes precedence over the source's magic words
            if siteinfo is not None and 'magicwords' in siteinfo:
                magicwords = siteinfo['magicwords']
            else:
                magicwords = src.get('magicwords')

    if siteinfo is None:
        nshandler = nshandling.get_nshandler_for_lang(lang)
    else:
        nshandler = nshandling.nshandler(siteinfo)
    a = compat.parse_txt(wikitext,
                         title=title,
                         wikidb=wikidb,
                         nshandler=nshandler,
                         lang=lang,
                         magicwords=magicwords,
                         uniquifier=uniquifier,
                         expander=te)

    # a display title set during template expansion overrides the title
    a.caption = title
    if te and te.magic_displaytitle:
        a.caption = te.magic_displaytitle

    from mwlib.old_uparser import postprocessors
    for x in postprocessors:
        x(a, title=title, revision=revision, wikidb=wikidb, lang=lang)

    return a
Example #9
0
def parse_txt(txt, xopts=None, **kwargs):
    """Tokenize ``txt`` and apply the combined parser chain to the tokens.

    ``kwargs`` are the parser options: a new ``XBunch`` is built from them
    when ``xopts`` is ``None``, otherwise they are merged into ``xopts``.
    ``expander``, ``nshandler`` and ``uniquifier`` options that are ``None``
    get default values; ``imagemod`` is always (re)assigned.

    Returns the tokens after the combined parser has run.
    """
    if xopts is not None:
        xopts.__dict__.update(**kwargs)
    else:
        xopts = XBunch(**kwargs)

    if xopts.expander is None:
        from mwlib.expander import Expander, DictDB
        xopts.expander = Expander("", "pagename", wikidb=DictDB())

    if xopts.nshandler is None:
        xopts.nshandler = nshandling.get_nshandler_for_lang(
            xopts.lang or 'en')

    xopts.imagemod = util.ImageMod(xopts.magicwords)

    uniquifier = xopts.uniquifier
    if uniquifier is None:
        # nothing supplied by the caller: create one and replace the tags
        uniquifier = uniq.Uniquifier()
        txt = uniquifier.replace_tags(txt)
        xopts.uniquifier = uniquifier

    tokens = tokenize(txt, uniquifier=uniquifier)

    block = dict(blocknode=True)
    block_flat = dict(blocknode=True, nested=False)
    block_nested = dict(blocknode=True, nested=True)

    td2 = tagparser()
    for spec in [("code", 10, {}),
                 ("span", 20, {}),
                 ("li", 25, block_flat),
                 ("dl", 28, block),
                 ("dt", 26, block_flat),
                 ("dd", 26, block_nested)]:
        td2.add(spec[0], spec[1], **spec[2])

    td1 = tagparser()
    for spec in [("blockquote", 5, {}),
                 ("references", 15, {}),
                 ("p", 30, block_flat),
                 ("ul", 35, block),
                 ("ol", 40, block),
                 ("center", 45, block)]:
        td1.add(spec[0], spec[1], **spec[2])

    # h1..h6: the heading level doubles as the priority
    td_parse_h = tagparser()
    for depth in (1, 2, 3, 4, 5, 6):
        td_parse_h.add("h%s" % depth, depth)

    stages = [fixlitags, mark_style_tags, parse_singlequote,
              parse_preformatted, td2, parse_paragraphs, td1, parse_lines,
              parse_div, parse_links, parse_urls, parse_inputbox,
              td_parse_h, parse_sections, remove_table_garbage, fix_tables,
              parse_tables, parse_uniq, fix_named_url_double_brackets,
              fix_break_between_pre]

    combined_parser(stages)(tokens, xopts)
    return tokens
Example #10
0
def test_localized_redirect_matcher():
    """The 'de' redirect matcher accepts #REDIRECT and #WEITERLEITUNG."""
    matcher = nshandling.get_nshandler_for_lang("de").redirect_matcher
    redirects = (
        "#REDIRECT [[Data structure]]",
        "#WEITERLEITUNG [[Data structure]]",
    )
    for wikitext in redirects:
        assert matcher(wikitext) == "Data structure", "bad redirect"
Example #11
0
def test_redirect_matcher():
    """The 'en' redirect matcher returns the target without its section."""
    matcher = nshandling.get_nshandler_for_lang("en").redirect_matcher
    target = matcher("#REDIRECT [[Data structure#Active data structures]]")
    assert target == "Data structure", "bad redirect"
Example #12
0
 def _get_nshandler(self):
     if self._nshandler is not None:
         return self._nshandler
     return nshandling.get_nshandler_for_lang('en')  # FIXME
Example #13
0
def test_redirect_matcher():
    """A redirect to a page section resolves to the page title alone ('en')."""
    en_handler = nshandling.get_nshandler_for_lang("en")
    match_redirect = en_handler.redirect_matcher
    wikitext = "#REDIRECT [[Data structure#Active data structures]]"
    assert match_redirect(wikitext) == "Data structure", "bad redirect"
Example #14
0
def test_localized_redirect_matcher():
    """Both #REDIRECT and #WEITERLEITUNG are matched by the 'de' handler."""
    de_handler = nshandling.get_nshandler_for_lang("de")
    match_redirect = de_handler.redirect_matcher

    english_form = match_redirect("#REDIRECT [[Data structure]]")
    assert english_form == "Data structure", "bad redirect"

    german_form = match_redirect("#WEITERLEITUNG [[Data structure]]")
    assert german_form == "Data structure", "bad redirect"