def convert_to_strings(wikipage):
    """Given a wikipage object, return a structured dictionary that holds
    all information from the wikipage, converted to Traditional Chinese."""
    from hanziconv import HanziConv
    import wikitextparser as wtp
    try:
        # pformat() pretty-formats the wikitext (older wikitextparser
        # releases called this method pprint()).
        summary = HanziConv.toTraditional(
            wtp.parse(wikipage.content).sections[0].pformat())
    except Exception:
        summary = None
    try:
        sections = [HanziConv.toTraditional(
            sec.pformat()) for sec in wtp.parse(wikipage.content).sections[1:]]
        try:
            sub_titles = [HanziConv.toTraditional(
                sec.title[1:-1]) for sec in wtp.parse(wikipage.content).sections[1:]]
        except Exception:
            sub_titles = None
        try:
            # Drop the heading line, keeping only each section's body.
            section_content = [s[s.find('\n') + 1:] for s in sections]
        except Exception:
            section_content = None
    except Exception:
        sections = None

    try:
        sections = list(zip(sub_titles, section_content))
    except Exception:
        sections = None
    try:
        links = wikipage.links
    except Exception:
        links = None
    return {'title': wikipage.title, 'summary': summary,
            'sections': sections, 'links': links}
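# A minimal usage sketch, assuming `wikipage` comes from the `wikipedia`
# package (an assumption; any object with .title, .content and .links
# attributes would work):
import wikipedia

wikipedia.set_lang('zh')
page = wikipedia.page('維基百科')
record = convert_to_strings(page)
print(record['title'], record['summary'])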
Example #2
def extract_cc(wt):
    def get_ccs(link):
        r = []
        for i in [link.target, link.text]:
            if i and i.strip().startswith('+'):
                r.append([x.strip()[1:].replace(' ', '') for x in i.split(',')])
        # Prefer the candidate with the most codes (ties broken by the
        # length of the first code).
        r = sorted(r, key=lambda t: (len(t), len(t[0])))
        if r:
            return r[-1]
        else:
            return []

    h = {}
    section = None
    for x in wt.sections:
        # The lead section's title is None, so guard before testing it.
        if x.title and 'Alphabetical listing' in x.title:
            section = x
    # data() returns the table cells (older wikitextparser releases called
    # this method getdata()); skip the header row.
    rows = section.tables[0].data()[1:]
    for row in rows:
        for link in wtp.parse(row[1]).wikilinks:
            ccs = get_ccs(link)
            for cc in ccs:
                if cc:
                    if cc in h:
                        if row[0] in h[cc]:
                            continue
                        h[cc] = h[cc] + '/' + row[0]
                    else:
                        h[cc] = row[0]
    h = sorted(h.items(), key=lambda t: t[0])
    return h
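# Hedged usage sketch: `wt` is assumed to be the parsed wikitext of the
# "List of country calling codes" Wikipedia article (Example #4 below shows
# how it is fetched). The result maps calling codes to '/'-joined countries:
import wikitextparser as wtp

with open('cc.wtext') as f:  # hypothetical cached copy of the article
    wt = wtp.parse(f.read())
cc_map = extract_cc(wt)      # e.g. [('1', ...), ('20', ...), ('211', ...)]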
Example #3
def save_page(wikipage):
    wt_parsed = wtp.parse(wikipage.wikitext)
    categories = []

    # ### parse categories from wikilinks
    # The category links may not be the very last links on the page, so walk
    # the wikilinks in reverse and stop once the contiguous run of
    # "Category:" links ends.
    cat_link_started = False
    for link in reversed(wt_parsed.wikilinks):
        if link.target.startswith("Category:"):
            cat_link_started = True
            categories.append(link.target.replace("Category:", ""))
        else:
            if cat_link_started:
                break
    # ###

    new_article = WikipediaArticle(
        title=wikipage.title,
        pageid=int(wikipage.pageid),

        wikitext=wikipage.wikitext,
        content=wikipage.content,
        summary=wikipage.summary,

        categories=categories,
        wikilinks=[WikiLink(target=l.target, text=l.text) for l in
                   wt_parsed.wikilinks],

        # sections=wt_parsed.sections,
    )
    new_article.save()
    return new_article
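# WikipediaArticle and WikiLink are not defined in this example. A minimal
# sketch of what they might look like as MongoEngine documents (hypothetical;
# the original models could differ):
from mongoengine import Document, EmbeddedDocument, fields

class WikiLink(EmbeddedDocument):
    target = fields.StringField()
    text = fields.StringField()

class WikipediaArticle(Document):
    title = fields.StringField(required=True)
    pageid = fields.IntField(required=True)
    wikitext = fields.StringField()
    content = fields.StringField()
    summary = fields.StringField()
    categories = fields.ListField(fields.StringField())
    wikilinks = fields.EmbeddedDocumentListField(WikiLink)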
Example #4
def run(args):
    with open(args.output, 'w') as output:
        print('''-- Lookup tables for various mobile telecommunications related codes
--
-- Autogenerated with tel-code-gen.py
''', file = output)
        if args.all:
            args.cc      = True
            args.cur     = True
            args.alpha3  = True
            args.carrier = True
            args.area    = True
            args.mcc     = True
            args.mnc     = True
        if args.cur:
            iso_cur_map = aggregate_iso_cur()
            pp_lua_iso_cur_map(iso_cur_map, output)
        if args.alpha3:
            iso_alpha3_map = aggregate_iso_alpha3()
            pp_lua_iso_alpha3_map(iso_alpha3_map, output)
        if args.carrier:
            carrier_code_map = mk_carrier_code_map()
            pp_lua_carrier_code_map(carrier_code_map, output)
        if args.area:
            area_code_map = mk_area_code_map()
            pp_lua_area_code_map(area_code_map, output)
        if args.cc:
            t = fetch('cc.wtext', wp_url.format('List_of_country_calling_codes'),
                    args)
            wu = wtp.parse(t)
            cc_map = extract_cc(wu)
            pp_lua_cc_map(cc_map, output, args)
        if args.mcc or args.mnc:
            s = fetch('mcc.wtext', wp_url.format('Mobile_country_code'), args)
            wt = wtp.parse(s)
            rows = extract_tables_wt(wt)
            if args.mcc:
                mcc_map = aggregate_mcc(rows)
                mcc_to_country_map = mk_mcc_to_country_map(mcc_map)
                pp_lua_mcc_map(mcc_to_country_map, output)
            if args.mnc:
                mcc_mnc_map = aggregate_mcc_mnc(rows)
                pp_lua_mcc_mnc_map(mcc_mnc_map, output)
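# Hypothetical argument parsing that would drive run() above (a sketch; the
# real script's flags and defaults may differ):
import argparse

def main():
    ap = argparse.ArgumentParser(
        description='Generate Lua lookup tables for mobile telecom codes')
    ap.add_argument('--output', default='tel-code.lua')
    for flag in ('all', 'cc', 'cur', 'alpha3', 'carrier', 'area', 'mcc', 'mnc'):
        ap.add_argument('--' + flag, action='store_true')
    run(ap.parse_args())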
Example #5
def format_operator(s):
    r = ''
    p = wtp.parse(s)
    if len(p.wikilinks) >= 1:
        # Prefer the wikilink's display text; fall back to its target.
        t = p.wikilinks[0].text
        if t:
            r = t
        else:
            r = p.wikilinks[0].target
    elif len(p.external_links) >= 1:
        r = p.external_links[0].text
    else:
        r = s
    # Strip any HTML tags and unescape ampersands.
    r = re.sub('<[^>]*>', '', r)
    r = r.replace('&amp;', '&').strip()
    return r
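# Quick demonstration of format_operator on the three input shapes it handles
# (wikilink, external link, plain text):
print(format_operator('[[Vodafone UK|Vodafone]]'))         # -> 'Vodafone'
print(format_operator('[http://example.com Example Co]'))  # -> 'Example Co'
print(format_operator('AT&amp;T <small>(MVNO)</small>'))   # -> 'AT&T (MVNO)'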
def get_subtitle(sentence_df: pd.DataFrame, wiki_dump_data: list):
    df = sentence_df.assign(heading='')
    new_train_df = pd.DataFrame()
    for _id in df._id.unique():
        article_df = df.loc[df._id == _id]

        row_article = [entry for entry in wiki_dump_data
                       if entry['index']['_id'] == _id][0]
        parsed = wtp.parse(row_article['source_text'])
        for source in parsed.sections[1:]:
            heading = _search_subtitle(source.string)
            section_text = _clean_source_text(source)
            article_df = _get_subtitle_of_sentence(article_df, section_text, heading)

        article_df = _complement_subtitle(article_df)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent replacement.
        new_train_df = pd.concat([new_train_df, article_df])

    return new_train_df
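# The private helpers above (_search_subtitle, _clean_source_text, etc.) are
# not shown in this snippet. As an illustration only, _search_subtitle might
# pull the heading text out of a section's raw wikitext like this
# (hypothetical sketch, not the original implementation):
import re

def _search_subtitle(section_string: str) -> str:
    m = re.match(r'\s*(=+)(.*?)\1', section_string)
    return m.group(2).strip() if m else ''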
def test_tabs_in_heading():
    """Test that insert parses the inserted part."""
    t = '=\tt\t=\t'
    assert str(parse(t).sections[1]) == t
def test_repr():
    assert repr(parse('')) == "WikiText('')"
def test_starting_boundary():
    assert not parse('turn:a').external_links
def test_ignore_head_apostrophes():
    b, = parse("''''''''a").get_italics()
    assert b.string == "'''''a"
def test_single_bold_italic():
    i, = parse("'''''a").get_italics()
    assert i.text == "'''a"
def test_multiline_italics():
    a, b = parse("'''a''\n'''b''").get_italics()
    assert a.string == "''a''"
    assert b.string == "''b''"
def test_first_space_condition_in_doquotes_not_used():
    b, = parse("'''a'' '''b'' '''c''").get_bolds()
    assert b.string == "'''b'' '''"
def test_pre():  # 46
    assert len(parse('<pre></pre>').get_tags()) == 1
def ab(s: str, o: str, r: bool = True):
    assert parse(s).get_bolds(r)[0].string == o
def test_self_closing():
    # extension tag
    assert parse('<references />').get_tags()[0].string == '<references />'
    # HTML tag
    assert parse('<s / >').get_tags()[0].string == '<s / >'
def test_inner_tag():
    parsed = parse('<br><s><b>sb</b></s>')
    s = parsed.get_tags('s')[0]
    assert s.string == '<s><b>sb</b></s>'
    assert s.get_tags()[0].string == '<b>sb</b>'
def test_assume_that_templates_do_not_exist():
    # This is actually an invalid <s> tag on English Wikipedia, i.e. the
    # result of {{para}} makes it invalid.
    assert len(parse('<s {{para|a}}></s>').get_tags('s')) == 1
def test_section_templates():
    """section.templates returns templates only from that section."""
    templates = parse('{{t1}}\n==section==\n{{t2}}').sections[1].templates
    assert len(templates) == 1
    assert templates[0].string == '{{t2}}'
def test_deleting_a_section_wont_corrupt_others():
    z, a, b, c = parse('=a=\na\n==b==\nb\n==c==\nc').sections
    del b.string
    assert c.string == '==c==\nc'
def test_do_not_return_duplicate_bolds_italics():  # 42
    assert len(parse("{{a|{{b|'''c'''}}}}").get_bolds()) == 1
    assert len(parse("[[file:a.jpg|[[b|''c'']]]]").get_italics()) == 1
def anb(s: str):
    assert not parse(s).get_bolds(True)
def test_do_not_include_end_tag():
    assert parse('<div>[http://a]</div>').plain_text() == ''
def test_extension_tags():
    a, b = parse('<ref/><ref/>')._extension_tags
    assert a._extension_tags == []
def test_first_single_letter_word_condition_in_doquotes():
    b, = parse("'''a'' b'''c'' '''d''").get_bolds()
    assert b.string == "'''a'' b'''c'' '''"
def ai(s: str, o: str, r: bool = True):
    italics = parse(s).get_italics(r)
    assert len(italics) == 1
    assert italics[0].string == o
def test_first_space_condition_in_balanced_quotes_shadow():
    b, = parse("a '''b'' '''c'' '''d''").get_bolds()
    assert b.string == "'''c'' '''"
Example #28
import wikitextparser as wtp

# Split a MediaWiki dump into one text file per section, dropping the
# '== Plot ==' heading.
with open("./clean.mw") as file:
    string = file.read()

story_text = wtp.parse(string)

for num, section in enumerate(story_text.sections):
    text = section.string.replace('== Plot ==', '')
    filename = "story" + str(num) + ".txt"
    with open(filename, "w") as text_file:
        text_file.write(text)
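# A small sketch of the parser behavior the script above relies on:
# sections[0] is the lead text before the first heading, and each section's
# .string starts with its heading line (subsections are nested inside it).
demo = wtp.parse('intro\n== Plot ==\nSomething happens.')
assert demo.sections[0].string == 'intro\n'
assert demo.sections[1].string == '== Plot ==\nSomething happens.'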
def test_bold_ends_4_apostrophes():
    b, = parse("''a'''b''''").get_bolds()
    assert b.text == "b'"
def test_italic_end_token():
    assert parse("''i''").get_italics(False)[0].end_token is True
def test_bolds_italics_span_data_reuse():
    p = parse("'''b''' ''i''")
    b0, i0 = p.get_bolds_and_italics()
    b1, i1 = p.get_bolds_and_italics()
    assert i0._span_data is i1._span_data
    assert b0._span_data is b1._span_data
def ap(s, p):
    assert parse(s).plain_text() == p
def test_with_nowiki():
    assert parse('[http://a.b <nowiki>[c]</nowiki>]').external_links[0].text \
           == '<nowiki>[c]</nowiki>'
def test_plain_text_should_not_mutate():  # 40
    p = parse('[[a]][[b]]')
    a, b = p.wikilinks
    assert a.plain_text() == 'a'
    assert b.plain_text() == 'b'
def test_not_every_sooner_starting_span_is_a_parent():
    a, b = parse('[[a]][[b]]').wikilinks
    assert b.ancestors() == []
def test_multiline_with_carriage_return():
    s = 'text\r\n= s =\r\n{|\r\n| a \r\n|}\r\ntext'
    p = parse(s)
    assert 'text\r\n' == p.sections[0].string