def convert_to_strings(wikipage):
    # Given a wikipage object, return a structured dictionary that holds
    # all information from the wikipage.
    from hanziconv import HanziConv
    import wikitextparser as wtp
    # NOTE: Section.pprint() below is from older wikitextparser releases;
    # newer versions renamed it to pformat().
    try:
        summary = HanziConv.toTraditional(
            wtp.parse(wikipage.content).sections[0].pprint())
    except Exception:
        summary = None
    try:
        sections = [HanziConv.toTraditional(sec.pprint())
                    for sec in wtp.parse(wikipage.content).sections[1:]]
        try:
            # Section.title keeps one space on each side of the heading
            # text ('== T ==' -> ' T '); trim them.
            sub_titles = [HanziConv.toTraditional(sec.title[1:-1])
                          for sec in wtp.parse(wikipage.content).sections[1:]]
        except Exception:
            sub_titles = None
        try:
            # Drop the heading line, keeping only the section body.
            section_content = [s[s.find('\n') + 1:] for s in sections]
        except Exception:
            section_content = None
    except Exception:
        sections = None
    try:
        sections = list(zip(sub_titles, section_content))
    except Exception:
        sections = None
    try:
        links = wikipage.links
    except Exception:
        links = None
    return {'title': wikipage.title,
            'summary': summary,
            'sections': sections,
            'links': links}

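# Usage sketch for convert_to_strings -- a minimal demo, assuming the
# `wikipedia` and `hanziconv` PyPI packages (the page object only needs
# .title, .content and .links); the page name is a made-up example.
import wikipedia

wikipedia.set_lang('zh')
page = wikipedia.page('Python')  # hypothetical example page
info = convert_to_strings(page)
print(info['title'], info['summary'])
for sub_title, body in info['sections'] or []:
    print(sub_title)
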
def extract_cc(wt):
    def get_ccs(link):
        # Collect calling codes (e.g. '+44') from the link target or text,
        # preferring the longest, most specific list found.
        r = []
        for i in [link.target, link.text]:
            if i and i.strip().startswith('+'):
                r.append([x.strip()[1:].replace(' ', '')
                          for x in i.split(',')])
        r = sorted(r, key=lambda t: (len(t), len(t[0])))
        if r:
            return r[-1]
        return []

    h = {}
    section = None
    for x in wt.sections:
        # The lead section has no title (None), so guard before matching.
        if x.title and 'Alphabetical listing' in x.title:
            section = x
    # getdata() is the table accessor in older wikitextparser releases;
    # newer versions call it data().
    rows = section.tables[0].getdata()[1:]
    for row in rows:
        for link in wtp.parse(row[1]).wikilinks:
            for cc in get_ccs(link):
                if not cc:
                    continue
                if cc in h:
                    if row[0] in h[cc]:
                        continue
                    h[cc] = h[cc] + '/' + row[0]
                else:
                    h[cc] = row[0]
    return sorted(h.items(), key=lambda t: t[0])

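# Standalone usage sketch for extract_cc. The surrounding script has its
# own fetch() helper (see run() below); for a quick demo, plain requests
# against MediaWiki's raw-wikitext endpoint works:
import requests
import wikitextparser as wtp

raw = requests.get(
    'https://en.wikipedia.org/w/index.php',
    params={'title': 'List_of_country_calling_codes', 'action': 'raw'},
).text
for cc, countries in extract_cc(wtp.parse(raw)):
    print(cc, countries)
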
def save_page(wikipage):
    wt_parsed = wtp.parse(wikipage.wikitext)
    categories = []
    # ### Parse categories from wikilinks.
    # The category links may not be the last few links, so walk the list
    # in reverse and stop once we leave the trailing category block.
    cat_link_started = False
    for link in reversed(wt_parsed.wikilinks):
        if link.target.startswith("Category:"):
            cat_link_started = True
            categories.append(link.target.replace("Category:", ""))
        elif cat_link_started:
            break
    # ###
    new_article = WikipediaArticle(
        title=wikipage.title,
        pageid=int(wikipage.pageid),
        wikitext=wikipage.wikitext,
        content=wikipage.content,
        summary=wikipage.summary,
        categories=categories,
        wikilinks=[WikiLink(target=l.target, text=l.text)
                   for l in wt_parsed.wikilinks],
        # sections=wt_parsed.sections,
    )
    new_article.save()
    return new_article

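# WikipediaArticle and WikiLink are defined elsewhere in the project; a
# minimal sketch of what they might look like, assuming a MongoEngine
# document model (field names are taken from the call above, everything
# else is an assumption):
from mongoengine import Document, EmbeddedDocument, fields

class WikiLink(EmbeddedDocument):
    target = fields.StringField(required=True)
    text = fields.StringField()  # piped link text may be absent

class WikipediaArticle(Document):
    title = fields.StringField(required=True)
    pageid = fields.IntField()
    wikitext = fields.StringField()
    content = fields.StringField()
    summary = fields.StringField()
    categories = fields.ListField(fields.StringField())
    wikilinks = fields.EmbeddedDocumentListField(WikiLink)
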
def run(args):
    with open(args.output, 'w') as output:
        print('''-- Lookup tables for various mobile telecommunications related codes
--
-- Autogenerated with tel-code-gen.py
''', file=output)
        if args.all:
            args.cc = True
            args.cur = True
            args.alpha3 = True
            args.carrier = True
            args.area = True
            args.mcc = True
            args.mnc = True
        if args.cur:
            iso_cur_map = aggregate_iso_cur()
            pp_lua_iso_cur_map(iso_cur_map, output)
        if args.alpha3:
            iso_alpha3_map = aggregate_iso_alpha3()
            pp_lua_iso_alpha3_map(iso_alpha3_map, output)
        if args.carrier:
            carrier_code_map = mk_carrier_code_map()
            pp_lua_carrier_code_map(carrier_code_map, output)
        if args.area:
            area_code_map = mk_area_code_map()
            pp_lua_area_code_map(area_code_map, output)
        if args.cc:
            t = fetch('cc.wtext', wp_url.format('List_of_country_calling_codes'), args)
            wu = wtp.parse(t)
            cc_map = extract_cc(wu)
            pp_lua_cc_map(cc_map, output, args)
        if args.mcc or args.mnc:
            s = fetch('mcc.wtext', wp_url.format('Mobile_country_code'), args)
            wt = wtp.parse(s)
            rows = extract_tables_wt(wt)
            if args.mcc:
                mcc_map = aggregate_mcc(rows)
                mcc_to_country_map = mk_mcc_to_country_map(mcc_map)
                pp_lua_mcc_map(mcc_to_country_map, output)
            if args.mnc:
                mcc_mnc_map = aggregate_mcc_mnc(rows)
                pp_lua_mcc_mnc_map(mcc_mnc_map, output)

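# Hypothetical command-line driver for run(); the real script presumably
# defines its own parser, so the flag names below are only inferred from
# the attribute checks above.
import argparse

if __name__ == '__main__':
    ap = argparse.ArgumentParser(
        description='Generate Lua lookup tables for telecom codes')
    for flag in ('all', 'cc', 'cur', 'alpha3', 'carrier', 'area',
                 'mcc', 'mnc'):
        ap.add_argument('--' + flag, action='store_true')
    ap.add_argument('--output', default='tel-codes.lua')
    run(ap.parse_args())
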
def format_operator(s):
    # Reduce an operator table cell to a plain name: prefer the wikilink
    # label, then the external-link label, then the raw cell text.
    r = ''
    p = wtp.parse(s)
    if len(p.wikilinks) >= 1:
        t = p.wikilinks[0].text
        r = t if t else p.wikilinks[0].target
    elif len(p.external_links) >= 1:
        r = p.external_links[0].text
    else:
        r = s
    r = re.sub('<[^>]*>', '', r)  # strip residual HTML tags
    r = r.replace('&amp;', '&').strip()  # unescape ampersands
    return r

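# Quick sanity checks for format_operator (run inside the module, which is
# assumed to import re and wikitextparser as wtp). The inputs are invented
# examples of the cell formats the branches above handle, not rows from
# the real page:
assert format_operator('[[Vodafone UK|Vodafone]]') == 'Vodafone'
assert format_operator('[[Vodafone UK]]') == 'Vodafone UK'
assert format_operator('[http://example.com Example Op]') == 'Example Op'
assert format_operator('AT&amp;T <sup>note</sup>') == 'AT&T note'
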
def get_subtitle(sentence_df: pd.DataFrame, wiki_dump_data: list):
    df = sentence_df.assign(heading='')
    new_train_df = pd.DataFrame()
    for _id in df._id.unique():
        article_df = df.loc[df._id == _id]
        row_article = [entry for entry in wiki_dump_data
                       if entry['index']['_id'] == _id][0]
        parsed = wtp.parse(row_article['source_text'])
        for source in parsed.sections[1:]:
            heading = _search_subtitle(source.string)
            section_text = _clean_source_text(source)
            article_df = _get_subtitle_of_sentence(
                article_df, section_text, heading)
        article_df = _complement_subtitle(article_df)
        # DataFrame.append was removed in pandas 2.0; concatenate instead.
        new_train_df = pd.concat([new_train_df, article_df])
    return new_train_df

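# The _search_subtitle/_clean_source_text/_get_subtitle_of_sentence
# helpers are private to the original project. The input shapes assumed
# here are inferred from the lookups above, not confirmed by the source:
sentence_df = pd.DataFrame(
    {'_id': ['42', '42'],
     'sentence': ['lead sentence.', 'body sentence.']})
wiki_dump_data = [  # Elasticsearch-style bulk dump entries
    {'index': {'_id': '42'},
     'source_text': 'lead text\n== History ==\nbody text'},
]
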
def test_tabs_in_heading():
    """Tabs around a heading's text and after it should survive parsing."""
    t = '=\tt\t=\t'
    assert str(parse(t).sections[1]) == t

def test_repr():
    assert repr(parse('')) == "WikiText('')"

def test_starting_boundary():
    assert not parse('turn:a').external_links

def test_ignore_head_apostrophes():
    b, = parse("''''''''a").get_italics()
    assert b.string == "'''''a"

def test_single_bold_italic():
    i, = parse("'''''a").get_italics()
    assert i.text == "'''a"

def test_multiline_italics():
    a, b = parse("'''a''\n'''b''").get_italics()
    assert a.string == "''a''"
    assert b.string == "''b''"

def test_first_space_condition_in_doquotes_not_used():
    b, = parse("'''a'' '''b'' '''c''").get_bolds()
    assert b.string == "'''b'' '''"

def test_pre():  # 46
    assert len(parse('<pre></pre>').get_tags()) == 1

def ab(s: str, o: str, r: bool = True):
    # Helper: the first bold in s should serialize exactly as o.
    assert parse(s).get_bolds(r)[0].string == o

def test_self_closing():
    # extension tag
    assert parse('<references />').get_tags()[0].string == '<references />'
    # HTML tag
    assert parse('<s / >').get_tags()[0].string == '<s / >'

def test_inner_tag():
    parsed = parse('<br><s><b>sb</b></s>')
    s = parsed.get_tags('s')[0]
    assert s.string == '<s><b>sb</b></s>'
    assert s.get_tags()[0].string == '<b>sb</b>'

def test_assume_that_templates_do_not_exist():
    # This is actually an invalid <s> tag on English Wikipedia, i.e. the
    # result of {{para}} makes it invalid.
    assert len(parse('<s {{para|a}}></s>').get_tags('s')) == 1

def test_section_templates():
    """section.templates returns templates only from that section."""
    templates = parse('{{t1}}\n==section==\n{{t2}}').sections[1].templates
    assert len(templates) == 1
    assert templates[0].string == '{{t2}}'

def test_deleting_a_section_wont_corrupt_others():
    z, a, b, c = parse('=a=\na\n==b==\nb\n==c==\nc').sections
    del b.string
    assert c.string == '==c==\nc'

def test_do_not_return_duplicate_bolds_italics():  # 42
    assert len(parse("{{a|{{b|'''c'''}}}}").get_bolds()) == 1
    assert len(parse("[[file:a.jpg|[[b|''c'']]]]").get_italics()) == 1

def anb(s: str):
    # Helper: s should contain no bolds (recursive search).
    assert not parse(s).get_bolds(True)

def test_do_not_include_end_tag():
    assert parse('<div>[http://a]</div>').plain_text() == ''

def test_extension_tags():
    a, b = parse('<ref/><ref/>')._extension_tags
    assert a._extension_tags == []

def test_first_single_letter_word_condition_in_doquotes():
    b, = parse("'''a'' b'''c'' '''d''").get_bolds()
    assert b.string == "'''a'' b'''c'' '''"

def ai(s: str, o: str, r: bool = True):
    # Helper: s should contain exactly one italic that serializes as o.
    italics = parse(s).get_italics(r)
    assert len(italics) == 1
    assert italics[0].string == o

def test_first_space_condition_in_balanced_quotes_shadow():
    b, = parse("a '''b'' '''c'' '''d''").get_bolds()
    assert b.string == "'''c'' '''"

import wikitextparser as wtp

# Split a MediaWiki dump into one plain-text file per section.
with open("./clean.mw") as file:
    string = file.read()
story_text = wtp.parse(string)
for num, section in enumerate(story_text.sections):
    text = section.string.replace('== Plot ==', '')
    filename = "story" + str(num) + ".txt"
    with open(filename, "w") as text_file:
        text_file.write(text)

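# Expected ./clean.mw layout -- an assumption based on the replace() call
# above: one story per '== Plot ==' section, e.g.
#
#   == Plot ==
#   First story...
#   == Plot ==
#   Second story...
#
# story0.txt then receives any lead text, and each later storyN.txt one
# section body with its heading stripped.
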
def test_bold_ends_4_apostrophes():
    b, = parse("''a'''b''''").get_bolds()
    assert b.text == "b'"

def test_italic_end_token():
    assert parse("''i''").get_italics(False)[0].end_token is True

def test_bolds_italics_span_data_reuse():
    p = parse("'''b''' ''i''")
    b0, i0 = p.get_bolds_and_italics()
    b1, i1 = p.get_bolds_and_italics()
    assert i0._span_data is i1._span_data
    assert b0._span_data is b1._span_data

def ap(s, p):
    # Helper: the plain text of parsed s should equal p.
    assert parse(s).plain_text() == p

def test_with_nowiki():
    assert parse('[http://a.b <nowiki>[c]</nowiki>]').external_links[0].text \
        == '<nowiki>[c]</nowiki>'

def test_plain_text_should_not_mutate():  # 40
    p = parse('[[a]][[b]]')
    a, b = p.wikilinks
    assert a.plain_text() == 'a'
    assert b.plain_text() == 'b'

def test_not_every_sooner_starting_span_is_a_parent():
    a, b = parse('[[a]][[b]]').wikilinks
    assert b.ancestors() == []

def test_multiline_with_carriage_return():
    s = 'text\r\n= s =\r\n{|\r\n| a \r\n|}\r\ntext'
    p = parse(s)
    assert 'text\r\n' == p.sections[0].string