# -*- coding: utf-8 -*-
"""Strip apostrophe-based emphasis markup from text.

Run as a script, this passes the input file and ``remove_emphasis`` to
``problem25.fundamental_data`` and writes the returned value as JSON.
"""
__author__ = 'todoroki'

import json
import re

import problem25

# Two apostrophes, optionally followed by more (group 1), the emphasised
# text (group 2), then two apostrophes and the same extra run again.
# Group 2 is lazy so that several emphasised spans on one line are matched
# one by one instead of being merged into a single match.
_EMPHASIS = re.compile(r"''('*)(.+?)''\1")


def remove_emphasis(string):
    """Return ``string`` with apostrophe emphasis markers removed.

    Every span of the form ``''text''``, ``'''text'''`` and so on (the
    opening and closing runs must have the same length) is replaced by the
    enclosed text alone.  Text outside such spans is returned unchanged.

    :param string: text to clean.
    :return: the text with matched emphasis markers stripped.
    """
    return _EMPHASIS.sub(r"\2", string)


if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental-rmEmpha.json'
    # NOTE(review): no encoding is specified for either file; confirm the
    # interpreter's default is suitable for the input data.
    # The input is opened in a ``with`` block so the handle is closed even
    # if fundamental_data raises.
    with open(inputfile) as f:
        res = problem25.fundamental_data(f, remove_emphasis)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)
"""Strip several kinds of wiki/HTML markup from text.

Run as a script, this passes the input file and ``remove_markup`` to
``problem25.fundamental_data`` and writes the returned value as JSON.
"""
__author__ = 'todoroki'

import json
import re

import problem25
import problem26
import problem27

# Applied in this order by remove_markup; each match is replaced by its
# first capture group.  Compiled once at import time instead of per call.
_MARKUPS = (
    # "[http://host/path label]" -> "label".  The label group is lazy and
    # always participates (it may be empty), so "\1" is valid even for a
    # link with no label, and two links on one line are handled separately.
    re.compile(r"\[https?://[a-zA-Z0-9\./]+\s(.*?)\]"),
    # "#REDIRECT target" -> "target".
    re.compile(r"#REDIRECT\s?(.+)"),
    # "<!-- text -->" -> the comment's inner text (delimiters dropped).
    re.compile(r"<!--\s?(.+)\s?-->"),
    # "{{...lang|code|text}}" -> "text".
    re.compile(r"\{\{.*[Ll]ang\|[a-zA-Z\-]+\|(.+)\}\}"),
    # Keep the text before "<ref", drop from "<ref" through a later ">".
    re.compile(r"(.*)<ref.+(</ref>)?>"),
    # Drop "<br>", "<br/>" and "<br />", keeping the text before the tag.
    re.compile(r"(.*?)<br\s?/?>"),
    # "<tag ...>text</tag>" (lower-case tag names) -> "text".
    re.compile(r"<[a-z]+.*>(.*?)</[a-z]+>"),
)


def remove_markup(string):
    """Return ``string`` with the supported markup constructs removed.

    The text is first passed through ``problem27.remove_internalLink``;
    each pattern in ``_MARKUPS`` is then applied in order, replacing every
    match with its first capture group.

    :param string: text to clean.
    :return: the cleaned text.
    """
    removed_string = problem27.remove_internalLink(string)
    for markup in _MARKUPS:
        removed_string = markup.sub(r"\1", removed_string)
    return removed_string


if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental-rmMUs.json'
    # NOTE(review): no encoding is specified for either file; confirm the
    # interpreter's default is suitable for the input data.
    # The input is opened in a ``with`` block so the handle is closed even
    # if fundamental_data raises.
    with open(inputfile) as f:
        res = problem25.fundamental_data(f, remove_markup)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)
# -*- coding: utf-8 -*-
"""Strip emphasis markup and ``[[...]]`` internal links from text.

Run as a script, this passes the input file and ``remove_internalLink`` to
``problem25.fundamental_data`` and writes the returned value as JSON.
"""
__author__ = 'todoroki'

import json
import re

import problem25
import problem26

# "[[target|label]]" or "[[label]]".  Group 1 is the optional "target|"
# prefix, group 2 the target alone, group 3 the remaining text up to the
# first "]]".  Compiled once at import time instead of per call.
_INTERNAL_LINK = re.compile(r"\[\[((.+?)\|)?(.+?)\]\]")


def remove_internalLink(string):
    """Return ``string`` with emphasis and internal-link markup removed.

    The text is first passed through ``problem26.remove_emphasis``; every
    ``[[...]]`` link is then replaced by its third capture group, i.e. the
    text after the first ``|`` when one is present, otherwise the whole
    link body.

    :param string: text to clean.
    :return: the cleaned text.
    """
    emphasis_removed = problem26.remove_emphasis(string)
    return _INTERNAL_LINK.sub(r"\3", emphasis_removed)


if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental-rmEmpha-rmLink.json'
    # NOTE(review): no encoding is specified for either file; confirm the
    # interpreter's default is suitable for the input data.
    # The input is opened in a ``with`` block so the handle is closed even
    # if fundamental_data raises.
    with open(inputfile) as f:
        res = problem25.fundamental_data(f, remove_internalLink)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)