Example #1
0
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import json
import problem25

def remove_emphasis(string):
    emphasis = re.compile(r"''('*)(.+)''\1")
    return emphasis.sub(r"\2", string)

if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental-rmEmpha.json'
    f = open(inputfile)
    res = problem25.fundamental_data(f, remove_emphasis)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)
Example #2
0
__author__ = 'todoroki'

import re
import json
import problem25
import problem26
import problem27

def remove_markup(string):
    markups = [
        re.compile(r"\[https?://[a-zA-Z0-9\./]+\s(.+)?\]"),
        re.compile(r"#REDIRECT\s?(.+)"),
        re.compile(r"<!--\s?(.+)\s?-->"),
        re.compile(r"\{\{.*[Ll]ang\|[a-zA-Z\-]+\|(.+)\}\}"),
        re.compile(r"(.*)<ref.+(</ref>)?>"),
        re.compile(r"(.*?)<br\s?/?>"),
        re.compile(r"<[a-z]+.*>(.*?)</[a-z]+>")
    ]
    removed_string = problem27.remove_internalLink(string)
    for m in markups:
        removed_string = m.sub(r"\1", removed_string)
    return removed_string

if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental-rmMUs.json'
    f = open(inputfile)
    res = problem25.fundamental_data(f, remove_markup)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)
Example #3
0
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import json
import problem25
import problem26

def remove_internalLink(string):
    internallink = re.compile(r"\[\[((.+?)\|)?(.+?)\]\]")
    emphasis_removed = problem26.remove_emphasis(string)
    return internallink.sub(r"\3", emphasis_removed)

if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental-rmEmpha-rmLink.json'
    f = open(inputfile)
    res = problem25.fundamental_data(f, remove_internalLink)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)