Exemple #1
0
def parse_wiki(page_source=None):
    """Extract the fields of the '基礎情報' template block into a dict.

    Scanning is switched on by the first line that contains '基礎情報' and
    stops at the first subsequent line that is exactly '}}'.  In between,
    every line matching the ``parse`` format ``'|{} = {}'`` contributes one
    entry; lines that do not match are skipped.

    Each value is cleaned in two steps: the characters ``'``, ``"``, ``[``
    and ``]`` are removed, then everything from the first ``<`` to the last
    ``>`` on the line (greedy match) is removed.

    Args:
        page_source: Markup text to scan.  When None (the default) the
            'イギリス' entry of ``k20.read_jwc_json()`` is used, which is what
            the function always did before this parameter existed.

    Returns:
        dict mapping each field name to its cleaned value.
    """
    if page_source is None:
        page_source = k20.read_jwc_json()['イギリス']

    dic = {}
    shouldParse = False
    for line in page_source.split('\n'):
        if '基礎情報' in line:
            shouldParse = True
            continue
        if not shouldParse:
            continue

        r = parse('|{} = {}', line)
        if r is None:
            # A bare '}}' closes the template; any other non-matching line
            # is simply ignored.
            if line == '}}':
                break
            continue

        # Raw strings keep the regex escapes intact without relying on
        # Python passing unknown string escapes (e.g. '\[') through.
        without_marks = re.sub(r"['\"\[\]]", '', r[1])
        dic[r[0]] = re.sub(r'<.*>', '', without_marks)
    return dic
Exemple #2
0
def extract_template(page_source=None):
    """Extract the raw fields of the '基礎情報' template block into a dict.

    Scanning is switched on by the first line that contains '基礎情報' and
    stops at the first subsequent line that is exactly '}}'.  In between,
    every line matching the ``parse`` format ``'|{} = {}'`` contributes one
    entry; lines that do not match are skipped.  Values are stored exactly
    as captured, with no markup removed.

    Args:
        page_source: Markup text to scan.  When None (the default) the
            'イギリス' entry of ``k20.read_jwc_json()`` is loaded at call
            time.  (Previously the default was evaluated once when the
            function was defined, forcing the load on import even for
            callers that supply their own text.)

    Returns:
        dict mapping each field name to its captured value.
    """
    if page_source is None:
        page_source = k20.read_jwc_json()['イギリス']

    dic = {}
    should_parse = False
    for line in page_source.split('\n'):
        if '基礎情報' in line:
            should_parse = True
            continue
        if not should_parse:
            continue

        r = parse('|{} = {}', line)
        if r is None:
            # A bare '}}' closes the template; any other non-matching line
            # is simply ignored.
            if line == '}}':
                break
            continue
        dic[r[0]] = r[1]
    return dic
Exemple #3
0
import k20
import re
from pprint import pprint

# Collect every substring of the form: a ':' followed by one or more
# characters that Windows does not forbid in file names (i.e. none of
# \ / : * ? " < > |), then one period, up to four ASCII letters, and a '|'.
# NOTE: '{,4}' also accepts zero letters after the period.
# The pattern is a raw string so the regex escapes reach `re` unchanged
# instead of relying on Python passing unknown string escapes through.
sections = re.findall(r':[^\\/:\*\?"\<\>\|]+\.[a-zA-Z]{,4}\|',
                      k20.read_jwc_json()['イギリス'])

pprint(sections)
Exemple #4
0
import k20
import re
from pprint import pprint

# Find every '[[Category:...]]' link.  The match is non-greedy so that two
# links on the same line are returned as two separate matches rather than
# one span running from the first '[[' to the last ']]'.
categories = re.findall(r'\[\[Category:.+?\]\]',
                        k20.read_jwc_json()['イギリス'])

# Strip the fixed '[[Category:' prefix and ']]' suffix.  Slicing (rather
# than splitting on ':') keeps names that themselves contain a colon whole.
cateWords = [word[len('[[Category:'):-len(']]')] for word in categories]

pprint(cateWords)
Exemple #5
0
import k20
from pprint import pprint
from parse import parse
import re

# Build a dict from the '基礎情報' template block: scanning starts after the
# first line containing '基礎情報' and stops at the first line that is
# exactly '}}'.  Lines matching '|{} = {}' become key/value entries.
dic = {}
shouldParse = False
for line in k20.read_jwc_json()['イギリス'].split('\n'):
    if '基礎情報' in line:
        shouldParse = True
        continue
    if not shouldParse:
        continue

    r = parse('|{} = {}', line)
    if r is None:
        # A bare '}}' closes the template; other non-matching lines are
        # skipped.
        if line == '}}':
            break
        continue

    # Drop single and double quote characters from the value.
    dic[r[0]] = re.sub(r"['\"]", '', r[1])

pprint(dic)

# Emphasis markup turned out to be present on only one line.
Exemple #6
0
import k20
import re
from pprint import pprint

# Collect every substring of the form: two or more '=', one or more
# arbitrary characters, two or more '='.
sections = re.findall('={2,}.+={2,}', k20.read_jwc_json()['イギリス'])

pprint(sections)

# Map each section title (the match with every '=' removed and surrounding
# whitespace stripped) to its level: half the number of '=' characters in
# the match, minus one.
section_and_level = dict(
    (section.replace('=', '').strip(), section.count('=') // 2 - 1)
    for section in sections
)

pprint(section_and_level)
# Alternative, column-aligned output:
# for sec, level in section_and_level.items():
#     print('N : {0:<10}, L : {1}'.format(sec, level))