def parse_wiki():
    """Parse the 基礎情報 template of the イギリス article into a dict.

    The article text is read from ``k20.read_jwc_json()['イギリス']`` and
    scanned line by line. Collection starts after the first line containing
    ``基礎情報`` and stops at a line that is exactly ``}}``. Every line in
    between that matches ``|<field> = <value>`` contributes one entry;
    non-matching lines are skipped.

    Each value is cleaned before it is stored:

    * every ``'``, ``"``, ``[`` and ``]`` character is removed;
    * everything from the first ``<`` to the last ``>`` on the line is removed.

    Returns:
        dict: mapping of field name to cleaned value.
    """
    fields = {}
    in_template = False
    for line in k20.read_jwc_json()['イギリス'].split('\n'):
        if '基礎情報' in line:
            in_template = True
            continue
        if not in_template:
            continue

        r = parse('|{} = {}', line)
        if r is None:
            if line == '}}':
                # End of the template: nothing further to collect.
                break
            continue

        # Raw strings avoid the invalid-escape SyntaxWarning that '\[' and
        # '\]' trigger in ordinary string literals on recent Python versions.
        without_marks = re.sub(r"""['"\[\]]""", '', r[1])
        # NOTE(review): '<.*>' is greedy, so text sitting between two tags on
        # the same line is dropped together with the tags. Kept as-is because
        # it also discards <ref>...</ref> contents — confirm this is intended.
        fields[r[0]] = re.sub(r'<.*>', '', without_marks)
    return fields
def extract_template(page_source=None):
    """Extract the raw fields of the 基礎情報 template from wiki markup.

    Collection starts after the first line containing ``基礎情報`` and stops
    at a line that is exactly ``}}``. Every line in between that matches
    ``|<field> = <value>`` contributes one entry; other lines are skipped.
    Values are stored exactly as they appear in the markup.

    Args:
        page_source: Wiki markup to scan. When omitted (or ``None``), the
            text of ``k20.read_jwc_json()['イギリス']`` is loaded at call
            time. Loading lazily keeps the data file from being read as a
            side effect of merely defining/importing this function.

    Returns:
        dict: mapping of field name to raw value.
    """
    if page_source is None:
        page_source = k20.read_jwc_json()['イギリス']

    fields = {}
    in_template = False
    for line in page_source.split('\n'):
        if '基礎情報' in line:
            in_template = True
            continue
        if not in_template:
            continue

        r = parse('|{} = {}', line)
        if r is None:
            if line == '}}':
                # End of the template: nothing further to collect.
                break
            continue
        fields[r[0]] = r[1]
    return fields
import re
from pprint import pprint

import k20

# Media file references in the イギリス article.
# Pattern: a ':' delimiter, then one or more characters that are not
# forbidden in Windows file names (\ / : * ? " < > |), then one period and
# up to four ASCII letters, then a '|' delimiter.
# Each match still includes the leading ':' and the trailing '|'.
# The pattern is a raw string so the regex escapes are not treated as
# (invalid) string escapes by the Python compiler.
# NOTE: despite its name, ``sections`` holds file references, not headings.
sections = re.findall(
    r':[^\\/:*?"<>|]+\.[a-zA-Z]{,4}\|',
    k20.read_jwc_json()['イギリス'],
)
pprint(sections)
import re
from pprint import pprint

import k20

_CATEGORY_PREFIX = '[[Category:'
_CATEGORY_SUFFIX = ']]'

# Complete ``[[Category:...]]`` links found in the イギリス article.
# The quantifier is lazy so that two links on the same line are returned as
# two matches instead of being merged into one.
categories = re.findall(r'\[\[Category:.+?\]\]', k20.read_jwc_json()['イギリス'])

# Category names: the link text with the ``[[Category:`` prefix and the
# ``]]`` suffix sliced off. Slicing (rather than splitting on ':') keeps
# names that themselves contain a colon intact. A sort key such as ``|*``
# is left in place.
cateWords = [
    link[len(_CATEGORY_PREFIX):-len(_CATEGORY_SUFFIX)]
    for link in categories
]
pprint(cateWords)
import re
from pprint import pprint

from parse import parse

import k20

# Fields of the 基礎情報 template of the イギリス article, with MediaWiki
# emphasis markup removed from the values.
dic = {}
shouldParse = False
for line in k20.read_jwc_json()['イギリス'].split('\n'):
    if '基礎情報' in line:
        # Template header found: start collecting from the next line.
        shouldParse = True
        continue
    if not shouldParse:
        continue

    r = parse('|{} = {}', line)
    if r is None:
        if line == '}}':
            # End of the template.
            break
        continue

    # Emphasis markup is a run of 2 to 5 apostrophes ('' italic, ''' bold,
    # ''''' bold italic). Only those runs are removed, so a lone apostrophe
    # or a double quote (e.g. inside name="...") is left untouched.
    dic[r[0]] = re.sub(r"'{2,5}", '', r[1])

pprint(dic)
# Emphasis markup turned out to be present on only one line of the template.
import re
from pprint import pprint

import k20

# Headings: two or more '=', then at least one arbitrary character, then
# two or more '=' again.
sections = re.findall('={2,}.+={2,}', k20.read_jwc_json()['イギリス'])
pprint(sections)

# Map each heading's title (all '=' removed, surrounding whitespace
# stripped) to its level: half the number of '=' characters, minus one.
# A title that occurs more than once keeps the level of its last occurrence.
section_and_level = {
    section.replace('=', '').strip(): section.count('=') // 2 - 1
    for section in sections
}
pprint(section_and_level)