def get_kiso(filter_regex, wiki_text=None):
    """Parse the 基礎情報 (basic information) template into a dict.

    Args:
        filter_regex: Optional regular expression (string or compiled
            pattern).  Every match is deleted from the template body before
            the body is split into fields.  ``None`` leaves the body as is.
        wiki_text: Article text to parse.  When ``None`` (the default) the
            text is obtained by calling ``get_text()``.

    Returns:
        A dict mapping each field name to its value.  A segment that has no
        " = " separator (for example the text following ``{{基礎情報`` on
        the opening line) is stored under the key ``"self"``; if there are
        several such segments the last one wins.  An empty dict is returned
        when the text contains no 基礎情報 template.
    """
    if wiki_text is None:
        wiki_text = get_text()

    # DOTALL lets "." cross newlines so the capture covers the whole
    # multi-line template body.
    pattern = r'{{基礎情報(.+)\n}}\n'
    found = re.search(pattern, wiki_text, flags=(re.MULTILINE | re.DOTALL))
    if found is None:
        return {}

    body = found.group(1)
    if filter_regex is not None:
        body = re.sub(filter_regex, "", body)

    result_dict = {}
    # Fields are separated by a newline followed by "|".
    for field in body.split('\n|'):
        # Split on the FIRST " = " only, so a value that itself contains
        # " = " is kept whole instead of being truncated.
        key, separator, value = field.partition(" = ")
        if separator:
            result_dict[key] = value
        else:
            result_dict["self"] = field
    return result_dict
import re
from knock20 import get_text

# Each template field has the form "|key = value".
#   ^\|(.+?)\s=\s       a line starting with "|"; lazily capture the key up
#                       to the whitespace-delimited "="
#   (.+?)(?=\n(\||\}))  lazily capture the value up to (not including) a
#                       newline that is followed by "|" or "}"
# re.MULTILINE makes "^" match at the start of every line.
# re.DOTALL makes "." match newlines as well, so a value may span lines.
pattern = re.compile(r'^\|(.+?)\s=\s(.+?)(?=\n(\||\}))', re.MULTILINE | re.DOTALL)

s = get_text()

# Map field name -> field value; a repeated key keeps its last value.
basic_info = {found.group(1): found.group(2) for found in pattern.finditer(s)}

for key, value in basic_info.items():
    print("{}: {}".format(key, value))
import re
from knock20 import get_text

# Pattern for a category line:
#   1. the line starts with "[[Category:"
#   2. (.+?) lazily captures the category name
#   3. (\|.+)? optionally matches a "|..." suffix
#   4. the line ends with "]]"
pattern = re.compile(r'^\[\[Category:(.+?)(\|.+)?\]\]$')

for line in get_text().split("\n"):
    # The pattern is anchored with "^", so matching from the start of the
    # line is all that is needed.
    found = pattern.match(line)
    if found is None:
        continue
    # Group 1 is the category name (item 2 in the description above).
    print(found.group(1))
import re
from knock20 import get_text

wiki_text = get_text()

# Collect section titles per heading level (2 to 5 "=" signs on each side).
# The heading is anchored with "^" and "$" under re.MULTILINE rather than
# with literal "\n" on both sides: consuming the trailing newline would hide
# a heading that immediately follows another heading of the same level, and
# literal newlines would also miss a heading on the very first or last line.
# "[^=]+" requires at least one non-"=" character after the opening signs,
# so a level-n pattern does not match a deeper heading.
level_dict = {
    level: re.findall(
        r'^={%d}([^=]+)={%d}$' % (level, level),
        wiki_text,
        flags=re.MULTILINE,
    )
    for level in range(2, 6)
}
print(level_dict)