def maru(text=None):
    """Parse the ``基礎情報 国`` infobox template into a dictionary.

    Each ``|key = value`` field of the template becomes one dictionary
    entry.  Values are cleaned of emphasis quotes, of link/template
    punctuation and of bracketed external links.

    Args:
        text: Wiki markup to parse.  When ``None`` (the default) the text is
            obtained from ``read20.file_reading()``, as before.

    Returns:
        dict mapping field name to cleaned value.  An empty dict is returned
        when the text contains no ``基礎情報 国`` template.
    """
    if text is None:
        text = read20.file_reading()

    # Non-greedy capture: stop at the FIRST line consisting of "}}".  A greedy
    # ".*" with DOTALL would run on to the last such line in the article and
    # swallow unrelated templates.
    basic_information = re.search(
        r'^\{\{基礎情報\s国\n(.*?)\n\}\}$', text, re.MULTILINE | re.DOTALL)
    if basic_information is None:
        return {}

    field_pattern = re.compile(r'^\|(.+?)\s*=\s*(.*)', re.DOTALL)
    emphasis_pattern = re.compile(r"'{2,5}")  # '' italic, ''' bold, ''''' both
    external_link_pattern = re.compile(r'\[https?:[^]]*\]')

    template_dictionary = {}
    # A field starts at a newline followed by "|", so a value may span
    # several lines.
    for line in re.split(r'\n(?=\|)', basic_information.group(1)):
        data = field_pattern.match(line)
        if data is None:
            continue

        key = data.group(1).replace('|', '').strip()
        value = emphasis_pattern.sub('', data.group(2))

        # NOTE(review): when the value holds an internal link, every bracket,
        # "#" and "|" in the value is dropped, so "[[A|B]]" becomes "AB"
        # rather than "B".  Kept as-is from the original implementation.
        if re.search(r'\[\[.+\]\]', value):
            value = re.sub(r'[\[\]#|]', '', value)
        # Same approach for templates: only the braces are removed.
        if re.search(r'\{\{.+\}\}', value):
            value = re.sub(r'[{}]', '', value)

        # Bracketed external links are removed together with their label.
        value = external_link_pattern.sub('', value)
        template_dictionary[key] = value

    return template_dictionary
# Extract the category names of the article (by name, not line by line).
import re

import read20

# A line that begins with a category link, e.g. "[[Category:Name|sortkey]]".
_CATEGORY_LINE = re.compile(r'\[\[Category:.+\]\]')
# Everything after "Category:" up to the "|" sort-key separator or the
# closing "]]".  "\w+" is not enough here: it stops at spaces and at
# punctuation such as the middle dot, truncating names that contain them.
_CATEGORY_NAME = re.compile(r'(?<=Category:)[^|\]]+')


def category_name(line):
    """Return the category name contained in a category-link line.

    Args:
        line: A line of wiki markup containing ``Category:<name>``.

    Returns:
        The text following ``Category:`` up to (not including) the first
        ``|`` or ``]``.

    Raises:
        AttributeError: If the line contains no ``Category:<name>``.
    """
    name = _CATEGORY_NAME.search(line)
    return name.group()


text = read20.file_reading()
text = text.split('\n')  # one entry per line

for line in text:
    if _CATEGORY_LINE.match(line):
        print(category_name(line))