def main():
    """Print every line of every wiki document that carries a
    ``[[Category:...]]`` markup (non-whitespace category name)."""
    category_re = re.compile(r'\[\[Category:\S+\]\]')
    for document in extract_wikidocs():
        for text_line in document['text'].split('\n'):
            if category_re.search(text_line):
                print(text_line)
def main():
    """Print every line containing a ``[[Category:...]]`` markup.

    Fix: the original used the Python-2-only ``ur''`` string prefix
    (a SyntaxError on Python 3) and the ``print`` statement with an
    explicit ``.encode('utf-8')``.  Python 3 ``str`` is already Unicode,
    so the line is printed directly, matching the Python 3 style used
    elsewhere in this file.
    """
    wiki_docs = extract_wikidocs()
    for doc in wiki_docs:
        lines = doc['text'].split('\n')
        for line in lines:
            # Non-greedy body so an empty/odd category still matches.
            if re.search(r'\[\[Category:.*?\]\]', line):
                print(line)
def main():
    """Print the target name of every media reference written as
    ``File:...`` or ``ファイル:...`` in the wiki documents."""
    media_ref = re.compile(r'(File|ファイル):([^\|]+)')
    for document in extract_wikidocs():
        # findall yields (prefix, name) tuples; only the name matters.
        for _prefix, name in media_ref.findall(document['text']):
            print(name)
def main():
    """Print category names found in ``[[Category:...]]`` markups,
    splitting multi-part entries on ``|`` and skipping any part that
    contains ``*`` or a space.

    Fix: the original used the Python-2-only ``ur''`` string prefix
    (a SyntaxError on Python 3) and the ``print`` statement with an
    explicit ``.encode('utf-8')``.  Python 3 ``str`` is already Unicode,
    so the category is printed directly.
    """
    wiki_docs = extract_wikidocs()
    for doc in wiki_docs:
        lines = doc['text'].split('\n')
        for line in lines:
            categories = re.findall(r'\[\[Category:(.+)\]\]', line)
            for match in categories:
                # An entry can hold several parts separated by '|'.
                for cat in match.split('|'):
                    # Drop sort keys / wildcards: anything with '*' or ' '.
                    if not re.search(r'[\* ]', cat):
                        print(cat)
def main():
    """Collect and print all ``[[Category:...]]`` names (non-whitespace
    form) across every wiki document.

    Fix: the regex was written as a plain (non-raw) string, so ``\\[``
    and ``\\]`` are invalid escape sequences — a SyntaxWarning on
    Python 3.12+ and slated to become an error.  The pattern is now a
    raw string; the matched text is unchanged.
    """
    docs = extract_wikidocs()
    categories = []
    for doc in docs:
        lines = doc['text'].split('\n')
        for line in lines:
            categories += re.findall(r'\[\[Category:(\S+)\]\]', line)
    for cat in categories:
        print(cat)
def parse_folder():
    """Scan every wiki document for 基礎情報 (infobox) templates and
    return a list of the dicts produced by ``parse_infobox``.

    ``re.M`` lets ``^}}`` anchor the closing braces at a line start;
    ``re.DOTALL`` lets ``.`` span newlines inside the template body.
    """
    infobox_re = re.compile('{{基礎情報.+?^}}', re.M | re.DOTALL)
    parsed = []
    for document in extract_wikidocs():
        parsed.extend(
            parse_infobox(block)
            for block in infobox_re.findall(document['text'])
        )
    return parsed
def get_infobox():
    """Return a list of dicts, one per 基礎情報 (infobox) template
    found in the wiki documents, parsed by ``parse_infobox``.

    Fix: the original pattern used the Python-2-only ``ur''`` prefix,
    which is a SyntaxError on Python 3.  A plain raw string ``r''`` is
    the Python 3 equivalent; the pattern text is unchanged.
    """
    docs = extract_wikidocs()
    objs_list = []
    # re.M so ^}} anchors at a line start; re.DOTALL so . crosses newlines.
    pattern = re.compile(r'{{基礎情報.+?^}}\n', re.M | re.DOTALL)
    for doc in docs:
        matches = pattern.findall(doc['text'])
        for m in matches:
            dict_obj = parse_infobox(m)
            objs_list.append(dict_obj)
    return objs_list
def main():
    """Print each ``== Section ==`` heading with its nesting level
    (number of '=' minus one) and the original markup.

    Fix: in the unbalanced-heading branch the original evaluated the
    bare name ``exit`` — a no-op expression that discards the builtin —
    and then fell through and printed the malformed heading anyway.
    It now reports the mismatch and ``continue``s, skipping the entry.
    """
    wiki_docs = extract_wikidocs()
    # Captures: leading '='s, heading text (no '='), trailing '='s.
    pattern = re.compile(r'(={2,}) ([^=]+) (={2,})')
    for doc in wiki_docs:
        tuples = pattern.findall(doc['text'])
        for tp in tuples:
            pfx = tp[0]
            sfx = tp[2]
            sec = tp[1]
            orig = pfx + ' ' + sec + ' ' + sfx
            if len(pfx) != len(sfx):
                # Malformed heading (unbalanced '='): report and skip it.
                print('%s %s %s' % (pfx, sec, sfx))
                continue
            level = len(pfx) - 1
            print('%-40s Level %s\t\t%s' % (sec, level, orig))