Beispiel #1
0
def mk_information():
    """Parse the ``{{基礎情報 ...}}`` template into a dict of its fields.

    The text returned by ``picup_eng_txt()`` is searched for the span that
    starts at ``{{基礎情報`` and ends at the first ``}}`` followed by a
    newline.  The first line of that span (the template opener) is dropped
    and the remaining lines are read in order:

    * ``|name = value`` lines store ``value`` (up to the first ``{``) after
      passing it through ``get_rid_of_strong_markup``.
    * ``|name = {{template`` lines open a multi-line value: the opener line
      itself is not kept, and every following line up to the next
      ``|name = value`` line is passed through ``get_rid_of_strong_markup``
      and concatenated without a separator.

    Parsing stops at the first line that matches neither form, or when the
    lines run out.

    Returns:
        dict: field name -> cleaned value, in the order encountered.  Empty
        when the template is not present in the text.
    """
    found = re.search(r'{{基礎情報.+?}}\n', picup_eng_txt(), flags=re.DOTALL)
    if found is None:
        # No template in the text: nothing to parse.
        return {}

    # Skip the first line, which holds the template opener.
    lines = iter(found.group(0).split('\n')[1:])
    pattern = re.compile(r'\|(\w+) = ([^{]+)')
    pattern2 = re.compile(r'\|(\w+) = {{(\w+)')

    infomation_list = {}
    # next(..., None) instead of list.pop(0): running out of lines ends the
    # parse cleanly rather than raising IndexError.
    line = next(lines, None)
    m = pattern.match(line) if line is not None else None
    while m:
        infomation_list[m.group(1)] = get_rid_of_strong_markup(m.group(2))
        line = next(lines, None)
        if line is None:
            break
        m = pattern.match(line)
        if m:
            continue

        m2 = pattern2.match(line)
        if not m2:
            # Neither a plain field nor a template opener: end of the fields.
            break

        # Collect continuation lines until the next plain field line; that
        # line is left in ``m`` so the outer loop stores it next.
        parts = []
        m = None
        for line in lines:
            m = pattern.match(line)
            if m:
                break
            parts.append(get_rid_of_strong_markup(line))
        infomation_list[m2.group(1)] = ''.join(parts)
    return infomation_list
Beispiel #2
0
# -*- coding: utf-8 -*-
from q20 import picup_eng_txt
import re
from pprint import pprint

if __name__ == '__main__':
    # Parse the fields of the '{{基礎情報 国' template found in the text
    # returned by picup_eng_txt() into a dict and pretty-print it.
    txt = picup_eng_txt()
    start = txt.find('{{基礎情報 国')
    # str.find returns -1 when the marker is absent; slicing from -1 would
    # silently keep only the last character, so treat that as "no lines".
    # Otherwise drop the first line, which holds the template opener.
    body = txt[start:].split('\n')[1:] if start != -1 else []
    lines = iter(body)

    pattern = re.compile(r'\|(\w+) = ([^{]+)')    # |name = value
    pattern2 = re.compile(r'\|(\w+) = {{(\w+)')   # |name = {{template ...
    tag = re.compile(r'<[^>]*>')                  # anything shaped like <...>

    infomation_list = {}
    # next(..., None) instead of list.pop(0): running out of lines ends the
    # parse cleanly rather than raising IndexError.
    line = next(lines, None)
    m = pattern.match(line) if line is not None else None
    while m:
        infomation_list[m.group(1)] = tag.sub('', m.group(2))
        line = next(lines, None)
        if line is None:
            break
        m = pattern.match(line)
        if m:
            continue

        m2 = pattern2.match(line)
        if not m2:
            # Neither a plain field nor a template opener: end of the fields.
            break

        # Multi-line value: the opener line itself is not kept; following
        # lines are stripped, cleaned of <...> spans and concatenated until
        # the next plain field line, which the outer loop then stores.
        parts = []
        m = None
        for line in lines:
            m = pattern.match(line)
            if m:
                break
            parts.append(tag.sub('', line.strip()))
        infomation_list[m2.group(1)] = ''.join(parts)

    pprint(infomation_list)
Beispiel #3
0
# -*- coding: utf-8 -*-
from q20 import picup_eng_txt
import re

if __name__ == '__main__':
    # Print every (line index, line text) pair of the text returned by
    # picup_eng_txt() whose line contains a [[Category:...]] link.
    article_lines = picup_eng_txt().split('\n')
    category_link = r'\[\[Category:(\w+)\]\]'

    category_lines = []
    for numbered_line in enumerate(article_lines):
        # numbered_line is (index, text); only the text is searched.
        if re.search(category_link, numbered_line[1]) is not None:
            category_lines.append(numbered_line)

    print(category_lines)