コード例 #1
0
def store_pages_info(pages_info: List[str],
                     id: str,
                     output_dir: str,
                     start_page: int = 1):
    """Store every page in its own JSON file and report progress.

    Pages are numbered consecutively, beginning at ``start_page``. Page N is
    handed to ``store_json`` together with the path
    ``{output_dir}/{id}_page{N}.json``; a progress line is printed once that
    call has returned.

    Args:
        pages_info: Page contents, in page order.
        id: Prefix used in every output file name.
        output_dir: Directory part of every output path.
        start_page: Number given to the first element of ``pages_info``.
    """
    for number, content in enumerate(pages_info, start=start_page):
        target_path = f"{output_dir}/{id}_page{number}.json"
        store_json(content, target_path)
        print(f"[INFO] Storing page{number}")
コード例 #2
0
                                                 "id": 9,
                                                 "selectTerm": "all"
                                             },
                                             page_count=1)
    store_pages_info(pages_info, id, OUTPUT_RAW_DIR)
    return pages_info


def tag_current_legislator_in_db(names: List[str]) -> None:
    """Set ``current_legislator`` to True for candidates listed in ``names``.

    Note: only rows whose ``name`` is in ``names`` are updated. Rows that are
    not listed are left as they are; nothing is reset to False here.
    """
    print(f"current legislators: {names}")
    name_is_listed = Candidate.name.in_(names)
    Candidate.update(current_legislator=True).where(name_is_listed).execute()


if __name__ == "__main__":
    # Collect both page sets; the history run happens before the current run.
    history_legislator_info_pages = run_history_legislator_info_pages()
    current_legislator_info_pages = run_current_legislator_info_pages()

    # Build the combined legislator info from both page sets and store it.
    legislators_info = get_legislators_info(
        history_legislator_info_pages, current_legislator_info_pages)
    legislator_info_path = f"{OUTPUT_TRANSFORMED_DIR}/legislator_info.json"
    store_json(legislators_info, legislator_info_path)

    # Names taken from the current pages are the ones flagged in the database.
    current_legislator_names = get_current_legislator_names(
        current_legislator_info_pages)
    tag_current_legislator_in_db(current_legislator_names)
コード例 #3
0
def store_pages_info(pages_info: List[str],
                     id: str,
                     output_dir: str,
                     start_page: int = 1):
    """Store every page in its own JSON file.

    Page N is handed to ``store_json`` together with the path
    ``{output_dir}/{id}_page{N}.json``.

    Args:
        pages_info: Page contents, in page order.
        id: Prefix used in every output file name.
        output_dir: Directory part of every output path.
        start_page: Number given to the first element of ``pages_info``.
            Defaults to 1, which keeps the numbering existing callers get.
    """
    for page_num, page_info in enumerate(pages_info, start_page):
        store_json(page_info, f'{output_dir}/{id}_page{page_num}.json')
コード例 #4
0

def get_page_links(page_name):
    """Request the external links of one page and return the parsed result.

    Sends an ``action=parse`` / ``prop=externallinks`` request limited to the
    section returned by ``get_external_link_section(page_name)`` and decodes
    the response as JSON.

    Args:
        page_name: Name of the page to query.

    Returns:
        The value stored under the ``parse`` key of the decoded response, or
        ``{'title': page_name}`` when the response has no ``parse`` key.
    """
    section = get_external_link_section(page_name)
    payload = {
        'action': 'parse',
        'format': 'json',
        'page': page_name,
        'prop': 'externallinks',
        'utf8': '',
        'section': section
    }
    response_data = json.loads(_send_request(payload))
    try:
        return response_data['parse']
    except KeyError:
        # Return a title-only placeholder instead of raising, so a caller
        # that maps this function over many pages still gets one entry per
        # page.
        print(f'[WARNING] {page_name} does not have key "parse"')
        return {'title': page_name}


if __name__ == "__main__":
    page_names = get_page_list()

    # Query the links of every page using four worker processes.
    with Pool(processes=4) as pool:
        pages_links = pool.map(get_page_links, page_names)

    # Serialise the raw results (non-ASCII characters are kept as-is) and
    # store them.
    pages_links_string = json.dumps(pages_links, ensure_ascii=False)
    store_json(pages_links_string, OUTPUT_RAW)

    # NOTE(review): leftover alternative, presumably for re-running only the
    # transform from the stored raw file without fetching again -- confirm
    # before re-enabling.
    # with open(OUTPUT_RAW) as fp:
    #     pages_links = json.load(fp)
    transformed_string = json.dumps(transform(pages_links), ensure_ascii=False)
    store_json(transformed_string, OUTPUT_TRANSFORMED)
コード例 #5
0
from os import environ, path

from legislative_yuan_open_data import (scrap_legislator_info_pages,
                                        store_pages_info)
from transform import transform
from util import store_json

# Absolute directory of this script; the default output paths below are built
# relative to it.
FILE_DIR = path.dirname(path.abspath(__file__))
# Output directories. Each one can be overridden by the environment variable
# of the same name; otherwise it falls back to a path under ../../data.
OUTPUT_RAW_DIR = environ.get('OUTPUT_RAW_DIR', f'{FILE_DIR}/../../data/raw')
OUTPUT_TRANSFORMED_DIR = environ.get('OUTPUT_TRANSFORMED_DIR',
                                     f'{FILE_DIR}/../../data/organized')
# Identifier passed to scrap_legislator_info_pages and store_pages_info below.
ID = 'sitting_info'

if __name__ == "__main__":
    # Request parameters for this dataset: payload id 42, all terms, 6 pages.
    request_payload = {'id': 42, 'selectTerm': 'all'}
    pages_info = scrap_legislator_info_pages(ID,
                                             payload_base=request_payload,
                                             page_count=6)

    # Store the pages as fetched before anything is derived from them.
    store_pages_info(pages_info, ID, OUTPUT_RAW_DIR)

    # Transform the pages and store the result next to the other organized
    # data.
    sitting_info = transform(pages_info)
    sitting_info_path = f'{OUTPUT_TRANSFORMED_DIR}/sitting_info.json'
    store_json(sitting_info, sitting_info_path)