def store_pages_info(pages_info: List[str], id: str, output_dir: str, start_page: int = 1): for page_num, page_info in enumerate(pages_info, start_page): store_json(page_info, f"{output_dir}/{id}_page{page_num}.json") print(f"[INFO] Storing page{page_num}")
"id": 9, "selectTerm": "all" }, page_count=1) store_pages_info(pages_info, id, OUTPUT_RAW_DIR) return pages_info def tag_current_legislator_in_db(names: List[str]) -> None: """Write current_legislator column. Notice: won't change others to False """ print(f"current legislators: {names}") query = Candidate.update(current_legislator=True).where( Candidate.name.in_(names)) query.execute() if __name__ == "__main__": history_legislator_info_pages = run_history_legislator_info_pages() current_legislator_info_pages = run_current_legislator_info_pages() legislators_info = get_legislators_info(history_legislator_info_pages, current_legislator_info_pages) store_json(legislators_info, f"{OUTPUT_TRANSFORMED_DIR}/legislator_info.json") current_legislator_names = get_current_legislator_names( current_legislator_info_pages) tag_current_legislator_in_db(current_legislator_names)
def store_pages_info(pages_info: List[str], id: str, output_dir: str): for page_num, page_info in enumerate(pages_info, 1): store_json(page_info, f'{output_dir}/{id}_page{page_num}.json')
def get_page_links(page_name):
    """Fetch the external links of *page_name* via the MediaWiki parse API.

    Returns the API's ``parse`` object on success; if the response has no
    ``parse`` key (e.g. the page is missing), logs a warning and returns a
    stub ``{'title': page_name}`` so downstream processing can continue.
    """
    section = get_external_link_section(page_name)
    payload = {
        'action': 'parse',
        'format': 'json',
        'page': page_name,
        'prop': 'externallinks',
        'utf8': '',
        'section': section
    }
    response_data = json.loads(_send_request(payload))
    try:
        return response_data['parse']
    except KeyError:
        # Fixed misspelled log tag "[WARRNING]" -> "[WARNING]".
        print(f'[WARNING] {page_name} does not have key "parse"')
        return {'title': page_name}


if __name__ == "__main__":
    page_names = get_page_list()
    # Fan the per-page API calls out over 4 worker processes.
    with Pool(processes=4) as pool:
        pages_links = pool.map(get_page_links, page_names)
    pages_links_string = json.dumps(pages_links, ensure_ascii=False)
    store_json(pages_links_string, OUTPUT_RAW)
    store_json(json.dumps(transform(pages_links), ensure_ascii=False),
               OUTPUT_TRANSFORMED)
from os import environ, path

from legislative_yuan_open_data import (scrap_legislator_info_pages,
                                        store_pages_info)
from transform import transform
from util import store_json

# Output directories are overridable via environment variables; the
# defaults resolve relative to this file's location.
FILE_DIR = path.dirname(path.abspath(__file__))
OUTPUT_RAW_DIR = environ.get('OUTPUT_RAW_DIR', f'{FILE_DIR}/../../data/raw')
OUTPUT_TRANSFORMED_DIR = environ.get('OUTPUT_TRANSFORMED_DIR',
                                     f'{FILE_DIR}/../../data/organized')
# Dataset identifier used as the raw page-file prefix.
ID = 'sitting_info'

if __name__ == "__main__":
    # Scrape 6 pages of sitting info (open-data dataset id 42, all terms).
    pages_info = scrap_legislator_info_pages(ID, payload_base={
        'id': 42,
        'selectTerm': 'all'
    }, page_count=6)
    # Keep the raw pages, then store the transformed result.
    store_pages_info(pages_info, ID, OUTPUT_RAW_DIR)
    sitting_info = transform(pages_info)
    store_json(sitting_info, f'{OUTPUT_TRANSFORMED_DIR}/sitting_info.json')