def get_output_filepath(partner, period, **kwargs):
    safe_mkdir("data")
    directory = f"data/{partner.name.lower()}"
    safe_mkdir(directory)
    output_filepath = f"{directory}/{period}.json"
    # Idempotency check: don't overwrite an existing file (if a file is broken,
    # it has to be removed manually before it will be re-fetched).
    if os.path.exists(output_filepath):
        return None
    return output_filepath
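# Illustrative usage sketch (a hypothetical caller, not part of the original
# module): the None return above is what makes downloads idempotent.
def download_period_sketch(partner, period, fetch_fn):
    """`fetch_fn(partner, period, filepath)` is a hypothetical download callback."""
    output_filepath = get_output_filepath(partner, period)
    if output_filepath is None:
        return  # Already downloaded earlier; skip to stay idempotent.
    fetch_fn(partner, period, output_filepath)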
def list_database_codes(seed_db_codes, codes_to_raw_names, recurse_level=3):
    """
    :param seed_db_codes: starting list of db codes to scrape, e.g. ["MEI", "QNA"]
    :param codes_to_raw_names: dict used to collect code-to-name mappings,
        e.g. "MEI": "Main Economic Indicators Publication"
    :param recurse_level: how many iterations (including the first) to run with
        newly discovered db codes.

    An alternative approach would be going through the list API, but getting the
    dataset codes out of it is not straightforward:
    # page_start = 0
    # page_size = 200
    # url = f"https://data.oecd.org/search-api/?hf={page_size}&b={page_start}&r=%2Bf%2Ftype%2Fdatasets%2Fapi+access&r=%2Bf%2Flanguage%2Fen&l=en&sl=sl_dp&sc=enabled%3Atrue%2Cautomatically_correct%3Atrue&target=st_dp"
    """
    # Seeds count as already-known codes, so they are returned and never re-queued.
    all_db_codes = set(seed_db_codes)
    new_db_codes = set(seed_db_codes)
    safe_mkdir("data/html")
    for i in range(1, recurse_level + 1):
        this_iter_db_codes = set()
        LOGGER.info(f"Iter {i}: Iterating through {len(new_db_codes)} new db codes")
        for j, db_code in enumerate(new_db_codes):
            if (j + 1) % 100 == 0:
                LOGGER.info(f"Iter {i}: Parsed {j + 1}/{len(new_db_codes)} so far.")
            # Cache the fetched pages to disk, as re-fetching is rather slow.
            db_code_filename = f"data/html/{db_code}.html"
            url = f"https://stats.oecd.org/Index.aspx?DatasetCode={db_code}"
            soup = url_to_soup(url, db_code_filename)
            if soup is None:
                continue
            codes_to_raw_names[db_code] = soup.title.text.strip()
            for link in soup.find_all("a"):
                url = link.get("href")
                if url is not None:
                    # Examples of URLs we are looking for:
                    # OECDStat_Metadata/ShowMetadata.ashx?DataSet=ITF_INDICATORS
                    # Index.aspx?DataSetCode=ITF_ROAD_ACCIDENTS
                    match = re.match(r'.*(DataSet[^"]*).*', url)
                    if match:
                        this_iter_db_codes.add(match.group(1).split("=")[1])
        new_db_codes = this_iter_db_codes.difference(all_db_codes)
        if len(new_db_codes) == 0:
            LOGGER.info(f"Iter {i}: No new db codes found, returning the {len(all_db_codes)} db codes found.")
            return list(all_db_codes)
        LOGGER.info(f"Iter {i}: Found {len(new_db_codes)} new db codes, first 10: {list(new_db_codes)[:10]}")
        all_db_codes.update(new_db_codes)
    LOGGER.info(f"Max iteration count {recurse_level} reached, returning the {len(all_db_codes)} db codes found.")
    return list(all_db_codes)
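# A minimal sketch of the `url_to_soup` helper assumed above: fetch a page,
# cache the raw HTML to disk, and parse it with BeautifulSoup. This is an
# illustrative reconstruction under those assumptions, not necessarily the
# project's actual helper (hence the `_sketch` suffix).
import requests
from bs4 import BeautifulSoup

def url_to_soup_sketch(url, cache_filepath):
    # Serve from the on-disk cache when we already fetched this page.
    if os.path.exists(cache_filepath):
        with open(cache_filepath) as cache_file:
            return BeautifulSoup(cache_file.read(), "html.parser")
    response = requests.get(url)
    if response.status_code != 200:
        return None  # Matches the `soup is None: continue` handling above.
    with open(cache_filepath, "w") as cache_file:
        cache_file.write(response.text)
    return BeautifulSoup(response.text, "html.parser")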
# ------------------------------------------------------------------------------
# If this script is run with the "--test" option, shrink the experiment sizes.
if len(sys.argv) >= 2 and "test" in sys.argv[1]:
    print("Test run")
    iteration_ns = [4, 4, 4, 4, 4, 4]
    case_n = 2
# ------------------------------------------------------------------------------

model = MNISTGenerator()
weights_path = '../pretrained_weights/MNIST/model.ckpt-155421'
model.load_model(weights_path)
slider_length = getSliderLength(n, range_per_axis, 0.2)

output_base_path = 'mnist_experiment_global/0/64_1024'
safe_mkdir(output_base_path)
for case_idx in range(case_n):
    print('------------- Start case #' + str(case_idx) + ' -------------')
    output_case_path = output_base_path + '/' + str(case_idx)
    safe_mkdir(output_case_path)
    target_latent_filepath = output_case_path + '/target_latent.txt'
    if os.path.isfile(target_latent_filepath):
        # Reuse the stored target latent vector so reruns are reproducible.
        with open(target_latent_filepath, 'r') as f:
            _ = f.readline()  # Skip the header line.
            data = f.readline().split(' ')
            target_latent = np.array(list(map(float, data))).reshape(n)
    else:
        target_latent = np.random.uniform(-1, 1, 64)
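# A hedged sketch of the matching write path for target_latent.txt, inferred
# from the read path above (one header line, then space-separated floats on the
# second line). The header text itself is an assumption; the reader skips it.
def save_target_latent_sketch(filepath, target_latent):
    with open(filepath, 'w') as f:
        f.write('target_latent\n')  # Assumed header; only its presence matters.
        f.write(' '.join(map(str, target_latent)) + '\n')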
def save_model(self, epoch):
    """Save the model to checkpoint_dir, tagged with the epoch number."""
    utils.safe_mkdir(self.checkpoint_dir)
    self.saver.save(self.sess, self.checkpoint_dir, epoch)
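# A hedged counterpart sketch for restoring, assuming `self.saver` is a
# TF1-style tf.train.Saver (implied by the save call above); the project's
# real load_model may look different.
def load_model_sketch(self, checkpoint_path):
    """Restore model weights from an explicit checkpoint path prefix."""
    self.saver.restore(self.sess, checkpoint_path)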
        name = description_tag.text
        # Special-case names like 2015M06_100; otherwise we would run into:
        # Exception: For QnaReferenceperiodCodelist: tried to override 2010M12_100 with 1993M12_100 for M12_100
        if name[:4].isdigit():
            name = EXTRA_PREFIX + name
        value = code_tag.attrib["value"]
        # TODO(entity): Once we allow relationships between codes, use: code_tag.attrib.get("parentCode")
        model_to_name_to_values[model_name][name] = value
    return model_to_name_to_values


enum_dir = "enums"
safe_mkdir(enum_dir)
# NOTE: Some databases exist, but their schema is NOT present, e.g. MATERIAL_RESOURCES.
datasets = [
    "AEA", "AEI_OTHER", "AIR_GHG", "AV_AN_WAGE", "CITIES", "DUR_I", "EAG_NEAC",
    "EAG_TRANS", "GENDER_EMP", "GREEN_GROWTH", "FIN_IND_FBS", "HH_DASH", "IDD",
    "JOBQ", "LFS_SEXAGE_I_R", "MATERIAL_RESOURCES", "MEI", "MEI_CLI", "MIG",
    "MSTI_PUB", "NAAG", "PDB_GR", "PDB_LV", "PNNI_NEW", "PPPGDP", "REV",
    "RS_GBL", "QNA", "SHA", "SNA_TABLE1", "SNA_TABLE5", "SOCX_AGG", "STLABOUR",
    "ULC_QUA", "WEALTH",
]
urls = [f"https://stats.oecd.org/restsdmx/sdmx.ashx/GetDataStructure/{dataset}" for dataset in datasets]
# TODO: Maybe use explanation pages like https://data.oecd.org/fdi/fdi-stocks.htm to generate doc-strings
#   for values. More ideas can be found by searching https://data.oecd.org/searchresults/
# TODO: Might have useful doc-strings: https://www.oecd.org/els/health-systems/List-of-variables-OECD-Health-Statistics-2018.pdf
gather_and_generate_enums(
    urls=urls,
    output_filepath=f"{enum_dir}/all.py",
    parse_response=parse_oecd_schema_response,
    ignore_status_codes=[HTTPStatus.BAD_REQUEST],
)
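# For a sense of the generated output's shape: enums/all.py is expected to hold
# enum classes roughly like the sketch below. Class and member names here are
# illustrative (MEI's raw name comes from the list_database_codes docstring),
# not actual generated output.
import enum

class DatabaseCodeSketch(enum.Enum):
    MAIN_ECONOMIC_INDICATORS_PUBLICATION = "MEI"
    QUARTERLY_NATIONAL_ACCOUNTS = "QNA"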
        raise Exception(f"Non-200 status code: {response.status_code}: {response.text[:100]} url={url}")
    with open(filepath, "w") as output_file:
        if content_type == ContentType.JSON:
            # Round-trip through json load / dump to verify it is valid JSON.
            dataset = json.loads(response.content)
            LOGGER.info(f"{dataset['header']}")
            json.dump(dataset, output_file)
            # output_file.write(str(response.content))
        elif content_type == ContentType.CSV:
            output_file.write(response.text)
        else:
            raise Exception(f"Unexpected content type {content_type}")


safe_mkdir("data")

# ==== Initial run
# db_code_manual_list = ["MEI", "MEI_CLI", "SNA", "HEALTH_STATE", "CRSNEW", "NAAG", "SHA", "STLABOUR",
#                        "SOCX_AGG", "MSTI_PUB", "CITIES", "QNA", "PDB_GR", "IDD", "MIG", "PDB_LV",
#                        "LFS_SEXAGE_I_R", "REV", "PNNI_NEW", "PPPGDP", "GREEN_GROWTH", "AEI_OTHER",
#                        "WEALTH", "ULC_QUA", "RS_GBL", "EAG_NEAC", "AEA", "DUR_I", "EAG_TRANS",
#                        "AV_AN_WAGE", "GENDER_EMP", "JOBQ", "HH_DASH", "IDO", "AIR_GHG", "FIN_IND_FBS",
#                        "MATERIAL_R"]
# codes_to_raw_names = {}
# list_database_codes(db_code_manual_list, codes_to_raw_names, 3)
#
# # Transform into enum names
# name_to_values = {}
# for code, raw_name in sorted(codes_to_raw_names.items()):
#     name_to_values[enumizy_name(raw_name)] = code
# generate_enums({"DatabaseCode": name_to_values}, "enums/database_codes.py")

# ==== Download all the data omnomnomnom.
for year in range(2019, 2008, -1):
    LOGGER.info(f"Year: {year}")
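    # A hedged sketch of what the rest of this loop body might do, combining the
    # year with per-dataset idempotent output paths; `db_codes` and
    # `fetch_dataset` are hypothetical stand-ins, not the project's actual names.
    # for db_code in db_codes:
    #     output_filepath = f"data/{db_code}_{year}.json"
    #     if not os.path.exists(output_filepath):
    #         fetch_dataset(db_code, year, output_filepath)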