Code Example #1
def get_output_filepath(partner, period, **kwargs):
    safe_mkdir("data")

    directory = f"data/{partner.name.lower()}"
    safe_mkdir(directory)

    output_filepath = f"{directory}/{period}.json"
    # Idempotency check: don't overwrite an existing file (if a run went wrong, the file has to be removed manually).
    if os.path.exists(output_filepath):
        return None
    return output_filepath
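
All of these snippets rely on a small safe_mkdir helper whose definition is not shown here. A minimal sketch, assuming it simply wraps os.makedirs and tolerates an already-existing directory:

import os

def safe_mkdir(path):
    # Create the directory (and any missing parents); do nothing if it already exists.
    os.makedirs(path, exist_ok=True)

A caller of get_output_filepath above would then treat a None return as "this partner/period was already downloaded" and skip the fetch.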
Code Example #2
def list_database_codes(seed_db_codes, codes_to_raw_names, recurse_level=3):
    """
    param seed_db_codes: starting list of db codes to scrape, e.g. ["MEI", "QNA"]
    param codes_to_raw_names: dict to collect code-to-name mappings into, e.g. "MEI" -> "Main Economic Indicators Publication"
    param recurse_level: how many crawl iterations (including the first one) should be run with newly discovered db codes.

    An alternative approach would be going through the list API, but getting the dataset codes from it is not straightforward:
    # page_start = 0
    # page_size = 200
    # url = f"https://data.oecd.org/search-api/?hf={page_size}&b={page_start}&r=%2Bf%2Ftype%2Fdatasets%2Fapi+access&r=%2Bf%2Flanguage%2Fen&l=en&sl=sl_dp&sc=enabled%3Atrue%2Cautomatically_correct%3Atrue&target=st_dp"
    """

    all_db_codes = set()
    new_db_codes = set(seed_db_codes)
    safe_mkdir("data/html")
    for i in range(1, recurse_level+1):
        this_iter_db_codes = set()
        LOGGER.info(f"Iter {i}: Iterating through {len(new_db_codes)} new db codes")
        for j, db_code in enumerate(new_db_codes):
            if (j + 1) % 100 == 0:
                LOGGER.info(f"Iter {i}: Parsed {j + 1}/{len(new_db_codes)} so far.")
            # Cache the fetch results to disk as it's rather slow to fetch again.
            db_code_filename = f"data/html/{db_code}.html"
            url = f"https://stats.oecd.org/Index.aspx?DatasetCode={db_code}"
            soup = url_to_soup(url, db_code_filename)
            if soup is None:
                continue

            codes_to_raw_names[db_code] = soup.title.text.strip()
            for link in soup.find_all("a"):
                url = link.get("href")
                if url is not None:
                    # Examples of URLs we are looking for:
                    # OECDStat_Metadata/ShowMetadata.ashx?DataSet=ITF_INDICATORS
                    # Index.aspx?DataSetCode=ITF_ROAD_ACCIDENTS
                    match = re.match(r'.*(DataSet[^"]*).*', url)
                    if match:
                        this_iter_db_codes.add(match.group(1).split("=")[1])

        new_db_codes = this_iter_db_codes.difference(all_db_codes)
        if len(new_db_codes) == 0:
            LOGGER.info(f"Iter {i}: No new db codes found, returning the {len(all_db_codes)} db codes found so far.")
            return list(all_db_codes)

        LOGGER.info(f"Iter {i}: Found {len(new_db_codes)} new db codes, first 10: {list(new_db_codes)[:10]}")
        all_db_codes.update(new_db_codes)

    LOGGER.info(f"Max recursion {recurse_level} reached, returning the {len(all_db_codes)} db codes found.")
    return list(all_db_codes)
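
The crawler above delegates fetching and on-disk caching to a url_to_soup(url, cache_filepath) helper that is not part of this snippet. Judging from the call site (it may return None, and the comment says results are cached under data/html), a plausible sketch could look like the following; the requests-based fetch and the exact error handling are assumptions:

import os
import requests
from bs4 import BeautifulSoup

def url_to_soup(url, cache_filepath):
    # Serve the cached HTML if we already fetched this page.
    if os.path.exists(cache_filepath):
        with open(cache_filepath, "r") as cached_file:
            return BeautifulSoup(cached_file.read(), "html.parser")

    response = requests.get(url)
    if response.status_code != 200:
        # The caller treats None as "skip this db code".
        return None

    with open(cache_filepath, "w") as output_file:
        output_file.write(response.text)
    return BeautifulSoup(response.text, "html.parser")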
Code Example #3
# ------------------------------------------------------------------------------
# If this script is run with the "--test" option, use small iteration counts for a quick test run
if len(sys.argv) >= 2 and "test" in sys.argv[1]:
    print("Test run")
    iteration_ns = [4, 4, 4, 4, 4, 4]
    case_n = 2
# ------------------------------------------------------------------------------

model = MNISTGenerator()
weights_path = '../pretrained_weights/MNIST/model.ckpt-155421'
model.load_model(weights_path)

slider_length = getSliderLength(n, range_per_axis, 0.2)
output_base_path = 'mnist_experiment_global/0/64_1024'
safe_mkdir(output_base_path)

for case_idx in range(case_n):
    print('------------- Start case #' + str(case_idx) + ' -------------')
    output_case_path = output_base_path + '/' + str(case_idx)
    safe_mkdir(output_case_path)

    target_latent_filepath = output_case_path + '/target_latent.txt'
    if os.path.isfile(target_latent_filepath):
        # Reuse the target latent vector saved by a previous run of this case.
        with open(target_latent_filepath, 'r') as f:
            _ = f.readline()  # skip the header line
            data = f.readline().split(' ')
        target_latent = np.array(list(map(float, data))).reshape(n)
    else:
        # No saved target yet: sample a fresh 64-dimensional latent vector.
        target_latent = np.random.uniform(-1, 1, 64)
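
The reader above expects target_latent.txt to contain one header line followed by the latent values space-separated on the second line. The writer is not shown in this snippet; a hypothetical counterpart matching that format:

def save_target_latent(filepath, target_latent):
    # Hypothetical writer: one header line, then the values space-separated.
    with open(filepath, 'w') as f:
        f.write('target_latent\n')
        f.write(' '.join(str(v) for v in target_latent) + '\n')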
Code Example #4
    def save_model(self, epoch):
        """
        Save the model to checkpoint_dir.
        """
        utils.safe_mkdir(self.checkpoint_dir)
        self.saver.save(self.sess, self.checkpoint_dir, epoch)
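
Code Example #3 calls model.load_model(weights_path) to restore such a checkpoint. That method is not shown; a guess at its counterpart, assuming the same tf.train.Saver instance is reused for restoring:

    def load_model(self, checkpoint_path):
        """
        Restore model weights from a checkpoint path such as 'model.ckpt-155421'.
        """
        self.saver.restore(self.sess, checkpoint_path)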
Code Example #5
File: generate_enums.py Project: Petrzlen/playground
            name = description_tag.text
            # Special-case names like 2015M06_100, which would otherwise run into:
            # Exception: For QnaReferenceperiodCodelist: tried to override 2010M12_100 with 1993M12_100 for M12_100
            if name[:4].isdigit():
                name = EXTRA_PREFIX + name

            value = code_tag.attrib["value"]

            # TODO(entity): Once we allow relationships between, use: code_tag.attrib.get("parentCode")
            model_to_name_to_values[model_name][name] = value

    return model_to_name_to_values


enum_dir = "enums"
safe_mkdir(enum_dir)
# NOTE: Some Databases exist, but their schema is NOT present. E.g. MATERIAL_RESOURCES
datasets = ["AEA", "AEI_OTHER", "AIR_GHG", "AV_AN_WAGE", "CITIES", "DUR_I", "EAG_NEAC", "EAG_TRANS", "GENDER_EMP", "GREEN_GROWTH", "FIN_IND_FBS", "HH_DASH", "IDD", "JOBQ", "LFS_SEXAGE_I_R", "MATERIAL_RESOURCES", "MEI", "MEI_CLI", "MIG", "MSTI_PUB", "NAAG", "PDB_GR", "PDB_LV", "PNNI_NEW", "PPPGDP", "REV", "RS_GBL", "QNA", "SHA", "SNA_TABLE1", "SNA_TABLE5", "SOCX_AGG", "STLABOUR", "ULC_QUA", "WEALTH"]
urls = [f"https://stats.oecd.org/restsdmx/sdmx.ashx/GetDataStructure/{dataset}" for dataset in datasets]
# TODO: Maybe use explanation sites like https://data.oecd.org/fdi/fdi-stocks.htm to generate docstrings
#  for values. More ideas can be found by searching https://data.oecd.org/searchresults/
# TODO: Might have useful docstrings: https://www.oecd.org/els/health-systems/List-of-variables-OECD-Health-Statistics-2018.pdf

gather_and_generate_enums(
    urls=urls,
    output_filepath=f"{enum_dir}/all.py",
    parse_response=parse_oecd_schema_response,
    ignore_status_codes=[HTTPStatus.BAD_REQUEST],
)
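
Neither gather_and_generate_enums nor the generate_enums helper (called in the commented-out bootstrap code of the next example) is shown here. Based on the visible call shapes, the writing step presumably turns a dict of model name to name-to-value mappings into Python Enum source; a hypothetical sketch:

def generate_enums(model_to_name_to_values, output_filepath):
    # Hypothetical writer: one Enum class per model, one member per (name, value) pair.
    with open(output_filepath, "w") as output_file:
        output_file.write("from enum import Enum\n")
        for model_name, name_to_values in sorted(model_to_name_to_values.items()):
            output_file.write(f"\n\nclass {model_name}(Enum):\n")
            for name, value in sorted(name_to_values.items()):
                output_file.write(f'    {name} = "{value}"\n')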

Code Example #6
        raise Exception(f"Non-200 status code: {response.status_code}: {response.text[:100]} url={url}")

    with open(filepath, "w") as output_file:
        if content_type == ContentType.JSON:
            # Round-trip through json load / dump to verify it's valid JSON.
            dataset = json.loads(response.content)
            LOGGER.info(f"{dataset['header']}")
            json.dump(dataset, output_file)
            # output_file.write(str(response.content))
        elif content_type == ContentType.CSV:
            output_file.write(response.text)
        else:
            raise Exception(f"Unexpected content type {content_type}")


safe_mkdir("data")

# ==== Initial run
# db_code_manual_list = ["MEI", "MEI_CLI", "SNA", "HEALTH_STATE", "CRSNEW", "NAAG", "SHA", "STLABOUR", "SOCX_AGG", "MSTI_PUB", "CITIES", "QNA", "PDB_GR", "IDD", "MIG", "PDB_LV", "LFS_SEXAGE_I_R", "REV", "PNNI_NEW", "PPPGDP", "GREEN_GROWTH", "AEI_OTHER", "WEALTH", "ULC_QUA", "RS_GBL", "EAG_NEAC", "AEA", "DUR_I", "EAG_TRANS", "AV_AN_WAGE", "GENDER_EMP", "JOBQ", "HH_DASH", "IDO", "AIR_GHG", "FIN_IND_FBS", "MATERIAL_R"]
# codes_to_raw_names = {}
# list_database_codes(db_code_manual_list, codes_to_raw_names, 3)
#
# # Transform into enum names
# name_to_values = {}
# for code, raw_name in sorted(codes_to_raw_names.items()):
#     name_to_values[enumizy_name(raw_name)] = code
# generate_enums({"DatabaseCode": name_to_values}, "enums/database_codes.py")

# ==== Download all the data omnomnomnom.
for year in range(2019, 2008, -1):
    LOGGER.info(f"Year: {year}")
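
The content-type dispatch in this example relies on a ContentType enum defined elsewhere in the project. A minimal sketch of what it presumably looks like (the member values are an assumption):

from enum import Enum, auto

class ContentType(Enum):
    JSON = auto()
    CSV = auto()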