Example no. 1
def _prepare_code2vec(args, params):

    root_path = extract_root_path(args["raw_dataset"])
    path_contexts = {}

    datasets = {}

    def compute_fold_path(label):
        return "{}_{}.json".format(root_path, label)

    for fold in ["dev", "test", "train"]:
        fold_path = compute_fold_path(fold)
        datasets[fold] = load_dataset(fold_path)

        destination = Path.cwd() / "data" / "code" / fold

        dataset = SourceDataset(args, datasets[fold])
        result = dataset.preprocess_ast(path=destination, force_rewrite=False)
        path_contexts[fold] = result

    train_data = path_contexts["train"]
    for fold in ["dev", "test"]:
        fold_data = match_ast_data(path_contexts[fold], train_data)
        dataset = save_code2vec_index(datasets[fold], fold_data, fold)
        dump_dataset(compute_fold_path(fold), dataset)

    dataset = save_code2vec_index(datasets["train"], train_data, "train")
    dump_dataset(compute_fold_path("train"), dataset)
Example no. 2
def main():
    git_repos = {"repo_full_name": []}

    for query_url in QUERY_URL_TEMPLATES:
        # Paginated form of the query URL
        query_url_page = query_url + "&page="
        resp = requests.get(query_url, auth=('raresraf', TOKEN)).json()

        NO_REQUESTS = min(int(resp["total_count"] / PER_PAGE),
                          int(GITHUB_MAX_RESULTS / PER_PAGE))
        for i in range(1, NO_REQUESTS + 1 + 1):
            # Iterate over all result pages; the extra iteration covers a partial last page
            print("[ITERATION] ", i, "/", NO_REQUESTS + 1)
            query_url_page_formatted = query_url_page + str(i)
            resp = requests.get(query_url_page_formatted,
                                auth=('raresraf', TOKEN)).json()
            retry = MAX_RETRY
            while retry > 0 and not resp.get("items", None):
                # Retry up to MAX_RETRY times when the response carries no items
                print("... [retry]", MAX_RETRY - retry + 1, "/", MAX_RETRY)
                resp = requests.get(query_url_page + str(i),
                                    auth=('raresraf', TOKEN)).json()
                sleep(10 * SLEEP_TIME_BETWEEN_API_CALLS_SEC)
                retry = retry - 1
            if not resp.get("items", None):
                # Skip this page if all retries failed
                continue
            for r in resp["items"]:
                git_repos["repo_full_name"].append(r["full_name"])

            sleep(SLEEP_TIME_BETWEEN_API_CALLS_SEC)

    # Write output to REPOSITORIES_OUTPUT_PATH
    dump_dataset(REPOSITORIES_OUTPUT_PATH, git_repos)
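
The snippet above relies on a handful of module-level names that are not shown. A minimal sketch of how they might be defined follows; the imports are standard, but the token placeholder, query template, timing constants, and output path are illustrative assumptions rather than the original configuration.

import requests
from time import sleep

# Hypothetical values; the originals are not part of the snippet.
TOKEN = "<github-personal-access-token>"
PER_PAGE = 100                      # largest page size the GitHub Search API allows
GITHUB_MAX_RESULTS = 1000           # the Search API returns at most 1000 results per query
MAX_RETRY = 5
SLEEP_TIME_BETWEEN_API_CALLS_SEC = 2
REPOSITORIES_OUTPUT_PATH = "./data/repositories.json"
QUERY_URL_TEMPLATES = [
    "https://api.github.com/search/repositories?q=language:python&sort=stars&per_page=100",
]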
Example no. 3
    def closed(self, spider):

        print("\n[donations_spider] closed() ")
        print("Unidentified Pages:\n")
        print(self.unidentified)

        dump_dataset("./data/unidentified_pages.json", self.unidentified)
        dump_dataset("./data/donations.json", self.all_donations)

        # for debug, print the names of all columns
        print_defaultdict(self.all_column_types)
Example no. 4
def _prepare_safe(args, params):

    params = args["features"]["types"]["safe"]
    safe = SAFE(params["model"], params["instr_conv"], params["max_instr"])
    root_path = extract_root_path(args["raw_dataset"])

    for ds_type in ["test", "dev", "train"]:
        path = "{}_{}.json".format(root_path, ds_type)
        data = load_dataset(path)
        data = compute_safe_embeddings(args, safe, data)
        dump_dataset(path, data)
Example no. 5
def main():
    # Read input from REPOSITORIES_OUTPUT_PATH
    data = load_dataset(REPOSITORIES_OUTPUT_PATH)

    # Remove duplicates from data["repo_full_name"]
    # This preserves list order.
    seen = set()
    result = []
    for item in data["repo_full_name"]:
        if item not in seen:
            seen.add(item)
            result.append(item)
    data["repo_full_name"] = result

    # Write the deduplicated output back to REPOSITORIES_OUTPUT_PATH
    dump_dataset(REPOSITORIES_OUTPUT_PATH, data)
Example no. 6
def split_dataset(args):

    root_path = extract_root_path(args["raw_dataset"])
    new_path = "{}_prepared.json".format(root_path)
    dataset = load_dataset(new_path)

    # Custom dataset preprocessing module
    handler = identify_data_handler(args)
    dataset = handler(args, dataset)

    # Split train/dev/test
    data_split = dataset.split_data(verbose=True)

    for key in data_split:
        split_path = "{}_{}.json".format(root_path, key)
        dump_dataset(split_path, data_split[key])
Example no. 7
def prepare_input(args):

    pipeline = SourcePipeline(args).init_from_model_config()
    res_path = get_model_path(args)

    root_path = extract_root_path(args["raw_dataset"])
    for ds_type in ["test", "dev", "train"]:
        path = "{}_{}.json".format(root_path, ds_type)
        if not os.path.exists(path):
            logging.critical("Expected {} file in path {}.".format(
                ds_type, path))
            exit(1)

        data = load_dataset(path)
        X, Y = pipeline.run(data)

        if isinstance(X, tuple):
            X, X_meta = X
            meta_path = res_path / "X_{}_meta.json".format(ds_type)
            dump_dataset(meta_path, X_meta)

        if len(X[0]) != len(Y):
            logging.critical("X num samples - {} \n".format(len(X[0])))
            logging.critical("Y num samples - {} \n".format(len(Y)))
            logging.critical("Total num samples - {}\n".format(len(data)))
            raise Exception(
                "Mismatch between number of input and output samples")

        input_path = res_path / "X_{}.json".format(ds_type)
        dump_dataset(input_path, X)

        output_path = res_path / "Y_{}.json".format(ds_type)
        dump_dataset(output_path, Y)
Example no. 8
    def closed(self, spider):
        dump_dataset("./data/mo_list.json", self.targets)
Example no. 9
    def serialize(self, ds_path=None):
        dump_dataset(ds_path, self.data)
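
All of the examples above read and write through the same pair of helpers, load_dataset and dump_dataset, whose implementations are not shown. A minimal sketch, assuming they are thin wrappers around the standard json module, could look like this.

import json

def load_dataset(path):
    # Parse a JSON file from disk and return the resulting object
    with open(path, "r") as f:
        return json.load(f)

def dump_dataset(path, data):
    # Serialize data as JSON, creating or overwriting the file at path
    with open(path, "w") as f:
        json.dump(data, f)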