def _prepare_code2vec(args, params):
    root_path = extract_root_path(args["raw_dataset"])
    path_contexts = {}
    datasets = {}

    def compute_fold_path(label):
        return "{}_{}.json".format(root_path, label)

    # Extract AST path contexts for every fold.
    for fold in ["dev", "test", "train"]:
        fold_path = compute_fold_path(fold)
        datasets[fold] = load_dataset(fold_path)
        destination = Path.cwd() / "data" / "code" / fold
        dataset = SourceDataset(args, datasets[fold])
        result = dataset.preprocess_ast(path=destination, force_rewrite=False)
        path_contexts[fold] = result

    # Match the dev/test path contexts against the train contexts,
    # then persist the code2vec index for each fold.
    train_data = path_contexts["train"]
    for fold in ["dev", "test"]:
        fold_data = match_ast_data(path_contexts[fold], train_data)
        dataset = save_code2vec_index(datasets[fold], fold_data, fold)
        dump_dataset(compute_fold_path(fold), dataset)

    dataset = save_code2vec_index(datasets["train"], train_data, "train")
    dump_dataset(compute_fold_path("train"), dataset)
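# _prepare_code2vec() (and several functions below) relies on extract_root_path()
# to turn the raw dataset path into the prefix from which the per-fold files are
# derived ("<root>_train.json", "<root>_dev.json", ...). A minimal sketch of that
# helper, assuming the raw dataset is a single JSON file; the project's actual
# implementation may differ.
import os


def extract_root_path(raw_dataset_path):
    """Strip the extension so per-fold paths can be derived from the prefix.

    e.g. "data/source/dataset.json" -> "data/source/dataset", from which
    compute_fold_path("train") builds "data/source/dataset_train.json".
    """
    root, _ = os.path.splitext(raw_dataset_path)
    return root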
def main():
    git_repos = {"repo_full_name": []}
    for query_url in QUERY_URL_TEMPLATES:  # Query templates
        query_url_page = query_url + "&page="
        resp = requests.get(query_url, auth=('raresraf', TOKEN)).json()
        NO_REQUESTS = min(int(resp["total_count"] / PER_PAGE),
                          int(GITHUB_MAX_RESULTS / PER_PAGE))
        # Iterate NO_REQUESTS + 1 times (the last page may be only partially full).
        for i in range(1, NO_REQUESTS + 1 + 1):
            print("[ITERATION] ", i, "/", NO_REQUESTS + 1)
            query_url_page_formatted = query_url_page + str(i)
            resp = requests.get(query_url_page_formatted,
                                auth=('raresraf', TOKEN)).json()
            # Retry in case of failure, at most MAX_RETRY times.
            retry = MAX_RETRY
            while retry > 0 and not resp.get("items", None):
                print("... [retry]", MAX_RETRY - retry + 1, "/", MAX_RETRY)
                resp = requests.get(query_url_page + str(i),
                                    auth=('raresraf', TOKEN)).json()
                sleep(10 * SLEEP_TIME_BETWEEN_API_CALLS_SEC)
                retry = retry - 1
            if retry == 0:
                continue
            for r in resp["items"]:
                git_repos["repo_full_name"].append(r["full_name"])
            sleep(SLEEP_TIME_BETWEEN_API_CALLS_SEC)

    # Write output to REPOSITORIES_OUTPUT_PATH
    dump_dataset(REPOSITORIES_OUTPUT_PATH, git_repos)
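# The scraper above assumes a handful of module-level constants (plus `requests`
# and `time.sleep` imports). The values below are illustrative only, a sketch
# rather than the project's actual configuration. GitHub's Search API returns at
# most 1000 results per query, which is why the page count is capped via
# GITHUB_MAX_RESULTS.
QUERY_URL_TEMPLATES = [
    # Hypothetical search query: popular C repositories.
    "https://api.github.com/search/repositories?q=language:c&sort=stars&per_page=100",
]
TOKEN = "<github-personal-access-token>"  # placeholder; never commit a real token
PER_PAGE = 100
GITHUB_MAX_RESULTS = 1000
MAX_RETRY = 3
SLEEP_TIME_BETWEEN_API_CALLS_SEC = 2
REPOSITORIES_OUTPUT_PATH = "./data/repositories.json"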
def closed(self, spider):
    print("\n[donations_spider] closed() ")
    print("Unidentified Pages:\n")
    print(self.unidentified)
    dump_dataset("./data/unidentified_pages.json", self.unidentified)
    dump_dataset("./data/donations.json", self.all_donations)
    # For debugging, print the names of all columns.
    print_defaultdict(self.all_column_types)
def _prepare_safe(args, params):
    params = args["features"]["types"]["safe"]
    safe = SAFE(params["model"], params["instr_conv"], params["max_instr"])
    root_path = extract_root_path(args["raw_dataset"])
    # Compute SAFE embeddings for every fold and overwrite the fold file in place.
    for ds_type in ["test", "dev", "train"]:
        path = "{}_{}.json".format(root_path, ds_type)
        data = load_dataset(path)
        data = compute_safe_embeddings(args, safe, data)
        dump_dataset(path, data)
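# _prepare_safe() reads its SAFE settings from args["features"]["types"]["safe"]
# and ignores the params argument it receives. A hedged sketch of the expected
# args structure with hypothetical paths/values, showing only the keys the
# function actually dereferences:
def _example_safe_args():
    return {
        "raw_dataset": "data/binary/dataset.json",       # hypothetical path
        "features": {
            "types": {
                "safe": {
                    "model": "safe/model.ckpt",          # hypothetical checkpoint
                    "instr_conv": "safe/i2v_dict.json",  # hypothetical instruction mapping
                    "max_instr": 150,                    # hypothetical instruction cap
                }
            }
        },
    }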
def main():
    # Read input from REPOSITORIES_OUTPUT_PATH
    data = load_dataset(REPOSITORIES_OUTPUT_PATH)

    # Remove duplicates from data["repo_full_name"].
    # This preserves list order.
    seen = set()
    result = []
    for item in data["repo_full_name"]:
        if item not in seen:
            seen.add(item)
            result.append(item)
    data["repo_full_name"] = result

    # Write output to REPOSITORIES_OUTPUT_PATH
    dump_dataset(REPOSITORIES_OUTPUT_PATH, data)
def split_dataset(args):
    root_path = extract_root_path(args["raw_dataset"])
    new_path = "{}_prepared.json".format(root_path)
    dataset = load_dataset(new_path)

    # Custom dataset preprocessing module
    handler = identify_data_handler(args)
    dataset = handler(args, dataset)

    # Split train/dev/test
    data_split = dataset.split_data(verbose=True)
    for key in data_split:
        split_path = "{}_{}.json".format(root_path, key)
        dump_dataset(split_path, data_split[key])
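# split_dataset() expects "<root>_prepared.json" to already exist next to the raw
# dataset and writes one "<root>_<split>.json" file per key returned by
# split_data(). A small sketch of that naming contract, with a hypothetical root
# path and the split names assumed elsewhere in this pipeline:
def _example_split_paths(root_path="data/source/dataset"):
    prepared = "{}_prepared.json".format(root_path)  # input consumed by split_dataset()
    splits = ["train", "dev", "test"]                # assumed keys of data_split
    outputs = ["{}_{}.json".format(root_path, s) for s in splits]
    return prepared, outputs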
def prepare_input(args):
    pipeline = SourcePipeline(args).init_from_model_config()
    res_path = get_model_path(args)
    root_path = extract_root_path(args["raw_dataset"])
    for ds_type in ["test", "dev", "train"]:
        path = "{}_{}.json".format(root_path, ds_type)
        if not os.path.exists(path):
            logging.critical("Expected {} file in path {}.".format(
                ds_type, path))
            exit(1)
        data = load_dataset(path)
        X, Y = pipeline.run(data)
        if isinstance(X, tuple):
            X, X_meta = X
            meta_path = res_path / "X_{}_meta.json".format(ds_type)
            dump_dataset(meta_path, X_meta)
        if len(X[0]) != len(Y):
            logging.critical("X num samples - {} \n".format(len(X[0])))
            logging.critical("Y num samples - {} \n".format(len(Y)))
            logging.critical("Total num samples - {}\n".format(len(data)))
            raise Exception(
                "Mismatch between number of input and output samples")
        input_path = res_path / "X_{}.json".format(ds_type)
        dump_dataset(input_path, X)
        output_path = res_path / "Y_{}.json".format(ds_type)
        dump_dataset(output_path, Y)
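# prepare_input() accepts pipelines whose run() returns either (X, Y) or
# ((X, X_meta), Y); in the latter case the metadata is written to a separate
# "X_<fold>_meta.json" file, and X[0] is expected to hold one entry per sample.
# A minimal stub (not the real SourcePipeline) illustrating that contract with
# hypothetical record fields:
class _StubPipeline:
    def run(self, data):
        X = [[record["tokens"] for record in data]]  # hypothetical feature matrix
        X_meta = {"vocab_size": 10000}               # hypothetical metadata
        Y = [record["label"] for record in data]     # hypothetical targets
        return (X, X_meta), Y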
def closed(self, spider):
    dump_dataset("./data/mo_list.json", self.targets)
def serialize(self, ds_path=None):
    dump_dataset(ds_path, self.data)
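# load_dataset()/dump_dataset() serve throughout these functions as the
# (de)serialization layer for the JSON files on disk. A minimal sketch, assuming
# plain JSON; the project's actual helpers may add encoding or logging details.
import json


def load_dataset(path):
    """Read a JSON dataset from disk."""
    with open(path, "r") as f:
        return json.load(f)


def dump_dataset(path, data):
    """Write a dataset to disk as JSON."""
    with open(path, "w") as f:
        json.dump(data, f)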