def run_experiment(dataset_id: str):
    """Run the team-label prediction experiment for *dataset_id*.

    Loads the graph index, records the selected samples for reproducibility,
    builds GraKeL graphs, generates fixed cross-validation splits, scores
    classifiers on application data and network metrics as well as all
    kernels, and pickles the combined scores to
    ``outputs/<dataset_id>/scoring.pickled``.
    """
    # Function-scope import: pd.concat replaces DataFrame.append,
    # which was deprecated in pandas 1.4 and removed in pandas 2.0.
    import pandas as pd

    ROOT_DIR = Path(__file__).parents[2]
    dataset_folder = ROOT_DIR / "datasets" / dataset_id
    outputs_folder = ROOT_DIR / "outputs" / dataset_id
    output_filepath = outputs_folder / "scoring.pickled"
    selected_samples_filepath = outputs_folder / "selected.csv"

    graphs_index = load_graph_index(dataset_id)

    print(f"> Testing predicting the team labels for {dataset_id}...")
    # This dataset is balanced and does not need balancing
    print(
        "Current number of players in each team:\n",
        graphs_index.label.value_counts(),
    )
    selected_graphs = graphs_index
    # Save the list of selected graphs so the experiment can be reproduced
    selected_graphs.graph_file.to_csv(selected_samples_filepath)

    print(f"Generating GraKeL graphs for {len(selected_graphs)} files")
    selected_graphs = build_grakel_graphs(selected_graphs, dataset_folder)

    cv_sets = get_fixed_CV_sets(selected_graphs, selected_graphs.label)
    print(f"Generated {len(cv_sets)} cross-validation train/test sets.")

    results = test_prediction_on_classifiers(
        selected_graphs[POKEMON_GO_DATA_COLUMNS], selected_graphs.label, cv_sets
    )
    pna_results = test_prediction_on_classifiers(
        selected_graphs[NETWORK_METRIC_NAMES],
        selected_graphs.label,
        cv_sets,
        test_prefix="PNA-",
    )
    pna_results["time"] = selected_graphs.timings_PNA.sum()

    # Collect all partial result frames and concatenate them once
    # (DataFrame.append was removed in pandas 2.0).
    results = pd.concat(
        [
            results,
            pna_results,
            test_prediction_on_Grakel_kernels(
                selected_graphs, "label", cv_sets, ignore_kernels={"GK-GSamp"}
            ),
            test_prediction_on_kernels(
                selected_graphs, outputs_folder, "label", cv_sets
            ),
        ],
        ignore_index=True,
    )

    print("Saving scoring to:", output_filepath)
    results.to_pickle(output_filepath)
def main(output_folder: str, kernel: str, y_column: str, to_level: int):
    """Train a provenance-kernel pipeline on the previously selected graphs.

    Re-loads the graph selection recorded in ``<output_folder>/selected.csv``,
    fits the pipeline for the given kernel/level on column *y_column*, and
    pickles the fitted model under ``<output_folder>/models/``.
    """
    output_path = Path(output_folder)
    dataset_id = output_path.name
    graphs_index = load_graph_index(dataset_id)

    # Reproduce the exact sample selection used when the experiment was run
    selection = pd.read_csv(output_path / "selected.csv", index_col=0)
    selected_graphs = graphs_index.iloc[selection.index].copy()

    print(f"Training {kernel}-{to_level} on {len(selected_graphs)} {dataset_id} graphs...")
    model = train_provenance_kernel_pipeline(
        selected_graphs, output_path, kernel, to_level, y_column
    )

    # Persist the fitted pipeline alongside the experiment outputs
    models_folder = output_path / "models"
    models_folder.mkdir(parents=True, exist_ok=True)
    model_path = models_folder / f"model_{kernel}_{to_level}.pickled"
    with model_path.open("wb") as fh:
        pickle.dump(model, fh)
def run_experiment(dataset_id: str):
    """Run the data-quality (trusted) prediction experiment for *dataset_id*.

    Balances the dataset on the ``trusted`` attribute, records the selection,
    builds GraKeL graphs, exports the application-data and network-metric
    feature tables (as ``Class``-labelled pickles), then scores classifiers
    and kernels over fixed CV splits, pickling the combined scores to
    ``outputs/<dataset_id>/scoring.pickled``.
    """
    ROOT_DIR = Path(__file__).parents[2]
    dataset_folder = ROOT_DIR / "datasets" / dataset_id
    outputs_folder = ROOT_DIR / "outputs" / dataset_id
    output_filepath = outputs_folder / "scoring.pickled"
    selected_samples_filepath = outputs_folder / "selected.csv"

    graphs_index = load_graph_index(dataset_id)

    print(f"> Testing predicting the data quality labels for {dataset_id}...")
    # Balancing the dataset on the trusted attribute: keep every untrusted
    # graph and sample an equal number of trusted ones.
    print("Current number of trusted values:\n", graphs_index.trusted.value_counts())
    untrusted = graphs_index[graphs_index.trusted == False]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    selected_graphs = pd.concat(
        [
            untrusted,
            graphs_index[graphs_index.trusted == True].sample(len(untrusted)),
        ]
    )
    print(
        "Number of trusted values in selected graphs:\n",
        selected_graphs.trusted.value_counts(),
    )
    # Save the selection so the experiment can be reproduced later
    selected_graphs.graph_file.to_csv(selected_samples_filepath)

    print(f"Generating GraKeL graphs for {len(selected_graphs)} files")
    selected_graphs = build_grakel_graphs(selected_graphs, dataset_folder)

    # ---------- Ayah's code ----------------------------------------------
    # Export two labelled feature tables: application data (all columns
    # before the first network metric) and the network metrics themselves.
    selected_graphs_filepath = outputs_folder / f"{dataset_id}.pickled"
    selected_graphs_filepath_2 = outputs_folder / f"{dataset_id}-Analytics.pickled"
    index = selected_graphs.columns.get_loc(NETWORK_METRIC_NAMES[0])
    applicationdata = selected_graphs.columns[:index]
    X = selected_graphs[applicationdata]
    y = selected_graphs.trusted
    data = pd.concat([X, y], axis=1)
    data = data.rename(columns={"trusted": "Class"})
    data.to_pickle(selected_graphs_filepath)

    X = selected_graphs[NETWORK_METRIC_NAMES]
    y = selected_graphs.trusted
    data = pd.concat([X, y], axis=1)
    data = data.rename(columns={"trusted": "Class"})
    data.to_pickle(selected_graphs_filepath_2)
    # --------------------------------------------------------------------

    cv_sets = get_fixed_CV_sets(selected_graphs, selected_graphs.trusted)
    print(f"Generated {len(cv_sets)} cross-validation train/test sets.")

    results = test_prediction_on_classifiers(
        selected_graphs[NETWORK_METRIC_NAMES],
        selected_graphs.trusted,
        cv_sets,
        test_prefix="PNA-",
    )
    results["time"] = selected_graphs.timings_PNA.sum()
    # Concatenate all result frames at once (DataFrame.append removed in pandas 2.0)
    results = pd.concat(
        [
            results,
            test_prediction_on_Grakel_kernels(selected_graphs, "trusted", cv_sets),
            test_prediction_on_kernels(
                selected_graphs, outputs_folder, "trusted", cv_sets
            ),
        ],
        ignore_index=True,
    )

    print("Saving scoring to:", output_filepath)
    results.to_pickle(output_filepath)
    test_prediction_on_classifiers,
    test_prediction_on_kernels,
    test_prediction_on_Grakel_kernels,
)
from scripts.utils import load_graph_index

# Experiment configuration: fixed MIMIC dataset, with all paths derived
# from the repository root (two levels above this script).
ROOT_DIR = Path(__file__).parents[2]
data_folder = ROOT_DIR / "datasets"
dataset_id = "MIMIC-PXC7"
dataset_folder = ROOT_DIR / "datasets" / dataset_id
outputs_folder = ROOT_DIR / "outputs" / dataset_id
output_filepath = outputs_folder / "scoring.pickled"
selected_samples_filepath = outputs_folder / "selected.csv"

graphs_index = load_graph_index(dataset_id)

print("> Testing predicting a patient is dead at the end of this admission")

if selected_samples_filepath.exists():
    # Loading the previously saved balanced dataset to reproduce the same experiment
    selected_graphfiles = pd.read_csv(selected_samples_filepath, index_col=0)
    # NOTE(review): iloc with the saved index works only while graphs_index
    # keeps a default RangeIndex (labels == positions) — verify on reload.
    selected_graphs = graphs_index.iloc[selected_graphfiles.index].copy()
else:
    # This is the first time we run this experiment
    # Selecting relevant graphs and balancing the dataset:
    # keep every "dead" graph and sample an equal number of survivors.
    print(" - Current number of dead values:\n", graphs_index.dead.value_counts())
    selected_graphs = graphs_index[graphs_index.dead == True]
    # NOTE(review): DataFrame.append was removed in pandas 2.0 — this needs
    # pd.concat when upgrading pandas.
    selected_graphs = selected_graphs.append(
        graphs_index[graphs_index.dead == False].sample(len(selected_graphs)))
def run_experiment(dataset_id: str):
    """Run the data-quality (trusted) prediction experiment for *dataset_id*.

    On first run, balances the dataset on the ``trusted`` attribute and saves
    the selection to ``selected.csv``; later runs reload that selection to
    reproduce the same experiment. Builds GraKeL graphs, generates fixed CV
    splits, scores classifiers and kernels, and pickles the combined scores
    to ``outputs/<dataset_id>/scoring.pickled``.
    """
    # Function-scope import: pd is needed for read_csv and for pd.concat,
    # which replaces DataFrame.append (removed in pandas 2.0).
    import pandas as pd

    ROOT_DIR = Path(__file__).parents[2]
    dataset_folder = ROOT_DIR / "datasets" / dataset_id
    outputs_folder = ROOT_DIR / "outputs" / dataset_id
    output_filepath = outputs_folder / "scoring.pickled"
    selected_samples_filepath = outputs_folder / "selected.csv"

    graphs_index = load_graph_index(dataset_id)

    print(f"> Testing predicting the data quality labels for {dataset_id}...")
    if selected_samples_filepath.exists():
        # Loading the previously saved balanced dataset to reproduce the same experiment
        selected_graphfiles = pd.read_csv(selected_samples_filepath, index_col=0)
        selected_graphs = graphs_index.iloc[selected_graphfiles.index].copy()
    else:
        # This is the first time we run this experiment
        # Balancing the dataset on the trusted attribute: keep every
        # untrusted graph and sample an equal number of trusted ones.
        print(
            " - Current number of trusted values:\n",
            graphs_index.trusted.value_counts(),
        )
        untrusted = graphs_index[graphs_index.trusted == False]
        # pd.concat replaces DataFrame.append, removed in pandas 2.0
        selected_graphs = pd.concat(
            [
                untrusted,
                graphs_index[graphs_index.trusted == True].sample(len(untrusted)),
            ]
        )
        print(
            " - Number of trusted values in selected graphs:\n",
            selected_graphs.trusted.value_counts(),
        )
        # saving the list of selected graphs for later reproduction of this experiment
        selected_graphs.graph_file.to_csv(selected_samples_filepath)

    print(f"> Generating GraKeL graphs for {len(selected_graphs)} files")
    selected_graphs = build_grakel_graphs(selected_graphs, dataset_folder)

    cv_sets = get_fixed_CV_sets(
        selected_graphs, selected_graphs.trusted, output_path=outputs_folder
    )
    print(f"> Got {len(cv_sets)} cross-validation train/test sets.")

    results = test_prediction_on_classifiers(
        selected_graphs[NETWORK_METRIC_NAMES],
        outputs_folder,
        selected_graphs.trusted,
        cv_sets,
        test_prefix="PNA-",
    )
    results["time"] = selected_graphs.timings_PNA.sum()
    # Concatenate all result frames at once (DataFrame.append removed in pandas 2.0)
    results = pd.concat(
        [
            results,
            test_prediction_on_Grakel_kernels(
                selected_graphs, outputs_folder, "trusted", cv_sets
            ),
            test_prediction_on_kernels(
                selected_graphs, outputs_folder, "trusted", cv_sets
            ),
        ],
        ignore_index=True,
    )

    print("> Saving scoring to:", output_filepath)
    results.to_pickle(output_filepath)