Esempio n. 1
0
def run_experiment(dataset_id: str):
    """Run the team-label prediction experiment for ``dataset_id``.

    The function loads the dataset's graph index, records the graph files
    used in ``selected.csv``, builds the GraKeL graphs and then scores, on
    the same cross-validation sets:

    * classifiers on the ``POKEMON_GO_DATA_COLUMNS`` columns,
    * classifiers on the ``NETWORK_METRIC_NAMES`` columns (``PNA-`` prefix),
    * the GraKeL kernels, ignoring ``GK-GSamp``,
    * the kernels evaluated by ``test_prediction_on_kernels``.

    The score tables are stacked in that order and pickled to
    ``outputs/<dataset_id>/scoring.pickled``.

    Args:
        dataset_id: Name of the dataset; used as the sub-folder name under
            both ``datasets`` and ``outputs``.
    """
    # Local import: this block does not otherwise reference ``pd`` and the
    # module-level imports are not visible from here.
    import pandas as pd

    ROOT_DIR = Path(__file__).parents[2]

    dataset_folder = ROOT_DIR / "datasets" / dataset_id
    outputs_folder = ROOT_DIR / "outputs" / dataset_id
    output_filepath = outputs_folder / "scoring.pickled"
    selected_samples_filepath = outputs_folder / "selected.csv"

    graphs_index = load_graph_index(dataset_id)

    print(f"> Testing predicting the team labels for {dataset_id}...")

    # This dataset is balanced and does not need balancing
    print(
        "Current number of players in each team:\n", graphs_index.label.value_counts()
    )
    selected_graphs = graphs_index
    # The outputs folder may not exist on a first run; create it before the
    # first file is written into it.
    outputs_folder.mkdir(parents=True, exist_ok=True)
    selected_graphs.graph_file.to_csv(selected_samples_filepath)

    print(f"Generating GraKeL graphs for {len(selected_graphs)} files")
    selected_graphs = build_grakel_graphs(selected_graphs, dataset_folder)

    cv_sets = get_fixed_CV_sets(selected_graphs, selected_graphs.label)
    print(f"Generated {len(cv_sets)} cross-validation train/test sets.")

    app_data_results = test_prediction_on_classifiers(
        selected_graphs[POKEMON_GO_DATA_COLUMNS], selected_graphs.label, cv_sets
    )

    pna_results = test_prediction_on_classifiers(
        selected_graphs[NETWORK_METRIC_NAMES],
        selected_graphs.label,
        cv_sets,
        test_prefix="PNA-",
    )
    # Every PNA row gets the total time recorded in ``timings_PNA``.
    pna_results["time"] = selected_graphs.timings_PNA.sum()

    grakel_results = test_prediction_on_Grakel_kernels(
        selected_graphs, "label", cv_sets, ignore_kernels={"GK-GSamp"}
    )

    kernel_results = test_prediction_on_kernels(
        selected_graphs, outputs_folder, "label", cv_sets
    )

    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` stacks the
    # tables in the same order with a fresh 0..n-1 index.
    # NOTE(review): assumes every helper above returns a DataFrame - confirm.
    results = pd.concat(
        [app_data_results, pna_results, grakel_results, kernel_results],
        ignore_index=True,
    )

    print("Saving scoring to:", output_filepath)
    results.to_pickle(output_filepath)
Esempio n. 2
0
def main(output_folder: str, kernel: str, y_column: str, to_level: int):
    """Train one kernel pipeline on the saved graph selection and pickle it.

    The dataset identifier is taken from the last component of
    ``output_folder``.  The rows listed in ``<output_folder>/selected.csv``
    are picked from the dataset's graph index, passed to
    ``train_provenance_kernel_pipeline``, and the object it returns is
    pickled to ``<output_folder>/models/model_<kernel>_<to_level>.pickled``.

    Args:
        output_folder: Folder holding ``selected.csv``; its name is the
            dataset identifier.
        kernel: Kernel name forwarded to the training helper and used in the
            model file name.
        y_column: Name of the target column forwarded to the training helper.
        to_level: Level forwarded to the training helper and used in the
            model file name.
    """
    results_dir = Path(output_folder)
    dataset_id = results_dir.name
    full_index = load_graph_index(dataset_id)

    # The first CSV column holds the index values saved with the selection;
    # they are used as positions into the full graph index.
    saved_selection = pd.read_csv(results_dir / "selected.csv", index_col=0)
    selected_graphs = full_index.iloc[saved_selection.index].copy()

    n_graphs = len(selected_graphs)
    print(f"Training {kernel}-{to_level} on {n_graphs} {dataset_id} graphs...")
    trained_model = train_provenance_kernel_pipeline(
        selected_graphs, results_dir, kernel, to_level, y_column
    )

    models_dir = results_dir / "models"
    models_dir.mkdir(parents=True, exist_ok=True)
    target = models_dir / f"model_{kernel}_{to_level}.pickled"
    with target.open("wb") as model_file:
        pickle.dump(trained_model, model_file)
Esempio n. 3
0
def run_experiment(dataset_id: str):
    """Run the ``trusted``-label prediction experiment for ``dataset_id``.

    Steps:

    1. Load the graph index and balance it on ``trusted``: keep every
       untrusted row and add an equally sized random sample of trusted rows.
       The graph files of the selection are written to ``selected.csv``.
    2. Build the GraKeL graphs for the selection.
    3. Export two pickled tables, each with the ``trusted`` column renamed to
       ``Class``: ``<dataset_id>.pickled`` with the columns that precede the
       first network-metric column, and ``<dataset_id>-Analytics.pickled``
       with the ``NETWORK_METRIC_NAMES`` columns.
    4. Score classifiers on the network metrics (``PNA-`` prefix), the GraKeL
       kernels and the kernels evaluated by ``test_prediction_on_kernels`` on
       the same cross-validation sets, and pickle the stacked score table to
       ``scoring.pickled``.

    Args:
        dataset_id: Name of the dataset; used as the sub-folder name under
            both ``datasets`` and ``outputs``.
    """
    ROOT_DIR = Path(__file__).parents[2]

    dataset_folder = ROOT_DIR / "datasets" / dataset_id
    outputs_folder = ROOT_DIR / "outputs" / dataset_id
    output_filepath = outputs_folder / "scoring.pickled"
    selected_samples_filepath = outputs_folder / "selected.csv"

    graphs_index = load_graph_index(dataset_id)

    print(f"> Testing predicting the data quality labels for {dataset_id}...")

    # Balancing the dataset on the trusted attribute
    print("Current number of trusted values:\n",
          graphs_index.trusted.value_counts())
    # Element-wise comparisons on the column, hence ``== False`` / ``== True``.
    untrusted_graphs = graphs_index[graphs_index.trusted == False]
    trusted_sample = graphs_index[graphs_index.trusted == True].sample(
        len(untrusted_graphs))
    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` keeps the
    # original index labels exactly as ``append`` did.
    selected_graphs = pd.concat([untrusted_graphs, trusted_sample])
    print(
        "Number of trusted values in selected graphs:\n",
        selected_graphs.trusted.value_counts(),
    )
    # The outputs folder may not exist on a first run; create it before the
    # first file is written into it.
    outputs_folder.mkdir(parents=True, exist_ok=True)
    selected_graphs.graph_file.to_csv(selected_samples_filepath)

    print(f"Generating GraKeL graphs for {len(selected_graphs)} files")
    selected_graphs = build_grakel_graphs(selected_graphs, dataset_folder)

    # ---------- Ayah's code ----------------------------------------------
    selected_graphs_filepath = outputs_folder / f"{dataset_id}.pickled"
    selected_graphs_filepath_2 = outputs_folder / f"{dataset_id}-Analytics.pickled"
    # Every column located before the first network metric column.
    # NOTE(review): presumably these are the application-data columns - confirm.
    index = selected_graphs.columns.get_loc(NETWORK_METRIC_NAMES[0])
    applicationdata = selected_graphs.columns[:index]

    # Both exports have the same shape: feature columns plus the target,
    # with the target column renamed to "Class".
    exports = (
        (applicationdata, selected_graphs_filepath),
        (NETWORK_METRIC_NAMES, selected_graphs_filepath_2),
    )
    for feature_columns, export_filepath in exports:
        X = selected_graphs[feature_columns]
        y = selected_graphs.trusted

        data = pd.concat([X, y], axis=1)
        data = data.rename(columns={"trusted": "Class"})
        data.to_pickle(export_filepath)
    # --------------------------------------------------------------------

    cv_sets = get_fixed_CV_sets(selected_graphs, selected_graphs.trusted)
    print(f"Generated {len(cv_sets)} cross-validation train/test sets.")

    pna_results = test_prediction_on_classifiers(
        selected_graphs[NETWORK_METRIC_NAMES],
        selected_graphs.trusted,
        cv_sets,
        test_prefix="PNA-",
    )
    # Every PNA row gets the total time recorded in ``timings_PNA``.
    pna_results["time"] = selected_graphs.timings_PNA.sum()

    grakel_results = test_prediction_on_Grakel_kernels(
        selected_graphs, "trusted", cv_sets)

    kernel_results = test_prediction_on_kernels(
        selected_graphs, outputs_folder, "trusted", cv_sets)

    # Stack the score tables in evaluation order with a fresh 0..n-1 index.
    # NOTE(review): assumes every helper above returns a DataFrame - confirm.
    results = pd.concat(
        [pna_results, grakel_results, kernel_results], ignore_index=True
    )

    print("Saving scoring to:", output_filepath)
    results.to_pickle(output_filepath)
    test_prediction_on_classifiers,
    test_prediction_on_kernels,
    test_prediction_on_Grakel_kernels,
)
from scripts.utils import load_graph_index

# Repository root: two levels above the folder containing this script.
ROOT_DIR = Path(__file__).parents[2]

# Locations of the input dataset and of the files produced by the experiment.
data_folder = ROOT_DIR / "datasets"
dataset_id = "MIMIC-PXC7"
dataset_folder = ROOT_DIR / "datasets" / dataset_id
outputs_folder = ROOT_DIR / "outputs" / dataset_id
output_filepath = outputs_folder / "scoring.pickled"
selected_samples_filepath = outputs_folder / "selected.csv"

graphs_index = load_graph_index(dataset_id)

print("> Testing predicting a patient is dead at the end of this admission")

if selected_samples_filepath.exists():
    # Loading the previously saved balanced dataset to reproduce the same experiment
    selected_graphfiles = pd.read_csv(selected_samples_filepath, index_col=0)
    # The saved index values are used as positions into the graph index.
    selected_graphs = graphs_index.iloc[selected_graphfiles.index].copy()
else:
    # This is the first time we run this experiment
    # Selecting relevant graphs and balancing the dataset
    print(" - Current number of dead values:\n",
          graphs_index.dead.value_counts())
    # Keep every "dead" row and add an equally sized random sample of the
    # other rows.  ``DataFrame.append`` was removed in pandas 2.0;
    # ``pd.concat`` keeps the original index labels exactly as it did.
    selected_graphs = graphs_index[graphs_index.dead == True]
    selected_graphs = pd.concat([
        selected_graphs,
        graphs_index[graphs_index.dead == False].sample(len(selected_graphs)),
    ])
def run_experiment(dataset_id: str):
    """Run the reproducible ``trusted``-label experiment for ``dataset_id``.

    If ``outputs/<dataset_id>/selected.csv`` already exists, the selection it
    describes is reused.  Otherwise the graph index is balanced on
    ``trusted`` (every untrusted row plus an equally sized random sample of
    trusted rows) and the selection is saved to that file for later runs.

    The selection is converted to GraKeL graphs and scored on shared
    cross-validation sets with classifiers on ``NETWORK_METRIC_NAMES``
    (``PNA-`` prefix), the GraKeL kernels and the kernels evaluated by
    ``test_prediction_on_kernels``.  The stacked score table is pickled to
    ``outputs/<dataset_id>/scoring.pickled``.

    Args:
        dataset_id: Name of the dataset; used as the sub-folder name under
            both ``datasets`` and ``outputs``.
    """
    ROOT_DIR = Path(__file__).parents[2]

    dataset_folder = ROOT_DIR / "datasets" / dataset_id
    outputs_folder = ROOT_DIR / "outputs" / dataset_id
    output_filepath = outputs_folder / "scoring.pickled"
    selected_samples_filepath = outputs_folder / "selected.csv"

    graphs_index = load_graph_index(dataset_id)

    print(f"> Testing predicting the data quality labels for {dataset_id}...")

    if selected_samples_filepath.exists():
        # Loading the previously saved balanced dataset to reproduce the same experiment
        selected_graphfiles = pd.read_csv(selected_samples_filepath,
                                          index_col=0)
        # The saved index values are used as positions into the graph index.
        selected_graphs = graphs_index.iloc[selected_graphfiles.index].copy()
    else:
        # This is the first time we run this experiment
        # Balancing the dataset on the trusted attribute
        print(
            " - Current number of trusted values:\n",
            graphs_index.trusted.value_counts(),
        )
        # Element-wise comparisons on the column, hence ``== False`` / ``== True``.
        untrusted_graphs = graphs_index[graphs_index.trusted == False]
        trusted_sample = graphs_index[graphs_index.trusted == True].sample(
            len(untrusted_graphs))
        # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` keeps
        # the original index labels exactly as ``append`` did.
        selected_graphs = pd.concat([untrusted_graphs, trusted_sample])
        print(
            " - Number of trusted values in selected graphs:\n",
            selected_graphs.trusted.value_counts(),
        )
        # saving the list of selected graphs for later reproduction of this experiment
        # (the outputs folder may not exist yet on a first run)
        outputs_folder.mkdir(parents=True, exist_ok=True)
        selected_graphs.graph_file.to_csv(selected_samples_filepath)

    print(f"> Generating GraKeL graphs for {len(selected_graphs)} files")
    selected_graphs = build_grakel_graphs(selected_graphs, dataset_folder)

    cv_sets = get_fixed_CV_sets(selected_graphs,
                                selected_graphs.trusted,
                                output_path=outputs_folder)
    print(f"> Got {len(cv_sets)} cross-validation train/test sets.")

    pna_results = test_prediction_on_classifiers(
        selected_graphs[NETWORK_METRIC_NAMES],
        outputs_folder,
        selected_graphs.trusted,
        cv_sets,
        test_prefix="PNA-",
    )
    # Every PNA row gets the total time recorded in ``timings_PNA``.
    pna_results["time"] = selected_graphs.timings_PNA.sum()

    grakel_results = test_prediction_on_Grakel_kernels(
        selected_graphs, outputs_folder, "trusted", cv_sets)

    kernel_results = test_prediction_on_kernels(
        selected_graphs, outputs_folder, "trusted", cv_sets)

    # Stack the score tables in evaluation order with a fresh 0..n-1 index.
    # NOTE(review): assumes every helper above returns a DataFrame - confirm.
    results = pd.concat(
        [pna_results, grakel_results, kernel_results], ignore_index=True
    )

    print("> Saving scoring to:", output_filepath)
    results.to_pickle(output_filepath)