Example #1
import pickle

import networkx as nx
import numpy as np
import pandas as pd
import sklearn.model_selection
from scipy import sparse

# parse_args, TCGAMeta, GeneManiaGraph, GCN, get_every_n and report_results are
# assumed to be provided by the surrounding repository.


def main(argv=None):
    opt = parse_args(argv)

    tasks = TCGAMeta(download=True, preload=True)
    task = tasks[113]

    # Setup the results dictionary
    filename = "experiments/results/clinical-tasks.pkl"
    try:
        results = pickle.load(open(filename, "rb"), encoding='latin1')
        print("Loaded Checkpointed Results")
    except Exception as e:
        print(e)
        results = pd.DataFrame(columns=[
            'task', 'acc_metric', 'model', 'graph', 'trial', 'train_size',
            'time_elapsed'
        ])
        print("Created a New Results Dictionary")

    train_size = 50
    trials = 3
    cuda = True
    exp = []

    for trial in range(trials):
        model = GCN(cuda=cuda,
                    dropout=opt.dropout,
                    num_layer=opt.num_layer,
                    channels=opt.channels,
                    embedding=opt.embedding,
                    aggregation=opt.aggregation,
                    lr=opt.lr,
                    agg_reduce=opt.agg_reduce,
                    seed=trial)
        # Normalise the expression matrix: mean-centre, then divide by the variance.
        task._samples = task._samples - task._samples.mean(axis=0)
        task._samples = task._samples / task._samples.var()
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            task._samples,
            task._labels,
            stratify=task._labels,
            train_size=train_size,
            test_size=len(task._labels) - train_size)
        # nx.to_numpy_array replaces to_numpy_matrix, which was removed in networkx 3.0.
        adj = sparse.csr_matrix(nx.to_numpy_array(GeneManiaGraph().nx_graph))
        model.fit(X_train, y_train, adj=adj)

        # Predict the test set in chunks of 10 samples to limit memory use;
        # .numpy() assumes the argmax result is still a torch tensor.
        y_hat = []
        for chunk in get_every_n(X_test, 10):
            y_hat.extend(np.argmax(model.predict(chunk), axis=1).numpy())

        exp.append(model.metric(y_test, y_hat))
        print(exp)
    report_results([{
        "name": "acc_metric",
        "type": "objective",
        "value": np.array(exp).mean()
    }])
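
Example #1 batches the test set through a get_every_n helper that is not shown in the
excerpt. A minimal sketch of such a helper, assuming it simply yields consecutive
slices of the test matrix (the name and behaviour are inferred from the call site):

def get_every_n(data, n=10):
    # Yield consecutive batches of n rows; .iloc handles pandas objects,
    # plain slicing handles numpy arrays.
    for start in range(0, len(data), n):
        yield data.iloc[start:start + n] if hasattr(data, "iloc") else data[start:start + n]
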
Example #2

import time

import sklearn.metrics
import torch

# `datasets` is assumed to be the project's own data module providing TCGADataset;
# parse_args, GCN, GeneManiaGraph and report_results are reused from above.


def main(argv=None):
    opt = parse_args(argv)
    dataset = datasets.TCGADataset()
    dataset.df = dataset.df - dataset.df.mean(axis=0)

    gene_graph = GeneManiaGraph()
    search_num_genes = [50, 100, 200, 300, 500, 1000, 2000, 4000, 8000, 16300]
    test_size = 300  # note: the split below uses opt.train_size / opt.test_size instead
    cuda = torch.cuda.is_available()
    exp = []
    for num_genes in search_num_genes:
        start_time = time.time()
        gene = "RPL4"
        model = GCN(cuda=cuda,
                    dropout=opt.dropout,
                    num_layer=opt.num_layer,
                    channels=opt.channels,
                    embedding=opt.embedding,
                    aggregation=opt.aggregation,
                    lr=opt.lr,
                    agg_reduce=opt.agg_reduce)
        # Binary target: is the (centred) expression of the query gene above its mean?
        dataset.labels = dataset.df[gene].where(
            dataset.df[gene] > 0).notnull().astype("int")
        dataset.labels = dataset.labels.values if isinstance(
            dataset.labels, pd.Series) else dataset.labels
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            dataset.df,
            dataset.labels,
            stratify=dataset.labels,
            train_size=opt.train_size,
            test_size=opt.test_size,
            random_state=opt.seed)
        # The largest setting (16300) uses the full GeneMania graph; otherwise a
        # BFS-sampled neighbourhood of the query gene of the requested size is used.
        if num_genes == 16300:
            neighbors = gene_graph.nx_graph
        else:
            neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)

        X_train = X_train[list(neighbors.nodes)].copy()
        X_test = X_test[list(neighbors.nodes)].copy()
        # Mask the query gene's own expression so the label cannot be read off directly.
        X_train[gene] = 1
        X_test[gene] = 1
        adj = sparse.csr_matrix(nx.to_numpy_array(neighbors))
        model.fit(X_train, y_train, adj=adj)

        y_hat = model.predict(X_test)
        y_hat = np.argmax(y_hat, axis=1)
        auc = sklearn.metrics.roc_auc_score(y_test,
                                            np.asarray(y_hat).flatten())
        del model
        exp.append(auc)
    report_results([{
        "name": "auc",
        "type": "objective",
        "value": np.array(exp).mean()
    }])
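
Example #2 feeds hard argmax labels into roc_auc_score, which collapses the AUC to an
accuracy-like score. If the model's predict method returns per-class scores (an
assumption about the GCN interface, taken here as a two-column array), a threshold-free
AUC can be computed from the positive-class score instead; a small sketch:

import numpy as np
import sklearn.metrics

def score_auc(model, X_test, y_test):
    # Assumption: predict() returns an (n_samples, 2) array or tensor of class scores.
    scores = np.asarray(model.predict(X_test))
    return sklearn.metrics.roc_auc_score(y_test, scores[:, 1])
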
Example #3
        # (The start of this excerpt is truncated; these are the trailing keyword
        # arguments of a model constructor in the first model_name branch.)
                                lr=0.001,
                                num_epochs=100,
                                patience=30,
                                verbose=True,
                                seed=seed,
                                train_valid_split=0.8
                                )
        elif model_name == 'MLP64':
            model = MLP(name="MLP_lay2_chan64", cuda=cuda, dropout=True, num_layer=2, channels=64, train_valid_split=0.8, patience=30, lr=0.001)
        elif model_name == 'MLP64_lr4':
            model = MLP(name="MLP_lay2_chan64_lr.0.0001_nodropout", cuda=cuda, dropout=False, num_layer=2, channels=64, train_valid_split=0.8, patience=30, lr=0.0001)

        try:
            # print(x_train.shape,  y_train.shape, adj.shape)

            model.fit(x_train, y_train, adj=adj)

            with torch.no_grad():
                model.eval()
                y_hat = model.predict(x_test)
                y_hat = np.argmax(y_hat, axis=1)
                # auc = sklearn.metrics.roc_auc_score(y_test, np.asarray(y_hat).flatten(), multi_class='ovo')
                acc = sklearn.metrics.accuracy_score(y_test, np.asarray(y_hat).flatten())
                f1 = sklearn.metrics.f1_score(y_test, np.asarray(y_hat).flatten(), average='macro')

                experiment["model"] = model.name
                experiment["auc"] = 0
                experiment["acc"] = acc
                experiment["f1"] = f1
                experiment["num_genes"] = len(x_train.columns)
        # (The excerpt jumps here: the except clause of the try above and the `if`
        # paired with the `else` below are not shown.)
            neighbors = list(gene_graph.first_degree(gene)[0])
            neighbors = [n for n in neighbors if n in X_train.columns.values]
            X_train = X_train.loc[:, neighbors].copy()
            X_test = X_test.loc[:, neighbors].copy()
        else:
            X_train = X_train.copy()
            X_test = X_test.copy()

        try:
            # Don't include expression of enquired gene?
            # X_train[gene] = 1
            # X_test[gene] = 1

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                model.fit(X_train, y_train, adj)
                model.eval()
                with torch.no_grad():
                    y_hat = model.predict(X_test)
            auc = sklearn.metrics.roc_auc_score(y_test, np.argmax(y_hat,
                                                                  axis=1))
            acc = sklearn.metrics.accuracy_score(y_test,
                                                 np.argmax(y_hat, axis=1))
            print("auc:", auc, " acc: ", acc)
            experiment["auc"] = auc
            experiment["acc"] = acc
            results.append(experiment)
            if auc > best_auc:
                best_auc = copy.deepcopy(auc)
                best_auc_model = copy.deepcopy(model)
            if acc > best_acc:
        # (The excerpt jumps again: the body of this `if` is cut and the source resumes
        # inside another model constructor whose opening arguments are missing.)
                        patience=30,
                        lr=0.001)
        elif model_name == 'MLP64_lr4':
            model = MLP(name="MLP_lay2_chan64_lr.0.0001_nodropout",
                        cuda=cuda,
                        dropout=False,
                        num_layer=2,
                        channels=64,
                        train_valid_split=0.8,
                        patience=30,
                        lr=0.0001)

        try:
            # print(x_train.shape,  y_train.shape, adj.shape)

            model.fit(x_train, y_train, adj=adj, ontology_vectors=emb_vectors)

            with torch.no_grad():
                model.eval()
                y_hat = model.predict(x_test)
                y_hat = np.argmax(y_hat, axis=1)
                # auc = sklearn.metrics.roc_auc_score(y_test, np.asarray(y_hat).flatten(), multi_class='ovo')
                acc = sklearn.metrics.accuracy_score(
                    y_test,
                    np.asarray(y_hat).flatten())
                f1 = sklearn.metrics.f1_score(y_test,
                                              np.asarray(y_hat).flatten(),
                                              average='macro')

                experiment["model"] = model.name
                experiment["auc"] = 0