"seed": seed,
        "train_size": train_size,
    }
    if args.percentile != 50:
        gene_percentile = np.percentile(dataset.df[gene].values, args.percentile)
        dataset.labels = dataset.df[gene].where(dataset.df[gene] > gene_percentile).notnull().astype("int")
    else:
        dataset.labels = dataset.df[gene].where(dataset.df[gene] > 0).notnull().astype("int")
    dataset.labels = dataset.labels.values if type(dataset.labels) == pd.Series else dataset.labels

    # if labels are chosen such that only one class exists, skip the gene
    # otherwise this throws a cuda device-side assert which breaks all
    # subsequent experiments
    if np.unique(dataset.labels).shape[0] == 1:
        experiment['error'] = 'Expression distribution too narrow, skipping'
        results = record_result(results, experiment, filename)
        continue

    try:
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(dataset.df, dataset.labels,
                                                                                    stratify=dataset.labels,
                                                                                    train_size=train_size,
                                                                                    test_size=test_size,
                                                                                    random_state=seed)

    except ValueError:
        results = record_result(results, experiment, filename)
        continue
    if is_first_degree:
        if is_landmark:
            neighbors = landmark_genes
def doMLP():
    """Run a GP-based scikit-optimize hyperparameter search for an MLP.

    Performs up to 10 ask/tell trials over learning rate, channel width,
    and depth.  Each trial trains ``models.mlp.MLP`` on the module-level
    (X_train, y_train) split, scores accuracy on (X_valid, y_valid), and
    records the experiment via ``record_result``.  The test split is only
    scored when the validation metric improves.

    Relies on module-level state defined elsewhere in this file:
    X_train/X_valid/X_test, y_train/y_valid/y_test, args, results,
    filename, record_result, models, skopt, Integer.

    Returns:
        tuple: (test accuracy at the best validation metric, best config
        dict), or (0, None) if no trial ever improved on the initial
        best_valid_metric of 0.
    """
    global results

    # Search space: "lr" and "channels" are exponents, decoded below as
    # 10**-lr and 2**channels respectively.
    skopt_args = collections.OrderedDict()
    skopt_args["lr"] = Integer(2, 5)
    skopt_args["channels"] = Integer(4, 12)
    skopt_args["layers"] = Integer(1, 4)

    optimizer = skopt.Optimizer(dimensions=skopt_args.values(),
                                base_estimator="GP",
                                n_initial_points=3,
                                random_state=args.seed)
    print(skopt_args)

    best_valid_metric = 0
    test_for_best_valid_metric = 0
    best_config = None
    already_done = set()
    for i in range(10):
        suggestion = optimizer.ask()
        # Skip configurations the optimizer has already proposed.
        if str(suggestion) in already_done:
            continue
        already_done.add(str(suggestion))
        sdict = dict(zip(skopt_args.keys(), suggestion))
        sdict["lr"] = 10 ** float(-sdict["lr"])
        sdict["channels"] = 2 ** sdict["channels"]

        model = models.mlp.MLP(name="MLP",
                               num_layer=sdict["layers"],
                               channels=sdict["channels"],
                               lr=sdict["lr"],
                               num_epochs=100,
                               patience=50,
                               cuda=torch.cuda.is_available(),
                               metric=sklearn.metrics.accuracy_score,
                               verbose=False,
                               seed=args.seed)

        model.fit(X_train, y_train)

        y_valid_pred = model.predict(X_valid)
        valid_metric = sklearn.metrics.accuracy_score(
            y_valid, np.argmax(y_valid_pred, axis=1))

        # skopt minimizes, so report the negated accuracy.
        opt_results = optimizer.tell(suggestion, -valid_metric)
        print(opt_results)

        # Record metrics to write and plot; the test set is only
        # evaluated when the validation metric improves.
        if best_valid_metric < valid_metric:
            best_valid_metric = valid_metric
            best_config = sdict

            y_test_pred = model.predict(X_test)
            test_metric = sklearn.metrics.accuracy_score(
                y_test, np.argmax(y_test_pred, axis=1))
            test_for_best_valid_metric = test_metric

        print(i, "This result:", valid_metric, sdict)

        experiment = {
            "model": model.name,
            "graph": "",
            "num_genes": len(list(X_train.columns)),
            "train_size": args.ntrain,
            "seed": args.seed,
            "acc": valid_metric,
            'lr': sdict["lr"],
            'channels': sdict["channels"],
            'embedding': 0,
            'num_layer': sdict["layers"],
            'prepool_extralayers': 0
        }

        results = record_result(results, experiment, filename)

    print("#Final Results", test_for_best_valid_metric, best_config)
    # BUG FIX: previously returned `test_metric`, which is unbound (and
    # raises NameError) when no trial beats the initial best_valid_metric.
    # test_for_best_valid_metric carries the same value whenever
    # test_metric was set, and 0 otherwise — matching the print above
    # and the return convention of doGGC.
    return test_for_best_valid_metric, best_config
def doGGC():
    """Run a GP-based scikit-optimize search for a GAT on ontology gene graphs.

    For each graph, runs up to 100 ask/tell trials over lr, channels,
    num_layer, gat_heads and prepool_extralayers; each trial prunes the
    graph to genes present in the expression matrix, trains a
    ``models.gnn.GCN`` (gnn="GAT") and records validation accuracy via
    ``record_result``.

    Relies on module-level state defined elsewhere in this file:
    X_train/X_valid, y_train/y_valid, features, args, results, filename,
    record_result, data, models, skopt, Integer, sparse, nx, logging.

    Returns:
        (test_for_best_valid_metric, best_config).  NOTE(review): the
        best-config/test-set tracking below is commented out, so this
        currently always returns (0, None) — confirm that is intended.
    """
    gene_graphs = [
        data.gene_graphs.OntologyGraph(neighbors=30,
                                       embeddings_name='dl2vec',
                                       randomize=False,
                                       gene_names=list(features.columns),
                                       relabel_genes=False),
    ]

    for graph in gene_graphs:

        # Full-graph adjacency; overwritten per-trial below once the
        # graph has been pruned to the intersection with X_train columns.
        adj = graph.adj()

        for dropout in [False]:  #, False]:
            import gc
            gc.collect()

            # Search space: "lr", "channels" and "gat_heads" are
            # exponents, decoded below as 10**-lr and 2**value.
            skopt_args = collections.OrderedDict()
            skopt_args["lr"] = Integer(3, 5)
            skopt_args["channels"] = Integer(3, 6)
            # skopt_args["embedding"]=Integer(4, 5)
            skopt_args["num_layer"] = Integer(1, 3)
            skopt_args["gat_heads"] = Integer(1, 3)
            skopt_args["prepool_extralayers"] = Integer(0, 1)

            optimizer = skopt.Optimizer(dimensions=skopt_args.values(),
                                        base_estimator="GP",
                                        n_initial_points=4,
                                        random_state=args.seed)
            print(skopt_args)

            best_valid_metric = 0
            test_for_best_valid_metric = 0
            best_config = None
            already_done = set()

            for i in range(100):
                import gc
                gc.collect()

                suggestion = optimizer.ask()

                # Skip configurations the optimizer already proposed.
                if str(suggestion) in already_done:
                    continue
                already_done.add(str(suggestion))
                sdict = dict(zip(skopt_args.keys(), suggestion))
                sdict["lr"] = 10**float((-sdict["lr"]))
                sdict["channels"] = 2**sdict["channels"]
                sdict["gat_heads"] = 2**sdict["gat_heads"]
                sdict["embedding"] = 2  # 2**sdict["embedding"]
                print(sdict)

                # NOTE(review): `neighbors` aliases graph.nx_graph, so
                # remove_nodes_from below mutates the graph in place —
                # after the first trial the graph is already restricted
                # to the intersection nodes.  Confirm this is intended.
                neighbors = graph.nx_graph
                intersection_nodes = np.intersect1d(X_train.columns,
                                                    neighbors.nodes)
                x_train = X_train[list(intersection_nodes)].copy()
                x_valid = X_valid[list(intersection_nodes)].copy()

                # Drop graph nodes that have no expression column.
                toremove = set(neighbors.nodes)
                toremove = toremove.difference(intersection_nodes)
                neighbors.remove_nodes_from(toremove)

                adj = sparse.csr_matrix(nx.to_numpy_matrix(neighbors))

                model = models.gnn.GCN(
                    name="GAT",
                    dropout=dropout,
                    gnn="GAT",
                    gat_heads=sdict["gat_heads"],
                    cuda=torch.cuda.is_available(),
                    num_layer=sdict["num_layer"],
                    prepool_extralayers=sdict["prepool_extralayers"],
                    channels=sdict["channels"],
                    # NOTE(review): embedding is set to channels here,
                    # not to sdict["embedding"] — confirm intentional.
                    embedding=sdict["channels"],  #sdict["embedding"], 
                    aggregation=None,
                    lr=sdict["lr"],
                    num_epochs=100,
                    patience=40,
                    verbose=True,
                    seed=args.seed)

                try:
                    model.fit(x_train, y_train, adj)

                    with torch.no_grad():
                        model.eval()
                        y_valid_pred = model.predict(x_valid)
                        valid_metric = sklearn.metrics.accuracy_score(
                            y_valid, np.argmax(y_valid_pred, axis=1))

                        # skopt minimizes, so report negated accuracy.
                        opt_results = optimizer.tell(suggestion, -valid_metric)

                        # #record metrics to write and plot
                        # if best_valid_metric < valid_metric:
                        #     best_valid_metric = valid_metric
                        #     print("best_valid_metric", best_valid_metric, sdict)
                        #     best_config = sdict

                        #     y_test_pred = model.predict(x_test)
                        #     test_metric = sklearn.metrics.accuracy_score(y_test, np.argmax(y_test_pred,axis=1))
                        #     test_for_best_valid_metric = test_metric

                        experiment = {
                            "model": model.name,
                            "graph": graph.graph_name,
                            "num_genes": len(x_train.columns),
                            "train_size": args.ntrain,
                            "seed": args.seed,
                            "acc": valid_metric,
                            'lr': sdict["lr"],
                            'channels': sdict["channels"],
                            'embedding': sdict["embedding"],
                            'num_layer': sdict["num_layer"],
                            'prepool_extralayers': sdict["prepool_extralayers"]
                        }
                        print(i, "This result:", valid_metric, experiment)

                        global results
                        results = record_result(results, experiment, filename)

                # Broad catch so one failed trial doesn't abort the whole
                # search; the traceback is logged instead of re-raised.
                except Exception as e:
                    print(e)
                    logging.error(logging.traceback.format_exc())

                # cleanup: drop model references and release GPU memory
                model.best_model = None
                del model
                torch.cuda.empty_cache()

    print("#Final Results", test_for_best_valid_metric, best_config)
    return test_for_best_valid_metric, best_config
# ---- Beispiel #4 (example separator from the original snippet source) ----
# 0
def doGGC():
    """Run a GP-based scikit-optimize search for a GCN on ontology gene graphs.

    NOTE(review): this redefines ``doGGC`` — if both definitions live in
    the same module, this one shadows the earlier GAT variant.

    For each graph, gene-subset size and dropout setting, runs up to 15
    ask/tell trials over channels, num_layer and prepool_extralayers;
    each trial BFS-samples a neighborhood of the hard-coded gene 'ESR1',
    prunes it to genes present in the expression matrix, trains a
    ``models.gcn.GCN`` and records validation accuracy via
    ``record_result``.

    Relies on module-level state defined elsewhere in this file:
    X_train/X_valid/X_test, y_train/y_valid, features, args, results,
    filename, record_result, OntologyGraph, models, skopt, Integer,
    sparse, nx, logging.

    Returns:
        (test_for_best_valid_metric, best_config).  NOTE(review): the
        best-config/test-set tracking below is commented out, so this
        currently always returns (0, None) — confirm that is intended.
    """
    # if args.graph == "stringdb":
    #     graph = data.gene_graphs.StringDBGraph(datastore="./data")
    # elif args.graph == "genemania":
    #     graph = data.gene_graphs.GeneManiaGraph()
    # elif args.graph == "ontology":
    #     graph = data.gene_graphs.OntologyGraph(neighbors=30, embeddings_name='el')
    # else:
    #     print("unknown graph")
    #     sys.exit(1)

    # Two ontology graphs with different embedding sources are compared.
    gene_graphs = [
        OntologyGraph(neighbors=30, embeddings_name='dl2vec', randomize=False, gene_names=list(features.columns), relabel_genes=False),
        OntologyGraph(neighbors=30, embeddings_name='el', randomize=False, gene_names=list(features.columns), relabel_genes=False)]
        # data.gene_graphs.OntologyGraph(neighbors=n, embeddings_name=emb) 
        # for n in [30] #[500, 100, 30, 10] 
        # for emb in ['el', 'dl2vec', 'opa2vec', 'opa2vec_go']] + [
        #     data.gene_graphs.GeneManiaGraph(), 
        #     data.gene_graphs.StringDBGraph(datastore="./data")]
        # data.gene_graphs.OntologyGraph(neighbors=30, embeddings_name='el'), 
        # data.gene_graphs.OntologyGraph(neighbors=30, embeddings_name='el'), 
    # gene_graphs = [data.gene_graphs.OntologyGraph(neighbors=100, embeddings_name='el')]
        # data.gene_graphs.GeneManiaGraph(), 
        # data.gene_graphs.StringDBGraph(datastore="./data")]

    for graph in gene_graphs:

        # Full-graph adjacency; overwritten per-trial below once the
        # BFS-sampled subgraph has been pruned.
        adj = graph.adj()

        for num_genes in [1000, 16000]:
            for dropout in [True, False]: #, False]:
                import gc
                gc.collect()

                # Search space: "channels" is an exponent, decoded
                # below as 2**channels; lr is fixed at 0.001.
                skopt_args = collections.OrderedDict()
                # skopt_args["lr"]=Integer(3, 4)
                skopt_args["channels"]=Integer(4, 9)
                # skopt_args["embedding"]=Integer(4, 5)
                skopt_args["num_layer"]=Integer(0, 4)
                skopt_args["prepool_extralayers"]=Integer(0, 3)

                optimizer = skopt.Optimizer(dimensions=skopt_args.values(),
                                            base_estimator="GP",
                                            n_initial_points=4,
                                            random_state=args.seed)
                print(skopt_args)



                best_valid_metric = 0
                test_for_best_valid_metric = 0
                best_config = None
                already_done = set()

                for i in range(15):
                    import gc
                    gc.collect()

                    suggestion = optimizer.ask()
                    
                    # Skip configurations the optimizer already proposed.
                    if str(suggestion) in already_done:
                        continue
                    already_done.add(str(suggestion))
                    sdict = dict(zip(skopt_args.keys(),suggestion))
                    sdict["lr"] = 0.001 #10**float((-sdict["lr"]))
                    sdict["channels"] = 2**sdict["channels"]
                    sdict["embedding"] = 2# 2**sdict["embedding"]
                    print(sdict)

                    # Oversample (1.5x) around the hard-coded seed gene,
                    # then keep the first num_genes genes that also exist
                    # in the expression matrix.
                    # NOTE(review): num_genes*1.5 is a float — confirm
                    # bfs_sample_neighbors accepts a non-integer size.
                    gene = 'ESR1'
                    neighbors = graph.bfs_sample_neighbors(gene, num_genes*1.5)
                    intersection_nodes = np.intersect1d(features.columns, neighbors.nodes)

                    x_train = X_train[list(intersection_nodes)[:num_genes]].copy()
                    x_valid = X_valid[list(intersection_nodes)[:num_genes]].copy()
                    x_test = X_test[list(intersection_nodes)[:num_genes]].copy()

                    # Drop graph nodes that have no expression column.
                    toremove = set(neighbors.nodes)
                    toremove = toremove.difference(intersection_nodes)
                    neighbors.remove_nodes_from(toremove)
                    adj = sparse.csr_matrix(nx.to_numpy_matrix(neighbors))

                    model = models.gcn.GCN(name="GCN_noemb" + ("_dropout" if dropout else ""), #_lay3_chan64_emb32_dropout_agg_hierarchy", 
                                        dropout=dropout, 
                                        cuda=torch.cuda.is_available(),
                                        num_layer=sdict["num_layer"],
                                        prepool_extralayers=sdict["prepool_extralayers"],
                                        channels=sdict["channels"], 
                                        embedding=sdict["embedding"], # sdict["embedding"], 
                                        aggregation="hierarchy",
                                        lr=sdict["lr"],
                                        num_epochs=100,
                                        patience=30,
                                        verbose=False,
                                        seed=args.seed
                                        )

                    try:
                        model.fit(x_train, y_train, adj)

                        with torch.no_grad():
                            model.eval()
                            y_valid_pred = model.predict(x_valid)
                            valid_metric = sklearn.metrics.accuracy_score(y_valid, np.argmax(y_valid_pred,axis=1))

                            # skopt minimizes, so report negated accuracy.
                            opt_results = optimizer.tell(suggestion, - valid_metric) 

                            # #record metrics to write and plot
                            # if best_valid_metric < valid_metric:
                            #     best_valid_metric = valid_metric
                            #     print("best_valid_metric", best_valid_metric, sdict)
                            #     best_config = sdict

                            #     y_test_pred = model.predict(x_test)
                            #     test_metric = sklearn.metrics.accuracy_score(y_test, np.argmax(y_test_pred,axis=1))
                            #     test_for_best_valid_metric = test_metric


                            experiment = {
                                "model": model.name,
                                "graph": graph.graph_name,
                                "num_genes": num_genes,
                                "train_size": args.ntrain,
                                "seed": args.seed,
                                "acc": valid_metric,
                                'lr': sdict["lr"], 
                                'channels': sdict["channels"], 
                                'embedding': sdict["embedding"], 
                                'num_layer': sdict["num_layer"], 
                                'prepool_extralayers': sdict["prepool_extralayers"]
                            }
                            print(i, num_genes, "This result:",valid_metric, experiment)

                            global results
                            results = record_result(results, experiment, filename)

                    # Broad catch so one failed trial doesn't abort the
                    # whole search; the traceback is logged instead.
                    except Exception as e:
                        print(e)
                        logging.error(logging.traceback.format_exc())


                    # cleanup: drop model references and release GPU memory
                    model.best_model = None  
                    del model
                    torch.cuda.empty_cache()
                    


    print("#Final Results", test_for_best_valid_metric, best_config)
    return test_for_best_valid_metric, best_config