def __init__(self, dir, params):
    """ Constructor
    Args:
        dir (string) directory of the experiment to be run
        params (dict) configuration for the experiment
    """
    super().__init__(dir, params)

    # set the logger
    set_logger(os.path.join(self.dir, 'experiment.log'),
               level=logging.INFO, console=True)

    logging.info("Loading disease associations...")
    self.diseases_dict = load_diseases(self.params["associations_path"],
                                       self.params["disease_subset"],
                                       exclude_splits=['none'])

    logging.info("Loading network...")
    self.network = Network(self.params["ppi_network"])
    # per-node degree vector, indexed by node id
    self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

    logging.info("Loading weights...")
    # NOTE(review): pickle.load is only safe on trusted model files
    with open(os.path.join(params["model_path"], "models", "models.tar"),
              "rb") as f:
        split_to_model = pickle.load(f)
    # average the learned "ci" weight vector across the per-split models
    # (fix: dropped the redundant `ci_weights =` chained local alias)
    self.ci_weights = np.mean([model['ci_weight'][0, 0].numpy()
                               for model in split_to_model.values()], axis=0)
    # degree-normalized weight w_u / sqrt(d_u)
    self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

    logging.info("Loading drugs...")
    self.drug_to_targets = load_drug_targets(params["drug_targets_path"])
def __init__(self, dir, params):
    """ Constructor
    Args:
        dir (string) directory of the experiment to be run
    """
    super().__init__(dir, params)
    # Set the logger
    set_logger(os.path.join(self.dir, 'experiment.log'),
               level=logging.INFO, console=True)
    # Log title
    logging.info("Disease Protein Prediction")
    logging.info("Sabri Eyuboglu -- SNAP Group")
    logging.info("======================================")

    logging.info("Loading Disease Associations...")
    self.diseases_dict = load_diseases(self.params["associations_path"],
                                       self.params["disease_subset"],
                                       exclude_splits=['none'])
    logging.info("Loading Network...")
    self.network = Network(self.params["ppi_network"])

    # the prediction method class is resolved by name from module globals;
    # its constructor receives the network, the associations, and its params
    # (which are given access to the experiment directory)
    self.params["method_params"]["dir"] = dir
    self.method = globals()[self.params["method_class"]](self.network,
                                                         self.diseases_dict,
                                                         self.params["method_params"])
def __init__(self, dir, params):
    """ Set up logging, load disease associations, the PPI network, the
    averaged model weights, and the GO enrichment study used downstream.
    Args:
        dir (string) directory of the experiment to be run
        params (dict) configuration for the experiment
    """
    super().__init__(dir, params)
    set_logger(os.path.join(self.dir, 'experiment.log'),
               level=logging.INFO, console=True)

    logging.info("Loading disease associations...")
    self.diseases_dict = load_diseases(self.params["associations_path"],
                                       self.params["disease_subset"],
                                       exclude_splits=['none'])

    logging.info("Loading network...")
    self.network = Network(self.params["ppi_network"])
    # per-node degree vector, indexed by node id
    self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

    logging.info("Loading weights...")
    # NOTE(review): pickle.load is only safe on trusted model files
    with open(os.path.join(params["model_path"], "models", "models.tar"),
              "rb") as f:
        split_to_model = pickle.load(f)
    # average the learned "ci" weight vector across the per-split models
    # (fix: dropped the redundant `ci_weights =` chained local alias)
    self.ci_weights = np.mean([model['ci_weight'][0, 0].numpy()
                               for model in split_to_model.values()], axis=0)
    # degree-normalized weight w_u / sqrt(d_u)
    self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

    logging.info("Loading enrichment study...")
    # GO data paths are now configurable; defaults preserve prior behavior
    geneid2go = read_ncbi_gene2go(params.get("gene_to_go_path",
                                             "data/go/gene2go.txt"),
                                  taxids=[9606])  # 9606 = human
    obodag = GODag(params.get("go_path", "data/go/go-basic.obo"))
    self.go_study = GOEnrichmentStudy(self.network.get_names(),
                                      geneid2go,
                                      obodag,
                                      propagate_counts=True,
                                      alpha=0.05,
                                      methods=['fdr_bh'])
def __init__(self, dir, params):
    """ Initialize the disease protein prediction experiment
    Args:
        dir (string) The directory where the experiment should be run
    """
    super().__init__(dir, params)
    # set the logger
    set_logger(os.path.join(dir, 'experiment.log'),
               level=logging.INFO, console=True)
    # log Title
    logging.info("Node set expansion evaluation")
    logging.info(
        "Sabri Eyuboglu, Marinka Zitnik and Jure Leskovec -- SNAP Group")
    logging.info("======================================")

    # load data from params file
    logging.info("Loading Network...")
    # optional "remove_nodes"/"remove_edges" params support network ablations;
    # both default to 0 (no removal)
    self.network = Network(self.params["ppi_network"],
                           remove_nodes=self.params.get("remove_nodes", 0),
                           remove_edges=self.params.get("remove_edges", 0))
    logging.info("Loading Associations...")
    self.diseases_dict = load_diseases(self.params["associations_path"],
                                       self.params["disease_subset"],
                                       exclude_splits=['none'])

    # load method
    # the method class is looked up by name in module globals; its params
    # are given access to the experiment directory
    self.params["method_params"]["dir"] = dir
    self.method = globals()[self.params["method_class"]](
        self.network, self.diseases_dict, self.params["method_params"])
def _run(self):
    """ Run the experiment: load the network, PPI matrices and degree
    buckets, then process every disease (in parallel when configured) and
    collect the per-disease results into a DataFrame. """
    logging.info("Loading Network...")
    self.network = Network(self.params["ppi_network"])
    logging.info("Loading PPI Matrices...")
    self.ppi_matrices = load_network_matrices(self.params["ppi_matrices"],
                                              self.network)
    logging.info("Building Degree Buckets...")
    self.degree_to_bucket = build_degree_buckets(
        self.network, min_len=self.params["min_bucket_len"])

    logging.info("Running Experiment...")
    self.results = []
    if self.params["n_processes"] > 1:
        # fix: the pool was never closed/joined, leaking worker processes;
        # the context manager reclaims them deterministically
        with Pool(self.params["n_processes"]) as p, \
                tqdm(total=len(self.diseases)) as t:
            for results in p.imap(process_disease_wrapper,
                                  self.diseases.values()):
                self.results.append(results)
                t.update()
    else:
        with tqdm(total=len(self.diseases)) as t:
            for disease in self.diseases.values():
                results = self.process_disease(disease)
                self.results.append(results)
                t.update()
    self.results = pd.DataFrame(self.results)
def __init__(self, dir, params):
    """ Constructor
    Args:
        dir (string) directory of the experiment to be run
    """
    super().__init__(dir, params)
    # Set the logger
    set_logger(os.path.join(self.dir, 'experiment.log'),
               level=logging.INFO, console=True)
    # Log title
    logging.info("Disease Protein Prediction")
    logging.info("Sabri Eyuboglu -- SNAP Group")
    logging.info("======================================")

    logging.info("Loading Disease Associations...")
    self.diseases_dict = load_diseases(self.params["associations_path"],
                                       self.params["disease_subset"],
                                       exclude_splits=['none'])
    logging.info("Loading Network...")
    self.network = Network(self.params["ppi_network"])

    logging.info("Loading enrichment study...")
    # GO DAG plus gene -> GO annotations (taxid 9606 = human)
    obodag = GODag(self.params["go_path"])
    geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"],
                                  taxids=[9606])
    self.enrichment_study = GOEnrichmentStudy(self.network.get_names(),
                                              geneid2go,
                                              obodag,
                                              log=None,
                                              **self.params["enrichment_params"])

    logging.info("Loading predictions...")
    # method name -> DataFrame of per-disease prediction scores
    self.method_to_preds = {name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                                              index_col=0)
                            for name, preds in self.params["method_to_preds"].items()}

    # resume from cached enrichment outputs when a previous run left them
    outputs_path = os.path.join(self.dir, "outputs.pkl")
    if os.path.exists(outputs_path):
        logging.info("Loading outputs...")
        with open(outputs_path, 'rb') as f:
            self.outputs = pickle.load(f)
    else:
        self.outputs = {}
def __init__(self, dir, params):
    """ Load the PPI network and record the matrix-construction options
    from the params dict.
    Args:
        dir (string) directory of the experiment to be run
        params (dict) configuration for the experiment
    """
    super().__init__(dir, params)

    logging.info("Network Matrix Builder")
    logging.info("Sabri Eyuboglu -- SNAP Group")
    logging.info("======================================")

    logging.info("Loading PPI Network...")
    self.network = Network(params["ppi_network"])

    # copy each matrix-construction option onto the instance
    for option in ("deg_fn", "col_norm", "row_norm", "self_loops"):
        setattr(self, option, params[option])
def __init__(self, dir, params):
    """ Constructor
    Args:
        dir (string) directory of the experiment to be run
    """
    super().__init__(dir, params)
    # Set the logger
    set_logger(os.path.join(self.dir, 'experiment.log'),
               level=logging.INFO, console=True)
    # Log title
    logging.info("Metric Significance of Diseases in the PPI Network")
    logging.info("Sabri Eyuboglu -- SNAP Group")
    logging.info("======================================")

    logging.info("Loading Disease Associations...")
    self.diseases = load_diseases(self.params["associations_path"],
                                  self.params["disease_subset"],
                                  exclude_splits=['none'])
    logging.info("Loading Network...")
    self.network = Network(self.params["ppi_network"])

    logging.info("Loading Predictions...")
    # method name -> DataFrame of per-disease prediction scores
    self.method_to_preds = {
        name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                          index_col=0)
        for name, preds in self.params["method_to_preds"].items()
    }

    logging.info("Loading Protein Data...")
    # extra per-protein annotation columns, each loaded from its own file
    self.field_to_protein_data = {
        field: load_mapping(path=config["path"], **config["args"])
        for field, config in self.params["field_to_protein_data"].items()
    }
def _run(self):
    """ Test whether the highest-MI protein pairs share disease associations
    more than directly-adjacent pairs do, via a paired t-test on the Jaccard
    similarity of the pairs' association profiles. Writes results.json. """
    logging.info("Loading network...")
    network = Network(self.params["ppi_network"])

    logging.info("Loading molecule associations...")
    # merge association dicts from every configured source
    associations = {}
    for association_path in self.params["association_paths"]:
        associations.update(load_diseases(association_path))
    association_matrix, _ = build_disease_matrix(associations, network)
    # protein x protein Jaccard similarity of association profiles
    association_jaccard = compute_jaccard(association_matrix.T)

    # fix: dropped the redundant chained assignment (mi_matrix = mi_matrix = ...)
    mi_matrix = load_network_matrices(
        {"mi": self.params["mi_dir"]}, network=network)["mi"]

    # strict upper triangle -> one value per unordered pair
    mi_values = mi_matrix[np.triu_indices(mi_matrix.shape[0], k=1)]
    adj_values = network.adj_matrix[np.triu_indices(
        network.adj_matrix.shape[0], k=1)]
    jaccard_values = association_jaccard[np.triu_indices(
        association_jaccard.shape[0], k=1)]

    # compare the k highest-MI pairs against the k adjacent pairs, where k
    # is the number of edges; selections hoisted so each is computed once
    k = adj_values.sum().astype(int)
    top_mi_jaccard = jaccard_values[np.argpartition(mi_values, -k)[-k:]]
    top_adj_jaccard = jaccard_values[np.argpartition(adj_values, -k)[-k:]]
    statistic, pvalue = ttest_rel(top_mi_jaccard, top_adj_jaccard)

    metrics = {
        "test": "ttest_rel",
        "statistic": statistic,
        "pvalue": pvalue,
        "mi_mean": top_mi_jaccard.mean(),
        "adj_mean": top_adj_jaccard.mean(),
    }
    with open(os.path.join(self.dir, "results.json"), "w") as f:
        json.dump(metrics, f, indent=4)
def _run(self):
    """ Run the experiment: load the network, PPI matrices and degree
    buckets, process every disease, and collect per-(disease, protein)
    results into a MultiIndex DataFrame. """
    logging.info("Loading Network...")
    self.network = Network(self.params["ppi_network"])
    logging.info("Loading PPI Matrices...")
    self.ppi_matrices = load_network_matrices(
        self.params["ppi_matrices"], self.network
    )
    logging.info("Building Degree Buckets...")
    self.degree_to_bucket = build_degree_buckets(
        self.network, min_len=self.params["min_bucket_len"]
    )

    logging.info("Running Experiment...")
    self.results = []
    self.indices = []
    if self.params["n_processes"] > 1:
        # fix: the pool was never closed/joined, leaking worker processes;
        # the context manager reclaims them deterministically
        with Pool(self.params["n_processes"]) as p, \
                tqdm(total=len(self.diseases)) as t:
            for indices, results in p.imap(
                process_disease_wrapper, self.diseases.values()
            ):
                # diseases that could not be processed return None indices
                if indices is None:
                    continue
                self.indices.extend(indices)
                self.results.extend(results)
                t.update()
    else:
        with tqdm(total=len(self.diseases)) as t:
            for disease in self.diseases.values():
                indices, results = self.process_disease(disease)
                if indices is None:
                    continue
                self.indices.extend(indices)
                self.results.extend(results)
                t.update()
    index = pd.MultiIndex.from_tuples(self.indices,
                                      names=["disease", "protein"])
    self.results = pd.DataFrame(self.results, index=index)
def __init__(self, dir, params):
    """ Configure logging and load the disease associations and the PPI
    network (with its per-node degree vector).
    Args:
        dir (string) directory of the experiment to be run
    """
    super().__init__(dir, params)

    # route experiment logs to a file inside the experiment directory
    log_path = os.path.join(self.dir, 'experiment.log')
    set_logger(log_path, level=logging.INFO, console=True)

    logging.info("Loading disease associations...")
    self.diseases_dict = load_diseases(self.params["associations_path"],
                                       self.params["disease_subset"],
                                       exclude_splits=['none'])

    logging.info("Loading network...")
    self.network = Network(self.params["ppi_network"])
    # per-node degree vector, indexed by node id
    self.degrees = np.array([deg for _, deg in self.network.nx.degree()])
class DrugTarget(Experiment):
    """ Experiment comparing learned, degree-normalized LCI weights against
    the drug-target status of proteins in the PPI network. """

    def __init__(self, dir, params):
        """ Constructor
        Args:
            dir (string) directory of the experiment to be run
        """
        super().__init__(dir, params)

        # set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        # per-node degree vector, indexed by node id
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

        logging.info("Loading weights...")
        # NOTE(review): pickle.load is only safe on trusted model files
        with open(os.path.join(params["model_path"], "models", "models.tar"),
                  "rb") as f:
            split_to_model = pickle.load(f)
        # average the learned "ci" weight vector across the per-split models
        # (fix: dropped the redundant `ci_weights =` chained local alias)
        self.ci_weights = np.mean([model['ci_weight'][0, 0].numpy()
                                   for model in split_to_model.values()],
                                  axis=0)
        # degree-normalized weight w_u / sqrt(d_u)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading drugs...")
        self.drug_to_targets = load_drug_targets(params["drug_targets_path"])

    def compute_drug_counts(self):
        """ Build np.array self.drug_counts where drug_counts[u] gives the
        count of drugs that target node u. """
        protein_to_drug_count = Counter()
        for drug, targets in self.drug_to_targets.items():
            for target in targets:
                protein_to_drug_count[target] += 1
        # keep only proteins present in the network, keyed by node id
        node_to_drug_count = {self.network.get_node(protein): count
                              for protein, count in protein_to_drug_count.items()
                              if protein in self.network}
        num_drugs = np.zeros(len(self.network))
        num_drugs[list(node_to_drug_count)] = list(node_to_drug_count.values())
        # fix: num_drugs is already an ndarray; dropped the redundant np.array()
        self.drug_counts = num_drugs

    def compute_weight_stats(self, proteins=None):
        """ Mean/median/std of the degree-normalized weights restricted to
        `proteins` (all nodes when None). """
        if proteins is None:
            proteins = np.arange(len(self.network))
        weights = self.ci_weights_norm[proteins]
        return {
            "mean": np.mean(weights),
            "median": np.median(weights),
            "std": np.std(weights)
        }

    def compute_frac_targets(self, proteins=None):
        """ Fraction of `proteins` (all nodes when None) targeted by at least
        one drug. """
        if proteins is None:
            proteins = np.arange(len(self.network))
        return np.mean((self.drug_counts > 0)[proteins])

    def frac_targets_ks_test(self, proteins_a, proteins_b):
        """ Two-sample KS test between the is-drug-target indicators of two
        protein sets. """
        targets_a = (self.drug_counts > 0)[proteins_a]
        targets_b = (self.drug_counts > 0)[proteins_b]
        return ks_2samp(targets_a, targets_b)

    def _run(self):
        """ Run the experiment and write results.json. """
        results = {"norm_weight": {}, "frac_targets": {}}
        self.compute_drug_counts()
        # fix: use flatnonzero for explicit index arrays (np.where with a
        # single condition returns a 1-tuple; indexing result is unchanged)
        target_proteins = np.flatnonzero(self.drug_counts != 0)
        not_target_proteins = np.flatnonzero(self.drug_counts == 0)
        results["norm_weight"]["all"] = self.compute_weight_stats()
        results["norm_weight"]["target"] = \
            self.compute_weight_stats(target_proteins)
        results["norm_weight"]["not_target"] = \
            self.compute_weight_stats(not_target_proteins)

        top_proteins = np.argsort(self.ci_weights_norm)[-self.params["top_k"]:]
        # NOTE(review): this selects every node *except* the top k, not the
        # bottom k ([:top_k]) — confirm "bottom" means the complement of "top"
        bottom_proteins = np.argsort(self.ci_weights_norm)[:-self.params["top_k"]]
        results["frac_targets"]["top"] = self.compute_frac_targets(top_proteins)
        results["frac_targets"]["bottom"] = \
            self.compute_frac_targets(bottom_proteins)
        results["frac_targets"]["pvalue"] = self.frac_targets_ks_test(
            top_proteins, bottom_proteins).pvalue

        with open(os.path.join(self.dir, "results.json"), 'w') as f:
            json.dump(results, f, indent=4)

    def plot_drug_weight_dist(self, protein_sets, save="weight_dist.pdf"):
        """ Overlay histograms of degree-normalized weights for each named
        protein set, on a log-density axis. """
        # clip extreme weights so all sets share one histogram range
        weights = np.minimum(1.0, self.ci_weights_norm)
        prepare_sns(sns, kwargs={"font_scale": 1.4,
                                 "rc": {'figure.figsize': (6, 4)}})
        for name, proteins in protein_sets.items():
            sns.distplot(weights[proteins], kde=False, hist=True,
                         norm_hist=True, bins=25,
                         hist_kws={"range": (-0.25, 1.1), "alpha": 0.8},
                         label=name)
        sns.despine()
        plt.xscale('linear')
        plt.yscale('log')
        plt.legend()
        plt.xlabel(r"Degree-normalized LCI weight, $\frac{w_z}{\sqrt{d_z}}$")
        plt.ylabel("Density")
        plt.tight_layout()
        if save is not None:
            plt.savefig(os.path.join(self.dir, "_figures", save))

    def plot_frac_drug_weight(self, protein_set, save="frac_weight.pdf"):
        """ Bar plot of, per weight bin, the fraction of all proteins in that
        bin that belong to `protein_set`.
        NOTE(review): `save` is currently unused — the figure is never written
        to disk here; confirm whether a savefig call is missing. """
        num_bins = 22
        weights = np.minimum(1.1, self.ci_weights_norm)
        drug_hist, drug_bins = np.histogram(weights[protein_set],
                                            range=(-0.3, 1.5), bins=num_bins)
        all_hist, all_bins = np.histogram(weights, range=(-0.3, 1.5),
                                          bins=num_bins)
        assert np.all(drug_bins == all_bins)
        # NOTE(review): a bin containing no proteins at all yields 0/0 -> nan
        frac_hist = drug_hist / all_hist
        plt.bar(x=drug_bins[:-1], height=frac_hist,
                width=(drug_bins[1] - drug_bins[0]))
class EssentialGeneAnalysis(Experiment):
    """ Experiment comparing learned LCI weights between essential and
    non-essential genes in the PPI network. """

    def __init__(self, dir, params):
        """ Constructor
        Args:
            dir (string) directory of the experiment to be run
        """
        super().__init__(dir, params)

        # set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        # per-node degree vector, indexed by node id
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

        logging.info("Loading weights...")
        # NOTE(review): pickle.load is only safe on trusted model files
        with open(os.path.join(params["model_path"], "models", "models.tar"),
                  "rb") as f:
            split_to_model = pickle.load(f)
        # average the learned "ci" weight vector across the per-split models
        # (fix: dropped the redundant `ci_weights =` chained local alias)
        self.ci_weights = np.mean([model['ci_weight'][0, 0].numpy()
                                   for model in split_to_model.values()],
                                  axis=0)
        # degree-normalized weight w_u / sqrt(d_u)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading essential genes...")
        self.essential_proteins = load_essential_proteins(
            params["essential_genes_path"])
        self.essential_nodes = self.network.get_nodes(self.essential_proteins)
        # fix: membership test against a set is O(1); the original scanned the
        # essential-node list for every network node (O(n^2) overall)
        essential_set = set(self.essential_nodes)
        self.non_essential_nodes = [node for node in self.network.get_nodes()
                                    if node not in essential_set]
        # indicator vector: essential_array[u] == 1 iff node u is essential
        self.essential_array = np.zeros(len(self.network))
        self.essential_array[self.essential_nodes] = 1

    def compute_weight_stats(self, nodes=None, norm=True):
        """ Mean/median/std of the (optionally degree-normalized) weights
        restricted to `nodes` (all nodes when None). """
        weights = self.ci_weights_norm if norm else self.ci_weights
        if nodes is None:
            nodes = np.arange(len(self.network))
        return {
            "mean": np.mean(weights[nodes]),
            "median": np.median(weights[nodes]),
            "std": np.std(weights[nodes])
        }

    def compute_frac_essential(self, nodes):
        """ Fraction of `nodes` that are essential genes. """
        return np.mean(self.essential_array[nodes])

    def plot_weight_dist(self, node_sets):
        """ Overlay normalized weight histograms for each named node set. """
        for name, nodes in node_sets.items():
            sns.distplot(self.ci_weights[nodes], kde=False, hist=True,
                         norm_hist=True, bins=15,
                         hist_kws={"range": (-0.4, 0.8)}, label=name)
        plt.xscale('linear')
        plt.yscale('linear')
        plt.legend()
        plt.xlabel(r"$\frac{w_k}{\sqrt{d_k}}$")
        plt.ylabel("# of proteins [normalized]")
class FunctionalEnrichmentAnalysis(Experiment):
    """ GO-term enrichment analysis of the proteins with the largest
    degree-normalized LCI weights. """

    def __init__(self, dir, params):
        """ Constructor
        Args:
            dir (string) directory of the experiment to be run
        """
        super().__init__(dir, params)
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        # per-node degree vector, indexed by node id
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

        logging.info("Loading weights...")
        # NOTE(review): pickle.load is only safe on trusted model files
        with open(os.path.join(params["model_path"], "models", "models.tar"),
                  "rb") as f:
            split_to_model = pickle.load(f)
        # average the learned "ci" weight vector across the per-split models
        # (fix: dropped the redundant `ci_weights =` chained local alias)
        self.ci_weights = np.mean([model['ci_weight'][0, 0].numpy()
                                   for model in split_to_model.values()],
                                  axis=0)
        # degree-normalized weight w_u / sqrt(d_u)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading enrichment study...")
        # GO data paths are now configurable; defaults preserve prior behavior
        geneid2go = read_ncbi_gene2go(params.get("gene_to_go_path",
                                                 "data/go/gene2go.txt"),
                                      taxids=[9606])  # 9606 = human
        obodag = GODag(params.get("go_path", "data/go/go-basic.obo"))
        self.go_study = GOEnrichmentStudy(self.network.get_names(),
                                          geneid2go,
                                          obodag,
                                          propagate_counts=True,
                                          alpha=0.05,
                                          methods=['fdr_bh'])

    def run_study(self):
        """ Run GO enrichment on the top-k proteins by normalized weight;
        stores the raw goatools records in self.raw_results. """
        top_nodes = np.argsort(self.ci_weights_norm)[-self.params["top_k"]:]
        top_proteins = self.network.get_names(top_nodes)
        self.raw_results = self.go_study.run_study(set(top_proteins))

    def to_csv(self):
        """ Write one row per GO term (name, FDR-corrected p-value, GO id),
        sorted by p-value, to all_terms.csv. """
        self.results = [{"name": r.name,
                         "pvalue": r.p_fdr_bh,
                         "goterm_id": r.goterm.id}
                        for r in self.raw_results]
        self.results = sorted(self.results, key=lambda x: x["pvalue"])
        results_df = pd.DataFrame(self.results)
        results_df.to_csv(os.path.join(self.dir, "all_terms.csv"))
class DiseaseSubgraph(Experiment):
    """ Class for running experiment that extracts, for each disease, the
    subgraph induced by its proteins, each method's top predictions, and
    their common neighbors (per Guney et al.'s subgraph construction), then
    writes the edge list and per-protein metadata to disk. """

    def __init__(self, dir, params):
        """ Constructor
        Args:
            dir (string) directory of the experiment to be run
        """
        super().__init__(dir, params)
        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)
        # Log title
        logging.info("Metric Significance of Diseases in the PPI Network")
        logging.info("Sabri Eyuboglu -- SNAP Group")
        logging.info("======================================")

        logging.info("Loading Disease Associations...")
        self.diseases = load_diseases(self.params["associations_path"],
                                      self.params["disease_subset"],
                                      exclude_splits=['none'])
        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading Predictions...")
        # method name -> DataFrame of per-disease prediction scores
        self.method_to_preds = {
            name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                              index_col=0)
            for name, preds in self.params["method_to_preds"].items()
        }

        logging.info("Loading Protein Data...")
        # extra per-protein annotation columns, each loaded from its own file
        self.field_to_protein_data = {
            field: load_mapping(path=config["path"], **config["args"])
            for field, config in self.params["field_to_protein_data"].items()
        }

    def compute_disease_subgraph(self, disease):
        """ Get the disease subgraph: disease nodes, each method's top
        predictions, and nodes that are common neighbors of a
        (disease, prediction) or (disease, disease) pair.
        Args:
            disease: (Disease) A disease object
        Returns:
            (subgraph, node_to_roles) — the induced networkx subgraph and a
            map tagging each node with how it entered the subgraph
        """
        node_to_roles = {}
        disease_nodes = disease.to_node_array(self.network)
        for disease_node in disease_nodes:
            node_to_roles[disease_node] = "disease"

        disease_node_to_nbrs = {
            node: set(self.network.nx.neighbors(node))
            for node in disease_nodes
        }

        for method, preds in self.method_to_preds.items():
            top_pred_proteins = set(
                map(
                    int,
                    preds.loc[disease.id].sort_values(
                        ascending=False).index[:self.params["num_preds"]]))
            top_pred_nodes = self.network.get_nodes(top_pred_proteins)
            for pred_node in top_pred_nodes:
                # first-assigned role wins; disease nodes keep "disease"
                if pred_node not in node_to_roles:
                    node_to_roles[pred_node] = f"pred_{method}"
                pred_nbrs = set(self.network.nx.neighbors(pred_node))
                for disease_node in disease_nodes:
                    disease_nbrs = disease_node_to_nbrs[disease_node]
                    common_nbrs = disease_nbrs & pred_nbrs
                    for common_nbr in common_nbrs:
                        if common_nbr not in node_to_roles:
                            node_to_roles[common_nbr] = f"common_pred_{method}"

        # the set of nodes intermediate between pairs of disease nodes
        for a, node_a in enumerate(disease_nodes):
            for b, node_b in enumerate(disease_nodes):
                # avoid repeat pairs
                if a >= b:
                    continue
                common_nbrs = (disease_node_to_nbrs[node_a] &
                               disease_node_to_nbrs[node_b])
                for common_nbr in common_nbrs:
                    if common_nbr not in node_to_roles:
                        node_to_roles[common_nbr] = "common_disease"

        # get induced subgraph
        subgraph = self.network.nx.subgraph(node_to_roles.keys())
        return subgraph, node_to_roles

    def write_subgraph(self, disease, node_to_roles, subgraph, delimiter='\t'):
        """ Write the subgraph edge list (with per-edge role pair) to
        diseases/<id>/subgraph_<id>.txt. """
        directory = os.path.join(self.dir, 'diseases', disease.id)
        # fix: exist_ok avoids the check-then-create race of exists()+makedirs()
        os.makedirs(directory, exist_ok=True)
        with open(os.path.join(directory,
                               f"subgraph_{disease.id}.txt"), "w") as f:
            f.write(delimiter.join(["node_1", "node_2", "roles"]) + '\n')
            for edge in subgraph.edges():
                items = [str(edge[0]), str(edge[1])]
                # add interaction type
                roles = node_to_roles[edge[0]] + "-" + node_to_roles[edge[1]]
                items.append(roles)
                f.write(delimiter.join(items) + '\n')

    def write_protein_data(self, disease, node_to_roles):
        """ Write per-node metadata (protein id, role, degree, configured
        extra fields) to diseases/<id>/data_<id>.csv. """
        directory = os.path.join(self.dir, 'diseases', disease.id)
        os.makedirs(directory, exist_ok=True)
        protein_data = []
        for node, roles in node_to_roles.items():
            protein_id = self.network.get_names([node])[0]
            node_dict = {
                "node_id": node,
                "protein_id": protein_id,
                "role": roles,
                "degree": self.network.nx.degree(node)
            }
            for field, data in self.field_to_protein_data.items():
                # "weight" fields are only attached to "common" nodes
                if not ("weight" in field and "common" not in roles):
                    node_dict[field] = data.get(protein_id, "")
            protein_data.append(node_dict)
        df = pd.DataFrame(protein_data)
        df = df.set_index('node_id')
        df.to_csv(os.path.join(directory, f"data_{disease.id}.csv"))

    def process_disease(self, disease):
        """ Compute and write the subgraph and protein data for one disease.
        Args:
            disease (Disease) the current disease
        """
        subgraph, node_to_roles = self.compute_disease_subgraph(disease)
        disease_directory = os.path.join(self.dir, 'diseases', disease.id)
        os.makedirs(disease_directory, exist_ok=True)
        self.write_subgraph(disease, node_to_roles, subgraph)
        self.write_protein_data(disease, node_to_roles)

    def _run(self):
        """ Run the experiment over all diseases. """
        logging.info("Running Experiment...")
        self.results = []
        if self.params["n_processes"] > 1:
            # fix: the pool was never closed/joined, leaking worker processes
            with Pool(self.params["n_processes"]) as p, \
                    tqdm(total=len(self.diseases)) as t:
                for results in p.imap(process_disease_wrapper,
                                      self.diseases.values()):
                    self.results.append(results)
                    t.update()
        else:
            with tqdm(total=len(self.diseases)) as t:
                for disease in self.diseases.values():
                    # NOTE(review): process_disease returns None, so this
                    # DataFrame carries no data — kept for interface parity
                    results = self.process_disease(disease)
                    self.results.append(results)
                    t.update()
        self.results = pd.DataFrame(self.results)
class EvaluateMethod(Experiment):
    """ Class for the disease protein prediction experiment """

    def __init__(self, dir, params):
        """ Initialize the disease protein prediction experiment
        Args:
            dir (string) The directory where the experiment should be run
        """
        super().__init__(dir, params)
        # set the logger
        set_logger(os.path.join(dir, 'experiment.log'),
                   level=logging.INFO, console=True)
        # log Title
        logging.info("Node set expansion evaluation")
        logging.info(
            "Sabri Eyuboglu, Marinka Zitnik and Jure Leskovec -- SNAP Group")
        logging.info("======================================")

        # load data from params file
        logging.info("Loading Network...")
        # optional params support ablations that drop nodes/edges (default 0)
        self.network = Network(self.params["ppi_network"],
                               remove_nodes=self.params.get("remove_nodes", 0),
                               remove_edges=self.params.get("remove_edges", 0))
        logging.info("Loading Associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        # load method: the class is looked up by name in module globals
        self.params["method_params"]["dir"] = dir
        self.method = globals()[self.params["method_class"]](
            self.network, self.diseases_dict, self.params["method_params"])

    def _run(self):
        """ Run the disease protein prediction experiment over every
        disease, in parallel when configured. """
        logging.info("Running Experiment...")
        disease_to_metrics, disease_to_ranks = {}, {}
        diseases = list(self.diseases_dict.values())
        diseases.sort(key=lambda x: x.split)
        if self.params["n_processes"] > 1:
            # fix: the pool was never closed/joined, leaking worker processes
            with Pool(self.params["n_processes"]) as p, \
                    tqdm(total=len(self.diseases_dict)) as t:
                for disease, metrics, ranks in p.imap(run_dpp_wrapper,
                                                      diseases):
                    # fix: identity comparison with None (was `!= None`);
                    # run_dpp returns metrics and ranks as None together
                    if metrics is not None or ranks is not None:
                        disease_to_ranks[disease] = ranks
                        disease_to_metrics[disease] = metrics
                        t.set_postfix(str="{} Recall-at-100: {:.2f}%".format(
                            disease.id, 100 * metrics["Recall-at-100"]))
                    else:
                        t.set_postfix(str="{} Not Recorded".format(disease.id))
                    t.update()
        else:
            with tqdm(total=len(self.diseases_dict)) as t:
                for disease in diseases:
                    disease, metrics, ranks = self.run_dpp(disease)
                    if metrics is not None or ranks is not None:
                        disease_to_metrics[disease] = metrics
                        disease_to_ranks[disease] = ranks
                        t.set_postfix(str="{} Recall-at-100: {:.2f}%".format(
                            disease.id, 100 * metrics["Recall-at-100"]))
                    else:
                        t.set_postfix(str="{} Not Recorded".format(disease.id))
                    t.update()
        self.results = {
            "metrics": disease_to_metrics,
            "ranks": disease_to_ranks
        }

    def compute_node_scores(self, train_nodes, disease):
        """ Get per-node scores from the configured method.
        Args:
            train_nodes: (np.array) training disease nodes
            disease: (Disease) A disease object
        """
        scores = self.method.compute_scores(train_nodes, disease)
        return scores

    def run_dpp(self, disease):
        """ Perform k-fold cross validation on disease protein prediction
        for one disease.
        Args:
            disease: (Disease) A disease object
        Returns:
            (disease, avg_metrics, protein_to_rank), or
            (disease, None, None) for diseases with fewer than 2 proteins
        """
        disease_nodes = disease.to_node_array(self.network)
        # Ensure that there are at least 2 proteins
        if disease_nodes.size <= 1:
            return disease, None, None
        labels = np.zeros((len(self.network), 1))
        labels[disease_nodes, 0] = 1
        metrics = {}

        # Perform k-fold cross validation; negative or oversized n_folds
        # degrades to leave-one-out
        n_folds = (disease_nodes.size
                   if (self.params["n_folds"] < 0 or
                       self.params["n_folds"] > len(disease_nodes))
                   else self.params["n_folds"])
        kf = KFold(n_splits=n_folds, shuffle=False)
        for train_indices, test_indices in kf.split(disease_nodes):
            train_nodes = disease_nodes[train_indices]
            val_nodes = disease_nodes[test_indices]

            # compute node scores
            scores = self.compute_node_scores(train_nodes, disease)

            # compute the metrics of target node
            compute_metrics(metrics, labels, scores, train_nodes, val_nodes)

        avg_metrics = {
            name: np.mean(values)
            for name, values in metrics.items()
        }
        proteins = self.network.get_names(metrics["Nodes"])
        # fix: the original dict-comprehension shadowed `ranks` with its own
        # loop variable; dict(zip(...)) builds the same mapping clearly
        proteins_to_ranks = dict(zip(proteins, metrics["Ranks"]))
        return disease, avg_metrics, proteins_to_ranks

    def save_results(self):
        """ Write metrics and ranks to the experiment directory. """
        write_metrics(self.dir, self.results["metrics"])
        write_ranks(self.dir, self.results["ranks"])
class GOEnrichment(Experiment):
    """ Experiment comparing GO-term enrichment of each disease's proteins
    against the enrichment of each method's top predicted proteins. """

    def __init__(self, dir, params):
        """ Constructor
        Args:
            dir (string) directory of the experiment to be run
        """
        super().__init__(dir, params)
        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)
        # Log title
        logging.info("Disease Protein Prediction")
        logging.info("Sabri Eyuboglu -- SNAP Group")
        logging.info("======================================")

        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])
        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading enrichment study...")
        # GO DAG plus gene -> GO annotations (taxid 9606 = human)
        obodag = GODag(self.params["go_path"])
        geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"],
                                      taxids=[9606])
        self.enrichment_study = GOEnrichmentStudy(
            self.network.get_names(),
            geneid2go,
            obodag,
            log=None,
            **self.params["enrichment_params"])

        logging.info("Loading predictions...")
        # method name -> DataFrame of per-disease prediction scores
        self.method_to_preds = {
            name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                              index_col=0)
            for name, preds in self.params["method_to_preds"].items()
        }

        # resume from cached enrichment outputs when available
        outputs_path = os.path.join(self.dir, "outputs.pkl")
        if os.path.exists(outputs_path):
            logging.info("Loading outputs...")
            with open(outputs_path, 'rb') as f:
                self.outputs = pickle.load(f)
        else:
            self.outputs = {}

    def run_study(self, proteins):
        """ Run the GO enrichment study on a protein set.
        Returns:
            dict mapping GO term name -> FDR-corrected p-value
        """
        results = self.enrichment_study.run_study(proteins)
        term_to_pval = {r.goterm.name: r.p_fdr_bh for r in results}
        return term_to_pval

    def compute_spearman_correlation(self, a_term_to_pval, b_term_to_pval):
        """ Spearman correlation between two term -> p-value maps, computed
        over the terms of the first map. """
        terms = list(a_term_to_pval.keys())
        sp_corr, sp_pval = spearmanr([a_term_to_pval[term] for term in terms],
                                     [b_term_to_pval[term] for term in terms])
        return sp_corr, sp_pval

    def process_disease(self, disease):
        """ Compare enrichment of one disease's proteins with each method's
        top predictions for it.
        Returns:
            (disease, results_row, output) where output caches the raw
            term -> p-value maps for reuse across runs
        """
        results = {}
        output = {}
        # compute method scores for disease
        disease_proteins = set(self.diseases_dict[disease.id].proteins)
        if disease.id in self.outputs:
            # reuse cached enrichment results from a previous run
            disease_term_to_pval = self.outputs[disease.id]["disease"]
        else:
            disease_term_to_pval = self.run_study(disease_proteins)
        output["disease"] = disease_term_to_pval

        disease_terms = set([term for term, pval
                             in disease_term_to_pval.items()
                             if pval < 0.05])
        top_disease_terms = set(
            [term for term, _ in sorted(disease_term_to_pval.items(),
                                        key=lambda x: x[1])
             [:self.params["top_k"]]])
        results = {"disease_name": disease.name,
                   "disease_num_significant": len(disease_terms),
                   "disease_top_{}".format(self.params['top_k']):
                       top_disease_terms}

        # number of predictions to be made; -1 means "one per known protein"
        num_preds = (len(disease_proteins)
                     if self.params["num_preds"] == -1
                     else self.params["num_preds"])
        for name, preds in self.method_to_preds.items():
            pred_proteins = set(map(int, preds.loc[disease.id]
                                            .sort_values(ascending=False)
                                            .index[:num_preds]))
            if disease.id in self.outputs:
                pred_term_to_pval = self.outputs[disease.id][name]
            else:
                pred_term_to_pval = self.run_study(pred_proteins)
            output[name] = pred_term_to_pval

            pred_terms = set([term for term, pval
                              in pred_term_to_pval.items()
                              if pval < 0.05])
            top_pred_terms = set(
                [term for term, _ in sorted(pred_term_to_pval.items(),
                                            key=lambda x: x[1])
                 [:self.params["top_k"]]])

            # jaccard similarity between the significant term sets
            jaccard = (len(disease_terms & pred_terms) /
                       len(disease_terms | pred_terms)
                       if len(disease_terms | pred_terms) != 0
                       else 0)
            sp_corr, sp_pval = self.compute_spearman_correlation(
                disease_term_to_pval, pred_term_to_pval)
            results[f"{name}_num_significant"] = len(pred_terms)
            results[f"{name}_top_{self.params['top_k']}"] = top_pred_terms
            results[f"{name}_jaccard_sim"] = jaccard
            results[f"{name}_sp_corr"] = sp_corr
            results[f"{name}_sp_pval"] = sp_pval
        return disease, results, output

    def _run(self):
        """ Run the experiment over every disease. """
        results = []
        indices = []
        outputs = {}
        diseases = list(self.diseases_dict.values())
        diseases.sort(key=lambda x: x.split)
        if self.params["n_processes"] > 1:
            # fix: the pool was never closed/joined, leaking worker processes
            with Pool(self.params["n_processes"]) as p, \
                    tqdm(total=len(diseases)) as t:
                for disease, result, output in p.imap(process_disease_wrapper,
                                                      diseases):
                    results.append(result)
                    indices.append(disease.id)
                    outputs[disease.id] = output
                    t.update()
        else:
            with tqdm(total=len(diseases)) as t:
                for disease in diseases:
                    disease, result, output = self.process_disease(disease)
                    results.append(result)
                    indices.append(disease.id)
                    outputs[disease.id] = output
                    t.update()
        self.outputs = outputs
        self.results = pd.DataFrame(results, index=indices)

    def save_results(self, summary=True):
        """ Saves the results to a csv using a pandas Data Frame """
        print("Saving Results...")
        self.results.to_csv(os.path.join(self.dir, 'results.csv'))
        #if self.params["save_enrichment_results"]:
        #    with open(os.path.join(self.dir, 'outputs.pkl'), 'wb') as f:
        #        pickle.dump(self.outputs, f)

    def load_results(self):
        """ Loads the results from a csv to a pandas Data Frame. """
        print("Loading Results...")
        self.results = pd.read_csv(os.path.join(self.dir, 'results.csv'))
class DPPPredict(Experiment):
    """ Runs a disease-protein-prediction method on every disease and saves
    a table of per-protein prediction scores (one row per disease). """

    def __init__(self, dir, params):
        """ Constructor
        Args:
            dir (string) directory of the experiment to be run
        """
        super().__init__(dir, params)
        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)
        # Log title
        logging.info("Disease Protein Prediction")
        logging.info("Sabri Eyuboglu -- SNAP Group")
        logging.info("======================================")

        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])
        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        # the method class is looked up by name in module globals
        self.params["method_params"]["dir"] = dir
        self.method = globals()[self.params["method_class"]](
            self.network, self.diseases_dict, self.params["method_params"])

    def process_disease(self, disease):
        """ Score every protein in the network for one disease.
        Returns:
            (disease, {protein_name: score}) with known disease proteins
            zeroed out so they do not appear as predictions
        """
        # compute method scores for disease
        disease_nodes = disease.to_node_array(self.network)
        scores = self.method.compute_scores(disease_nodes, disease)
        # zero out scores for disease_nodes
        scores[disease_nodes] = 0
        results = {self.network.get_names([node])[0]: score
                   for node, score in enumerate(scores)}
        return disease, results

    def _run(self):
        """ Run the experiment over every disease. """
        results = []
        indices = []
        diseases = list(self.diseases_dict.values())
        diseases.sort(key=lambda x: x.split)
        if self.params["n_processes"] > 1:
            # fix: the pool was never closed/joined, leaking worker processes
            with Pool(self.params["n_processes"]) as p, \
                    tqdm(total=len(diseases)) as t:
                for disease, result in p.imap(process_disease_wrapper,
                                              diseases):
                    results.append(result)
                    indices.append(disease.id)
                    t.update()
        else:
            with tqdm(total=len(diseases)) as t:
                for disease in diseases:
                    disease, result = self.process_disease(disease)
                    results.append(result)
                    indices.append(disease.id)
                    t.update()
        self.results = pd.DataFrame(results, index=indices)

    def save_results(self, summary=True):
        """ Saves the results to a csv using a pandas Data Frame """
        print("Saving Results...")
        self.results.to_csv(os.path.join(self.dir, 'predictions.csv'))

    def load_results(self):
        """ Loads the results from a csv to a pandas Data Frame. """
        print("Loading Results...")
        self.results = pd.read_csv(os.path.join(self.dir, 'predictions.csv'))