class EssentialGeneAnalysis(Experiment):
    """
    Experiment relating learned per-node weights to gene essentiality.

    On construction it loads the disease associations, the PPI network, the
    ``ci_weight`` values averaged over every model stored under
    ``model_path`` and a collection of essential proteins. The methods then
    summarize or plot those weights over caller-supplied node sets.
    """

    def __init__(self, dir, params):
        """
        Constructor

        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment parameters. This constructor reads the
                keys "associations_path", "disease_subset", "ppi_network",
                "model_path" and "essential_genes_path".
        """
        # presumably the base class stores `dir` and `params` on the
        # instance (self.dir / self.params are used below) -- verify.
        super().__init__(dir, params)

        # set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO,
                   console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        # NOTE(review): assumes the iteration order of nx.degree() matches
        # the node indices used to address the weight array -- confirm.
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

        logging.info("Loading weights...")
        # NOTE(review): pickle.load can execute arbitrary code contained in
        # the file; only point "model_path" at trusted artifacts.
        with open(os.path.join(params["model_path"], "models", "models.tar"),
                  "rb") as f:
            split_to_model = pickle.load(f)

        # Average the weights element-wise across all stored models.
        self.ci_weights = np.mean(
            [model['ci_weight'][0, 0].numpy()
             for model in split_to_model.values()],
            axis=0)
        # Degree-normalized variant: w_k / sqrt(d_k).
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading essential genes...")
        self.essential_proteins = load_essential_proteins(
            params["essential_genes_path"])
        self.essential_nodes = self.network.get_nodes(self.essential_proteins)

        # Build the lookup set once so each membership test is O(1) instead
        # of a scan over every essential node.
        essential_lookup = set(self.essential_nodes)
        self.non_essential_nodes = [
            node for node in self.network.get_nodes()
            if node not in essential_lookup
        ]

        # 0/1 indicator over all nodes: 1 where the node is essential.
        self.essential_array = np.zeros(len(self.network))
        self.essential_array[self.essential_nodes] = 1

    def compute_weight_stats(self, nodes=None, norm=True):
        """
        Summarize the weights of a set of nodes.

        Args:
            nodes: index (or index array) into the weight array; when None,
                all ``len(self.network)`` entries are used
            norm (bool) use the degree-normalized weights when True,
                otherwise the raw averaged weights
        Returns:
            dict with the "mean", "median" and "std" of the selected weights
        """
        weights = self.ci_weights_norm if norm else self.ci_weights
        if nodes is None:
            nodes = np.arange(len(self.network))

        # Select once instead of re-indexing for every statistic.
        selected = weights[nodes]
        return {
            "mean": np.mean(selected),
            "median": np.median(selected),
            "std": np.std(selected)
        }

    def compute_frac_essential(self, nodes):
        """
        Return the fraction of the given nodes that are essential.

        Args:
            nodes: index (or index array) into the essential indicator array
        """
        return np.mean(self.essential_array[nodes])

    def plot_weight_dist(self, node_sets):
        """
        Overlay a normalized weight histogram for each named node set.

        Args:
            node_sets (dict) maps a legend label to the node indices whose
                weights are plotted
        """
        # NOTE(review): the raw `ci_weights` are plotted while the x label
        # below describes the degree-normalized quantity -- confirm which
        # one is intended.
        # NOTE(review): sns.distplot is deprecated in recent seaborn
        # releases; check the pinned version before upgrading.
        for name, nodes in node_sets.items():
            sns.distplot(self.ci_weights[nodes],
                         kde=False,
                         hist=True,
                         norm_hist=True,
                         bins=15,
                         hist_kws={"range": (-0.4, 0.8)},
                         label=name)

        plt.xscale('linear')
        plt.yscale('linear')
        plt.legend()
        plt.xlabel(r"$\frac{w_k}{\sqrt{d_k}}$")
        plt.ylabel("# of proteins [normalized]")
class DiseaseSubgraph(Experiment):
    """
    Experiment that extracts a per-disease subgraph of the PPI network.

    For every disease the subgraph is induced by the disease's own nodes,
    the top predicted nodes of each configured method, and the neighbors
    shared between those nodes. The subgraph edges and a table of per-node
    data are written under ``<dir>/diseases/<disease id>/``.
    """

    def __init__(self, dir, params):
        """
        Constructor

        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment parameters. This constructor reads the
                keys "associations_path", "disease_subset", "ppi_network",
                "method_to_preds" and "field_to_protein_data".
        """
        # presumably the base class stores `dir` and `params` on the
        # instance (self.dir / self.params are used below) -- verify.
        super().__init__(dir, params)

        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO,
                   console=True)

        # Log title
        logging.info("Metric Significance of Diseases in the PPI Network")
        logging.info("Sabri Eyuboglu -- SNAP Group")
        logging.info("======================================")

        logging.info("Loading Disease Associations...")
        self.diseases = load_diseases(self.params["associations_path"],
                                      self.params["disease_subset"],
                                      exclude_splits=['none'])

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading Predictions...")
        # One predictions table per method, read from
        # <method dir>/predictions.csv with the first column as index.
        self.method_to_preds = {
            name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                              index_col=0)
            for name, preds in self.params["method_to_preds"].items()
        }

        logging.info("Loading Protein Data...")
        # Each field is configured as {"path": ..., "args": {...}}.
        self.field_to_protein_data = {
            field: load_mapping(path=config["path"], **config["args"])
            for field, config in self.params["field_to_protein_data"].items()
        }

    def compute_disease_subgraph(self, disease):
        """
        Build the subgraph around a disease and label every node's role.

        Roles are assigned first-come-first-served, so an earlier label is
        never overwritten: "disease", then per method "pred_<method>" and
        "common_pred_<method>", and finally "common_disease".

        Args:
            disease: (Disease) A disease object
        Returns:
            (subgraph, node_to_roles) where subgraph is the subgraph of the
            network induced by the labelled nodes and node_to_roles maps
            each of those nodes to its role string
        """
        # Local import keeps this method self-contained.
        from itertools import combinations

        node_to_roles = {}
        disease_nodes = disease.to_node_array(self.network)
        for disease_node in disease_nodes:
            node_to_roles[disease_node] = "disease"

        disease_node_to_nbrs = {
            node: set(self.network.nx.neighbors(node))
            for node in disease_nodes
        }

        num_preds = self.params["num_preds"]
        for method, preds in self.method_to_preds.items():
            # Highest-scoring proteins for this disease under this method.
            ranked = preds.loc[disease.id].sort_values(ascending=False)
            top_pred_proteins = set(map(int, ranked.index[:num_preds]))
            top_pred_nodes = self.network.get_nodes(top_pred_proteins)

            for pred_node in top_pred_nodes:
                if pred_node not in node_to_roles:
                    node_to_roles[pred_node] = f"pred_{method}"

                # Neighbors shared by the predicted node and a disease node.
                pred_nbrs = set(self.network.nx.neighbors(pred_node))
                for disease_node in disease_nodes:
                    common_nbrs = disease_node_to_nbrs[disease_node] & pred_nbrs
                    for common_nbr in common_nbrs:
                        if common_nbr not in node_to_roles:
                            node_to_roles[common_nbr] = f"common_pred_{method}"

        # Neighbors shared by each unordered pair of disease nodes.
        for node_a, node_b in combinations(disease_nodes, 2):
            common_nbrs = (disease_node_to_nbrs[node_a]
                           & disease_node_to_nbrs[node_b])
            for common_nbr in common_nbrs:
                if common_nbr not in node_to_roles:
                    node_to_roles[common_nbr] = "common_disease"

        # get induced subgraph
        subgraph = self.network.nx.subgraph(node_to_roles.keys())

        return subgraph, node_to_roles

    def write_subgraph(self, disease, node_to_roles, subgraph, delimiter='\t'):
        """
        Write the subgraph's edge list to subgraph_<disease id>.txt.

        Each row holds the two endpoint nodes and their roles joined by "-".

        Args:
            disease: (Disease) A disease object
            node_to_roles (dict) maps each subgraph node to its role string
            subgraph: graph whose edges are written
            delimiter (string) column separator
        """
        directory = os.path.join(self.dir, 'diseases', disease.id)
        # exist_ok avoids the check-then-create race between processes.
        os.makedirs(directory, exist_ok=True)

        with open(os.path.join(directory, f"subgraph_{disease.id}.txt"),
                  "w") as f:
            f.write(delimiter.join(["node_1", "node_2", "roles"]) + '\n')
            for node_1, node_2 in subgraph.edges():
                roles = node_to_roles[node_1] + "-" + node_to_roles[node_2]
                f.write(delimiter.join([str(node_1), str(node_2), roles])
                        + '\n')

    def write_protein_data(self, disease, node_to_roles):
        """
        Write one row of data per subgraph node to data_<disease id>.csv.

        Every row carries the node id, protein id, role and degree, plus the
        configured protein-data fields. A field whose name contains "weight"
        is only filled in for nodes whose role contains "common".

        Args:
            disease: (Disease) A disease object
            node_to_roles (dict) maps each subgraph node to its role string
        """
        directory = os.path.join(self.dir, 'diseases', disease.id)
        # exist_ok avoids the check-then-create race between processes.
        os.makedirs(directory, exist_ok=True)

        protein_data = []
        for node, roles in node_to_roles.items():
            protein_id = self.network.get_names([node])[0]
            node_dict = {
                "node_id": node,
                "protein_id": protein_id,
                "role": roles,
                "degree": self.network.nx.degree(node)
            }
            for field, data in self.field_to_protein_data.items():
                # Same condition as not ("weight" in field and
                # "common" not in roles), rewritten with De Morgan.
                if "weight" not in field or "common" in roles:
                    node_dict[field] = data.get(protein_id, "")
            protein_data.append(node_dict)

        df = pd.DataFrame(protein_data)
        df = df.set_index('node_id')
        df.to_csv(os.path.join(directory, f"data_{disease.id}.csv"))

    def process_disease(self, disease):
        """
        Compute the subgraph of one disease and write its output files.

        Args:
            disease (Disease) the current disease
        """
        subgraph, node_to_roles = self.compute_disease_subgraph(disease)

        disease_directory = os.path.join(self.dir, 'diseases', disease.id)
        os.makedirs(disease_directory, exist_ok=True)

        self.write_subgraph(disease, node_to_roles, subgraph)
        self.write_protein_data(disease, node_to_roles)

    def _run(self):
        """
        Run the experiment.

        Processes every disease, in a process pool when the "n_processes"
        parameter is greater than 1 and serially otherwise, and stores the
        collected per-disease results as a DataFrame in ``self.results``.
        """
        logging.info("Running Experiment...")

        self.results = []

        with tqdm(total=len(self.diseases)) as t:
            if self.params["n_processes"] > 1:
                # The context manager shuts the workers down once every
                # result has been consumed, so no processes are leaked.
                with Pool(self.params["n_processes"]) as p:
                    for results in p.imap(process_disease_wrapper,
                                          self.diseases.values()):
                        self.results.append(results)
                        t.update()
            else:
                for disease in self.diseases.values():
                    results = self.process_disease(disease)
                    self.results.append(results)
                    t.update()

        self.results = pd.DataFrame(self.results)