Ejemplo n.º 1
0
class EssentialGeneAnalysis(Experiment):
    """
    Class for running an experiment that relates the learned per-node
    ``ci_weight`` values of trained models to the essential genes of the PPI
    network (summary statistics, fraction essential, weight histograms).
    """
    def __init__(self, dir, params):
        """
        Constructor
        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment parameters; this constructor reads the
                keys "associations_path", "disease_subset", "ppi_network",
                "model_path" and "essential_genes_path"
        """
        super().__init__(dir, params)

        # set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        # NOTE(review): degrees are collected in the iteration order of the
        # networkx graph; this presumably matches the node indexing used by
        # the weights below -- confirm.
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

        logging.info("Loading weights...")
        # NOTE(security): pickle.load can execute arbitrary code while
        # unpickling; "model_path" must only ever point at trusted files.
        with open(os.path.join(params["model_path"], "models", "models.tar"), "rb") as f:
            split_to_model = pickle.load(f)

        # average the weights over all splits (one model per split)
        self.ci_weights = np.mean([model['ci_weight'][0, 0].numpy()
                                   for model in split_to_model.values()], axis=0)
        # degree-normalized weights; a node of degree 0 yields inf/nan here
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading essential genes...")
        self.essential_proteins = load_essential_proteins(params["essential_genes_path"])
        self.essential_nodes = self.network.get_nodes(self.essential_proteins)
        # build the lookup set once so each membership test is O(1) instead
        # of a scan over all essential nodes
        essential_node_set = set(self.essential_nodes)
        self.non_essential_nodes = [node for node in self.network.get_nodes()
                                    if node not in essential_node_set]

        # indicator vector over nodes: 1 for essential nodes, 0 otherwise
        self.essential_array = np.zeros(len(self.network))
        self.essential_array[self.essential_nodes] = 1

    def compute_weight_stats(self, nodes=None, norm=True):
        """
        Compute summary statistics of the weights of a set of nodes.
        Args:
            nodes (array-like of int) indices of the nodes to include; all
                nodes of the network are used when None
            norm (bool) use the degree-normalized weights when True, the raw
                weights otherwise
        Returns:
            (dict) with the keys "mean", "median" and "std"
        """
        weights = self.ci_weights_norm if norm else self.ci_weights
        if nodes is None:
            nodes = np.arange(len(self.network))
        # index once and reuse the selection for all three statistics
        selected = weights[nodes]
        return {
            "mean": np.mean(selected),
            "median": np.median(selected),
            "std": np.std(selected)
        }

    def compute_frac_essential(self, nodes):
        """
        Compute the fraction of the given nodes that are essential.
        Args:
            nodes (array-like of int) indices of the nodes to consider
        Returns:
            (float) mean of the essential indicator over `nodes`
        """
        return np.mean(self.essential_array[nodes])

    def plot_weight_dist(self, node_sets):
        """
        Plot a normalized histogram of the weights of each node set on the
        current matplotlib axes.
        Args:
            node_sets (dict) maps a label (used in the legend) to the indices
                of the nodes in that set
        """
        # NOTE(review): the raw weights (self.ci_weights) are plotted, while
        # the x-axis label below names the degree-normalized weight; one of
        # the two is presumably unintended -- confirm before relying on plots.
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
        for name, nodes in node_sets.items():
            sns.distplot(self.ci_weights[nodes],
                         kde=False, hist=True, norm_hist=True, bins=15,
                         hist_kws={"range": (-0.4, 0.8)}, label=name)

        plt.xscale('linear')
        plt.yscale('linear')
        plt.legend()
        plt.xlabel(r"$\frac{w_k}{\sqrt{d_k}}$")
        plt.ylabel("# of proteins [normalized]")
        
Ejemplo n.º 2
0
class DiseaseSubgraph(Experiment):
    """
    Class for running an experiment that, for each disease, builds the
    subgraph of the PPI network induced by the disease nodes, the top
    predicted nodes of each method and their common neighbors, and writes
    the subgraph edges and per-protein data to the experiment directory.
    """
    def __init__(self, dir, params):
        """
        Constructor
        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment parameters; this constructor reads the
                keys "associations_path", "disease_subset", "ppi_network",
                "method_to_preds" and "field_to_protein_data"
        """
        super().__init__(dir, params)

        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO,
                   console=True)

        # Log title
        logging.info("Metric Significance of Diseases in the PPI Network")
        logging.info("Sabri Eyuboglu  -- SNAP Group")
        logging.info("======================================")
        logging.info("Loading Disease Associations...")
        self.diseases = load_diseases(self.params["associations_path"],
                                      self.params["disease_subset"],
                                      exclude_splits=['none'])

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading Predictions...")
        # method name -> DataFrame read from <preds dir>/predictions.csv
        self.method_to_preds = {
            name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                              index_col=0)
            for name, preds in self.params["method_to_preds"].items()
        }

        logging.info("Loading Protein Data...")
        # field name -> mapping looked up by protein id in write_protein_data
        self.field_to_protein_data = {
            field: load_mapping(path=config["path"], **config["args"])
            for field, config in self.params["field_to_protein_data"].items()
        }

    def _disease_directory(self, disease):
        """
        Return the output directory of a disease, creating it if needed.
        Args:
            disease: (Disease) A disease object
        """
        directory = os.path.join(self.dir, 'diseases', disease.id)
        # exist_ok avoids the check-then-create race that occurs when several
        # worker processes create the shared 'diseases' parent concurrently
        os.makedirs(directory, exist_ok=True)
        return directory

    def compute_disease_subgraph(self, disease):
        """ Get the subgraph of the network around a disease.
        The first role assigned to a node is kept; roles are assigned in the
        order: disease nodes, then per method the predicted nodes and the
        neighbors they share with disease nodes, then neighbors shared by
        pairs of disease nodes.
        Args:
            disease: (Disease) A disease object
        Returns:
            (subgraph, node_to_roles) the induced subgraph and a dict mapping
            each of its nodes to its role string
        """
        from itertools import combinations

        graph = self.network.nx
        num_preds = self.params["num_preds"]

        disease_nodes = disease.to_node_array(self.network)
        node_to_roles = {}
        for disease_node in disease_nodes:
            node_to_roles[disease_node] = "disease"

        disease_node_to_nbrs = {
            node: set(graph.neighbors(node))
            for node in disease_nodes
        }

        for method, preds in self.method_to_preds.items():
            # proteins with the highest prediction scores for this disease
            ranked = preds.loc[disease.id].sort_values(ascending=False)
            top_pred_proteins = set(map(int, ranked.index[:num_preds]))
            top_pred_nodes = self.network.get_nodes(top_pred_proteins)

            for pred_node in top_pred_nodes:
                node_to_roles.setdefault(pred_node, f"pred_{method}")
                pred_nbrs = set(graph.neighbors(pred_node))
                for disease_node in disease_nodes:
                    common_nbrs = disease_node_to_nbrs[disease_node] & pred_nbrs
                    for common_nbr in common_nbrs:
                        node_to_roles.setdefault(common_nbr,
                                                 f"common_pred_{method}")

        # the set of nodes intermediate between pairs of disease nodes;
        # combinations yields every unordered pair of positions exactly once
        for node_a, node_b in combinations(disease_nodes, 2):
            common_nbrs = disease_node_to_nbrs[node_a] & disease_node_to_nbrs[node_b]
            for common_nbr in common_nbrs:
                node_to_roles.setdefault(common_nbr, "common_disease")

        # get induced subgraph
        subgraph = graph.subgraph(node_to_roles.keys())

        return subgraph, node_to_roles

    def write_subgraph(self, disease, node_to_roles, subgraph, delimiter='\t'):
        """
        Write the edges of a disease subgraph to
        diseases/<disease id>/subgraph_<disease id>.txt, one edge per line.
        Args:
            disease: (Disease) A disease object
            node_to_roles: (dict) maps each node of the subgraph to its role
            subgraph: the subgraph whose edges are written
            delimiter: (string) column separator of the output file
        """
        directory = self._disease_directory(disease)

        with open(os.path.join(directory, f"subgraph_{disease.id}.txt"),
                  "w") as f:
            f.write(delimiter.join(["node_1", "node_2", "roles"]) + '\n')
            for node_1, node_2 in subgraph.edges():
                # interaction type: the roles of both endpoints, e.g.
                # "disease-disease"
                roles = node_to_roles[node_1] + "-" + node_to_roles[node_2]
                items = [str(node_1), str(node_2), roles]
                f.write(delimiter.join(items) + '\n')

    def write_protein_data(self, disease, node_to_roles):
        """
        Write one row of data per node of a disease subgraph to
        diseases/<disease id>/data_<disease id>.csv, indexed by node id.
        Args:
            disease: (Disease) A disease object
            node_to_roles: (dict) maps each node of the subgraph to its role
        """
        directory = self._disease_directory(disease)

        protein_data = []
        for node, roles in node_to_roles.items():
            protein_id = self.network.get_names([node])[0]
            node_dict = {
                "node_id": node,
                "protein_id": protein_id,
                "role": roles,
                "degree": self.network.nx.degree(node)
            }

            for field, data in self.field_to_protein_data.items():
                # fields whose name contains "weight" are only written for
                # nodes whose role contains "common"
                if not ("weight" in field and "common" not in roles):
                    node_dict[field] = data.get(protein_id, "")
            protein_data.append(node_dict)

        df = pd.DataFrame(protein_data)
        df = df.set_index('node_id')
        df.to_csv(os.path.join(directory, f"data_{disease.id}.csv"))

    def process_disease(self, disease):
        """
        Computes the subgraph of a disease and writes the subgraph and the
        data of its proteins to the disease's output directory.
        Args:
            disease (Disease) the current disease
        """
        subgraph, node_to_roles = self.compute_disease_subgraph(disease)

        # both writers make sure the disease directory exists
        self.write_subgraph(disease, node_to_roles, subgraph)
        self.write_protein_data(disease, node_to_roles)

    def _run(self):
        """
        Run the experiment.
        """

        logging.info("Running Experiment...")
        self.results = []

        diseases = self.diseases.values()
        with tqdm(total=len(self.diseases)) as t:
            if self.params["n_processes"] > 1:
                # the context manager shuts the worker processes down once
                # all results have been consumed (or on error) instead of
                # leaking them
                with Pool(self.params["n_processes"]) as pool:
                    for results in pool.imap(process_disease_wrapper,
                                             diseases):
                        self.results.append(results)
                        t.update()
            else:
                for disease in diseases:
                    # NOTE: process_disease has no return value, so the
                    # entries collected here are None
                    results = self.process_disease(disease)
                    self.results.append(results)
                    t.update()
        self.results = pd.DataFrame(self.results)