Beispiel #1
0
    def get_expression_of_parent_proteins(self,
                                          non_mapped_dval: float = -1
                                          ) -> pd.DataFrame:
        """Return the expression of the parent proteins in the current tissue.

        Every protein returned by ``get_proteins`` is mapped to gene IDs with
        ``map_from_uniprot_gene`` and each gene ID is looked up in the
        expression profile of ``self._tissue``. A protein that maps to
        several gene IDs is assigned the mean over the IDs found in the
        profile. Progress messages are printed and a tqdm bar is shown.

        NOTE(review): the original documentation states that the mapping step
        needs an internet connection (UniProt mapping API) -- confirm against
        ``map_from_uniprot_gene``.

        :param non_mapped_dval: value reported for a protein when none of its
            gene IDs is found in the expression profile, defaults to -1
        :type non_mapped_dval: float, optional
        :return: a table with the columns 'Proteins' and 'Expression', one
            row per protein
        :rtype: pd.DataFrame
        """
        proteins: List[str] = list(self.get_proteins())
        print(
            f"Mapping Uniprot accession to ENSEMBLE IDs ..., starting at: {time.ctime()}"
        )
        map2Ensemble: pd.DataFrame = map_from_uniprot_gene(proteins)
        # group the mapped IDs by protein once, instead of re-filtering the
        # whole mapping table for every protein
        ens_ids_by_protein = {}
        for prot_id, ens_id in zip(map2Ensemble.iloc[:, 0],
                                   map2Ensemble.iloc[:, 1]):
            ens_ids_by_protein.setdefault(prot_id, []).append(ens_id)
        print(
            f"Computing the expression of parent proteins ..., starting at: {time.ctime()}"
        )
        expression_profile = self._tissue.get_expression_profile()
        # allocate a list to hold the expression values
        expression: List[float] = []
        for prot in tqdm(proteins):
            # Keep only the values that were really found. Storing the
            # placeholder and filtering by equality afterwards would drop
            # genuine values equal to the placeholder and breaks for a NaN
            # placeholder (NaN never compares equal to itself).
            found: List[float] = []
            for ens_id in ens_ids_by_protein.get(prot, []):
                try:
                    found.append(
                        expression_profile.get_gene_id_expression(ens_id))
                except KeyError:
                    continue
            if not found:
                # no mapping at all, or none of the mapped IDs is known
                expression.append(non_mapped_dval)
            elif len(found) == 1:
                expression.append(found[0])
            else:
                expression.append(np.mean(found))
        # construct the dataframe
        results: pd.DataFrame = pd.DataFrame({
            'Proteins': proteins,
            'Expression': expression
        })
        return results
Beispiel #2
0
	def get_main_sub_cellular_location_of_parent_proteins(self, not_mapped_val: str = 'UNK')->pd.DataFrame:
		"""Return the main sub-cellular location(s) of every identified protein.

		Every protein returned by ``get_proteins`` is mapped to gene IDs with
		``map_from_uniprot_gene`` and the main location of each gene ID is read
		from the sub-cellular location database of ``self._tissue``. Locations
		are reported as one ';'-separated string per protein. When a protein
		maps to several gene IDs, the unique locations over all IDs are
		reported in sorted order, so the output is identical between runs.

		NOTE(review): the original documentation states that the mapping step
		needs an internet connection (UniProt mapping API) -- confirm against
		``map_from_uniprot_gene``.

		:param not_mapped_val: The value reported when the location of a protein can not be extracted, defaults to 'UNK'
		:type not_mapped_val: str, optional
		:return: A table with the columns 'Proteins' and 'Main_locations', one row per protein.
		:rtype: pd.DataFrame
		"""
		proteins: List[str] = list(self.get_proteins())
		map2Ensemble: pd.DataFrame = map_from_uniprot_gene(proteins)
		# group the mapped IDs by protein once, instead of re-filtering the
		# whole mapping table for every protein
		ens_ids_by_protein = {}
		for prot_id, ens_id in zip(map2Ensemble.iloc[:,0], map2Ensemble.iloc[:,1]):
			ens_ids_by_protein.setdefault(prot_id, []).append(ens_id)
		locations_db = self._tissue.get_subCellular_locations()
		# allocate a list to hold the main location
		main_locations: List[str] = []
		for prot in proteins:
			ens_ids: List[str] = ens_ids_by_protein.get(prot, [])
			# one joined location string per gene ID known to the database
			resolved: List[str] = []
			for ens_id in ens_ids:
				try:
					resolved.append(';'.join(locations_db.get_main_location(ens_id)))
				except KeyError:
					continue
			if len(ens_ids) == 1:
				# one-to-one mapping: report the database entry as it is
				main_locations.append(resolved[0] if resolved else not_mapped_val)
				continue
			# zero or several mappings: merge the unique locations
			unique_locations: Set[str] = set()
			for joined in resolved:
				# an entry equal to the placeholder is treated as not mapped
				if joined != not_mapped_val:
					unique_locations.update(joined.split(';'))
			if unique_locations:
				# sorted: joining a bare set gives an order that changes
				# between interpreter runs
				main_locations.append(';'.join(sorted(unique_locations)))
			else:
				main_locations.append(not_mapped_val)
		# construct the dataframe
		results: pd.DataFrame = pd.DataFrame({'Proteins':proteins, 'Main_locations':main_locations})
		return results
Beispiel #3
0
def compute_expression_correlation(exp1: Experiment,
                                   exp2: Experiment) -> float:
    """Compute the correlation in gene expression between two experiments.

    The union of the proteins of both experiments is mapped to gene IDs with
    ``map_from_uniprot_gene``. For each protein the expression is read from
    the expression profile of each experiment's tissue (the mean is used when
    a protein maps to several gene IDs). Proteins without an expression value
    in either experiment are discarded and the first element of the
    ``pearsonr`` result is returned for the remaining pairs.

    NOTE(review): with fewer than two remaining pairs ``pearsonr`` is
    expected to fail or return NaN -- check the SciPy version in use.

    :param exp1: The first experimental object
    :type exp1: Experiment
    :param exp2: The second experimental object
    :type exp2: Experiment
    :return: the correlation in gene expression of the proteins inferred in
        the provided pair of experiments
    :rtype: float
    """
    unmapped = -1  # marker for "no expression value available"
    unique_proteins = list(
        set(exp1.get_proteins()).union(exp2.get_proteins()))
    # get the gene ids and group them by protein once
    prot2Ense: pd.DataFrame = map_from_uniprot_gene(unique_proteins)
    gene_ids_by_protein = _group_gene_ids_by_protein(prot2Ense)
    # one value per protein and experiment, in the order of unique_proteins
    profile_exp1 = exp1.get_tissue().get_expression_profile()
    profile_exp2 = exp2.get_tissue().get_expression_profile()
    gene_expression_exp1: List[float] = [
        _protein_expression(profile_exp1,
                            gene_ids_by_protein.get(prot, []), unmapped)
        for prot in unique_proteins
    ]
    gene_expression_exp2: List[float] = [
        _protein_expression(profile_exp2,
                            gene_ids_by_protein.get(prot, []), unmapped)
        for prot in unique_proteins
    ]
    paired_expression: pd.DataFrame = pd.DataFrame({
        'exp1': gene_expression_exp1,
        'exp2': gene_expression_exp2
    })
    # keep only the proteins that have a value in both experiments
    mapped_in_both = ((paired_expression['exp1'] != unmapped) &
                      (paired_expression['exp2'] != unmapped))
    paired_expression = paired_expression.loc[mapped_in_both]
    # compute the correlation
    return pearsonr(paired_expression['exp1'],
                    paired_expression['exp2'])[0]


def _group_gene_ids_by_protein(mapping: pd.DataFrame) -> dict:
    """Group the gene IDs (second column) by protein (first column)."""
    grouped: dict = {}
    for protein, gene_id in zip(mapping.iloc[:, 0], mapping.iloc[:, 1]):
        grouped.setdefault(protein, []).append(gene_id)
    return grouped


def _protein_expression(expression_profile, gene_ids, unmapped=-1):
    """Return the mean expression over gene_ids, or unmapped if none is usable.

    Gene IDs for which ``get_gene_id_expression`` raises ``KeyError`` are
    ignored, and so are values equal to ``unmapped``.
    """
    values = []
    for gene_id in gene_ids:
        try:
            value = expression_profile.get_gene_id_expression(
                gene_id=gene_id)
        except KeyError:
            continue
        if value != unmapped:
            values.append(value)
    if not values:
        return unmapped
    if len(values) == 1:
        return values[0]
    return np.mean(values)
Beispiel #4
0
    def get_go_location_id_parent_proteins(self,
                                           not_mapped_val: str = 'UNK'
                                           ) -> pd.DataFrame:
        """Return the gene ontology (GO) location terms of the identified proteins.

        Every protein returned by ``get_proteins`` is mapped to gene IDs with
        ``map_from_uniprot_gene`` and the GO names of each gene ID are read
        from the sub-cellular location database of ``self._tissue``. Terms
        are reported as one ';'-separated string per protein. When a protein
        maps to several gene IDs, the unique terms over all IDs are reported
        in sorted order, so the output is identical between runs. Progress
        messages are printed and a tqdm bar is shown.

        :param not_mapped_val: The value reported when the GO term of a
            protein can not be extracted, defaults to 'UNK'
        :type not_mapped_val: str, optional
        :return: A table with the columns 'Proteins' and 'GO_Terms', one row
            per protein.
        :rtype: pd.DataFrame
        """
        proteins: List[str] = list(self.get_proteins())
        print(
            f"Mapping Uniprot accession to ENSEMBLE IDs ..., starting at: {time.ctime()}"
        )
        map2Ensemble: pd.DataFrame = map_from_uniprot_gene(proteins)
        # group the mapped IDs by protein once, instead of re-filtering the
        # whole mapping table for every protein
        ens_ids_by_protein = {}
        for prot_id, ens_id in zip(map2Ensemble.iloc[:, 0],
                                   map2Ensemble.iloc[:, 1]):
            ens_ids_by_protein.setdefault(prot_id, []).append(ens_id)
        print(
            f"Getting the GO Subcellular compartment of parent proteins ..., starting at: {time.ctime()}"
        )
        locations_db = self._tissue.get_subCellular_locations()
        # allocate a list to hold the go terms
        go_terms: List[str] = []
        for prot in tqdm(proteins):
            ens_ids: List[str] = ens_ids_by_protein.get(prot, [])
            # one joined term string per gene ID known to the database
            resolved: List[str] = []
            for ens_id in ens_ids:
                try:
                    resolved.append(';'.join(
                        locations_db.get_go_names(ens_id)))
                except KeyError:
                    continue
            if len(ens_ids) == 1:
                # one-to-one mapping: report the database entry as it is
                go_terms.append(resolved[0] if resolved else not_mapped_val)
                continue
            # zero or several mappings: merge the unique terms
            unique_terms: Set[str] = set()
            for joined in resolved:
                # an entry equal to the placeholder is treated as not mapped
                if joined != not_mapped_val:
                    unique_terms.update(joined.split(';'))
            if unique_terms:
                # sorted: joining a bare set gives an order that changes
                # between interpreter runs
                go_terms.append(';'.join(sorted(unique_terms)))
            else:
                go_terms.append(not_mapped_val)
        # construct the dataframe
        results: pd.DataFrame = pd.DataFrame({
            'Proteins': proteins,
            'GO_Terms': go_terms
        })
        return results