def compute_protein_coverage(experiment1: Experiment, experiment2: Experiment, progress_bar: bool = True) -> Dict[str, Dict[str, np.ndarray]]:
    """Collect the mapped coverage of every protein observed in both experiments.

    Args:
        experiment1 (Experiment): The first experiment, containing the proteins and peptides derived from the first HLA-set.
        experiment2 (Experiment): The second experiment, containing the proteins and peptides derived from the second HLA-set.
        progress_bar (bool): Kept for backward compatibility; the flag does not alter the computation
            or the returned value. Defaults to True.

    Returns:
        Dict[str,Dict[str,np.ndarray]]: A nested dict keyed by protein identifier. Each value is a dict with
        (up to) two entries: the key is the '_'-joined HLA names of an experiment and the value is whatever
        ``get_mapped_protein`` returns for that protein in that experiment. Only proteins present in both
        experiments are included.
    """
    # set() makes the intersection work whether get_proteins() hands back a set or a list.
    present_in_both: Set[str] = set(experiment1.get_proteins()).intersection(experiment2.get_proteins())
    # The labels do not depend on the protein, so build them once instead of once per iteration.
    label_one: str = '_'.join(experiment1.get_hla_set().get_names())
    label_two: str = '_'.join(experiment2.get_hla_set().get_names())
    results: Dict[str, Dict[str, np.ndarray]] = {}
    for protein in present_in_both:
        # If both experiments share the same label, the second entry overwrites the first.
        results[protein] = {
            label_one: experiment1.get_mapped_protein(protein),
            label_two: experiment2.get_mapped_protein(protein),
        }
    return results
def get_binnary_protein_overlap(exp1:Experiment, exp2:Experiment)->Proteins:
    """Return the proteins that the two experiments have in common.

    :param exp1: the first experiment to compare.
    :type exp1: Experiment
    :param exp2: the second experiment to compare.
    :type exp2: Experiment
    :return: a list with the proteins reported by ``get_proteins`` of both experiments.
    :rtype: Proteins
    """
    shared_proteins = exp1.get_proteins().intersection(exp2.get_proteins())
    return list(shared_proteins)
def compute_jaccard_index(exp1: Experiment, exp2: Experiment, level: str = 'peptide') -> float:
    """Compute the Jaccard index between two experiments.

    The index is the size of the intersection divided by the size of the union of the
    peptides (or proteins) of the two experiments.

    Args:
        exp1 (Experiment): The first experimental instance.
        exp2 (Experiment): The second experimental instance.
        level (str): The level at which the overlap is computed, either 'peptide' or 'protein'.

    Returns:
        float: The Jaccard index at the requested level. When both experiments are empty at that
        level (empty union) 0.0 is returned instead of raising ZeroDivisionError.

    Raises:
        ValueError: If level is neither 'peptide' nor 'protein'.
    """
    if level not in ('peptide', 'protein'):
        raise ValueError(f"Level: {level} is not supported, currently only level, peptide and protein are supported")
    # Query each experiment once; the collections are reused for both the intersection and the union.
    if level == 'peptide':
        first, second = set(exp1.get_peptides()), set(exp2.get_peptides())
    else:
        first, second = set(exp1.get_proteins()), set(exp2.get_proteins())
    union_size = len(first | second)
    if union_size == 0:
        # Nothing was observed in either experiment, so there is nothing to share.
        return 0.0
    return len(first & second) / union_size
def _lookup_gene_expression(exp: Experiment, gene_id: str) -> float:
    """Return the expression of gene_id in the expression profile of exp's tissue, or -1 on KeyError."""
    try:
        return exp.get_tissue().get_expression_profile().get_gene_id_expression(gene_id=gene_id)
    except KeyError:
        # -1 is the sentinel used throughout for "no expression value available".
        return -1


def _protein_expression(exp: Experiment, gene_ids: List[str], single_match: bool) -> float:
    """Summarise the expression of the genes mapped to one protein as a single value (-1 if unavailable)."""
    values: List[float] = [_lookup_gene_expression(exp, gene_id) for gene_id in gene_ids]
    if single_match:
        # Exactly one mapped gene: report its expression (or the sentinel) as is.
        return values[0]
    # Zero or several mapped genes: average the ones that have an expression value.
    known: List[float] = [value for value in values if value != -1]
    if not known:
        return -1
    return np.mean(known)


def compute_expression_correlation(exp1: Experiment, exp2: Experiment) -> float:
    """compute the correlation in the gene expression between two experiments by constructing a union
    of all the proteins expressed in the first and second experiments, extracting the gene expression
    of these genes and then computing the Pearson correlation using the SciPy stats module.

    Proteins are mapped to gene ids with ``map_from_uniprot_gene``. A protein mapped to several genes is
    represented by the mean expression of the genes that have a value. Proteins without an expression
    value in either experiment are dropped before the correlation is computed.

    :param exp1: The first experimental object
    :type exp1: Experiment
    :param exp2: The second experimental object
    :type exp2: Experiment
    :return: the correlation in gene expression of the proteins inferred in the provided pair of experiments
    :rtype: float
    """
    # union of the proteins seen in either experiment
    protein_exp1: Set[str] = set(exp1.get_proteins())
    protein_exp2: Set[str] = set(exp2.get_proteins())
    unique_proteins = list(protein_exp1.union(protein_exp2))
    # map the proteins to gene ids
    prot2Ense: pd.DataFrame = map_from_uniprot_gene(unique_proteins)
    # one expression value per protein and experiment, in the order of unique_proteins
    gene_expression_exp1: List[float] = []
    gene_expression_exp2: List[float] = []
    for prot in unique_proteins:
        # The mapping rows are the same for both experiments, so select them once per protein.
        temp_df: pd.DataFrame = prot2Ense.loc[prot2Ense.iloc[:, 0] == prot]
        single_match: bool = temp_df.shape[0] == 1
        if single_match:
            # NOTE(review): this branch reads the 'Gene-ID' column by label while the other branch reads
            # the second column by position -- presumably the same column; confirm against
            # map_from_uniprot_gene.
            gene_ids: List[str] = [temp_df['Gene-ID'].tolist()[0]]
        else:
            gene_ids = temp_df.iloc[:, 1].tolist()
        gene_expression_exp1.append(_protein_expression(exp1, gene_ids, single_match))
        gene_expression_exp2.append(_protein_expression(exp2, gene_ids, single_match))
    # pair the values per protein
    temp_paired_exp_df: pd.DataFrame = pd.DataFrame({
        'exp1': gene_expression_exp1,
        'exp2': gene_expression_exp2
    })
    # keep only the proteins that have an expression value in both experiments
    has_both = (temp_paired_exp_df['exp1'] != -1) & (temp_paired_exp_df['exp2'] != -1)
    temp_paired_exp_df = temp_paired_exp_df.loc[has_both]
    # compute the correlation
    return pearsonr(temp_paired_exp_df['exp1'], temp_paired_exp_df['exp2'])[0]