예제 #1
0
def compute_protein_coverage(experiment1:Experiment,experiment2:Experiment,progress_bar:bool=True)->Dict[str,Dict[str,np.ndarray]]:
    """Collect the mapped coverage of every protein observed in both experiments.

    Args:
        experiment1 (Experiment): The first experiment; its proteins, HLA-set names and mapped proteins are queried.
        experiment2 (Experiment): The second experiment, queried in the same way.
        progress_bar (bool): Accepted for backward compatibility. It currently has no effect on the
            computation or on the returned value, defaults to True.

    Returns:
        Dict[str,Dict[str,np.ndarray]]: A nested dict keyed by protein identifier. Each value is a dict
            with one entry per experiment: the key is the experiment's HLA-set names joined with '_' and
            the value is whatever ``get_mapped_protein`` returns for that protein in that experiment.
            Only proteins reported by both experiments are included.

    Note:
        If both experiments produce the same joined HLA-set label, the two entries share a single key
        and only the value from the second experiment is kept.
    """
    # set() makes the intersection work whether get_proteins() hands back a set or a list.
    shared_proteins: Set[str] = set(experiment1.get_proteins()).intersection(experiment2.get_proteins())
    # The labels do not depend on the protein, so build them once instead of once per protein.
    label_one: str = '_'.join(experiment1.get_hla_set().get_names())
    label_two: str = '_'.join(experiment2.get_hla_set().get_names())
    return {
        protein: {
            label_one: experiment1.get_mapped_protein(protein),
            label_two: experiment2.get_mapped_protein(protein),
        }
        for protein in shared_proteins
    }
예제 #2
0
def get_binnary_protein_overlap(exp1:Experiment, exp2:Experiment)->Proteins:
    """Collect the proteins reported by both of the given experiments.

    :param exp1: the experiment whose protein collection is intersected with that of ``exp2``.
    :type exp1: Experiment
    :param exp2: the experiment supplying the second protein collection.
    :type exp2: Experiment
    :return: a list holding every protein returned by ``get_proteins()`` of both experiments.
    :rtype: Proteins
    """
    shared = exp1.get_proteins().intersection(exp2.get_proteins())
    return [*shared]
예제 #3
0
def compute_jaccard_index(exp1:Experiment,exp2:Experiment, level:str='peptide')->float:
    """Compute the Jaccard index between two experiments.

    The index is the size of the intersection divided by the size of the union of the
    peptide (or protein) collections of the two experiments.

    Args:
        exp1 (Experiment): The first experimental instance
        exp2 (Experiment): The second experimental instance
        level (str): The level at which the overlap is computed, either 'peptide' or 'protein'.
            Defaults to 'peptide'.

    Returns:
        float: The Jaccard index at the requested level. When both experiments are empty
            at that level the union is empty and 1.0 is returned, treating two identical
            (empty) collections as fully overlapping instead of dividing by zero.

    Raises:
        ValueError: If level is neither 'peptide' nor 'protein'.
    """
    # Fetch each collection once; the validation doubles as the dispatch.
    if level == 'peptide':
        first, second = exp1.get_peptides(), exp2.get_peptides()
    elif level == 'protein':
        first, second = exp1.get_proteins(), exp2.get_proteins()
    else:
        raise ValueError(f"Level: {level} is not supported, currently only level, peptide and protein are supported")
    union_size = len(first.union(second))
    if union_size == 0:
        # 0/0 case: both collections are empty, hence identical.
        return 1.0
    return len(first.intersection(second)) / union_size
예제 #4
0
def _gene_expression_or_default(exp: Experiment, gene_id: str) -> float:
    """Return the expression of ``gene_id`` in the tissue of ``exp``, or -1 when the lookup raises KeyError."""
    try:
        return exp.get_tissue().get_expression_profile(
        ).get_gene_id_expression(gene_id=gene_id)
    except KeyError:
        return -1


def _protein_expression(exp: Experiment, matches: pd.DataFrame) -> float:
    """Reduce the gene rows mapped to one protein to a single expression value, or -1 if none is available."""
    if matches.shape[0] == 1:  # exactly one gene is mapped to the protein
        return _gene_expression_or_default(exp,
                                           matches['Gene-ID'].tolist()[0])
    # Zero or several mapped genes: read the ids from the second column
    # (presumably the same 'Gene-ID' column -- confirm against map_from_uniprot_gene)
    # and average the values that could be retrieved.
    retrieved: List[float] = [
        value for value in (_gene_expression_or_default(exp, gene)
                            for gene in matches.iloc[:, 1].tolist())
        if value != -1
    ]
    if len(retrieved) == 0:
        return -1
    return np.mean(retrieved)


def compute_expression_correlation(exp1: Experiment,
                                   exp2: Experiment) -> float:
    """compute the correlation in the gene expression between two experiments by constructing a union
    of all the proteins expressed in the first and second experiments, extract the gene expression 
    of these genes and then compute the correlation using SciPy stat module. 

    Each protein is mapped to gene ids with ``map_from_uniprot_gene``. A protein mapped to exactly
    one gene takes the expression of that gene; otherwise the expression values that could be
    retrieved are averaged. The value -1 marks a protein without a retrievable expression value,
    and any protein carrying that marker in either experiment is excluded before the correlation
    is computed.

    NOTE(review): the correlation is computed on whatever pairs survive the filtering; behaviour
    with fewer than two surviving pairs is left to ``pearsonr`` -- confirm against the SciPy
    version in use.

    :param exp1: The first experimental object 
    :type exp1: Experiment
    :param exp2: The second experimental object 
    :type exp2: Experiment
    :return: the correlation in gene expression of the proteins inferred in the provided pair of experiment
    :rtype: float
    """
    # union of the proteins reported by the two experiments
    protein_exp1: Set[str] = set(exp1.get_proteins())
    protein_exp2: Set[str] = set(exp2.get_proteins())
    unique_proteins = list(protein_exp1.union(protein_exp2))
    # map the proteins to gene ids
    prot2Ense: pd.DataFrame = map_from_uniprot_gene(unique_proteins)
    # Select the mapping rows of every protein once; both experiments reuse them.
    matches_per_protein: List[pd.DataFrame] = [
        prot2Ense.loc[prot2Ense.iloc[:, 0] == prot]
        for prot in unique_proteins
    ]
    # one expression value per protein and experiment, -1 where unavailable
    gene_expression_exp1: List[float] = [
        _protein_expression(exp1, matches) for matches in matches_per_protein
    ]
    gene_expression_exp2: List[float] = [
        _protein_expression(exp2, matches) for matches in matches_per_protein
    ]
    paired_expression: pd.DataFrame = pd.DataFrame({
        'exp1': gene_expression_exp1,
        'exp2': gene_expression_exp2,
    })
    # keep only the proteins with an expression value in both experiments
    mapped_in_both = ((paired_expression['exp1'] != -1)
                      & (paired_expression['exp2'] != -1))
    paired_expression = paired_expression.loc[mapped_in_both]

    # compute the correlation
    return pearsonr(paired_expression['exp1'],
                    paired_expression['exp2'])[0]