Example #1
0
def compute_protein_coverage(experiment1:Experiment,experiment2:Experiment,progress_bar:bool=True)->Dict[str,Dict[str,np.ndarray]]:
    """ Collect the mapped-protein arrays of both experiments for every protein they share

    Args:
        experiment1 (Experiment): The first experiment containing the protein and peptides derived from the first HLA-set
        experiment2 (Experiment): The second experiment containing the protein and peptides derived from the second HLA-set
        progress_bar (bool): Kept for backward compatibility. Both values of the flag execute exactly the same
            computation and no progress bar is drawn by this function. Defaults to True.

    Returns:
        Dict[str,Dict[str,np.ndarray]]: A nested dict keyed by protein identifier. Each value is a dict keyed by the
            '_'-joined allele names of the experiment's HLA-set and holding the result of ``get_mapped_protein`` for
            that experiment. Only proteins observed in both experiments are included.

    Note:
        If both experiments report the same allele names, the two inner keys are identical and only the
        entry of the second experiment is kept for each protein.
    """
    # get_proteins() must return a set-like object, .intersection is called on it directly
    shared_proteins=experiment1.get_proteins().intersection(experiment2.get_proteins())
    # the labels do not depend on the protein, so they are computed once instead of once per protein
    label_one:str='_'.join(experiment1.get_hla_set().get_names())
    label_two:str='_'.join(experiment2.get_hla_set().get_names())
    return {
        protein:{
            label_one:experiment1.get_mapped_protein(protein),
            label_two:experiment2.get_mapped_protein(protein)
        }
        for protein in shared_proteins
    }
Example #2
0
def get_binnary_protein_overlap(exp1:Experiment, exp2:Experiment)->Proteins:
    """Return the proteins that two experiments have in common.

    :param exp1: the first experiment, queried through ``get_proteins``.
    :type exp1: Experiment
    :param exp2: the second experiment, queried through ``get_proteins``.
    :type exp2: Experiment
    :return: a list with every protein reported by both experiments.
    :rtype: Proteins
    """
    # the first experiment's proteins must be set-like; the second only needs to be iterable
    shared_proteins=exp1.get_proteins().intersection(exp2.get_proteins())
    return list(shared_proteins)
Example #3
0
def get_binnary_peptide_overlap(exp1:Experiment, exp2:Experiment)->Peptides:
    """Return the peptides that two experiments have in common.

    :param exp1: the first experiment, queried through ``get_peptides``.
    :type exp1: Experiment
    :param exp2: the second experiment, queried through ``get_peptides``.
    :type exp2: Experiment
    :return: a list with every peptide reported by both experiments.
    :rtype: Peptides
    """
    # the first experiment's peptides must be set-like; the second only needs to be iterable
    shared_peptides=exp1.get_peptides().intersection(exp2.get_peptides())
    return list(shared_peptides)
Example #4
0
    def load_data(self, exp: Experiment, num_proteins: int = -1) -> None:
        """Load the gene ids of an experiment into the engine, so GOEA can be conducted

        Args:
            exp (Experiment): An Experimental object to extract uniprot ids from, via get_peptides_per_protein.
            num_proteins (int, optional): The number of proteins to include. Defaults to -1, which means use
                all proteins; any other value keeps the first ``num_proteins`` rows of the table returned by
                ``exp.get_peptides_per_protein()``. NOTE(review): this presumably selects the proteins with
                the most peptides, which assumes that table is sorted by peptide count -- confirm.

        Raises:
            ValueError: if the engine still holds gene ids from a previous call.
        """
        if self._gene_ids is not None:
            # gene ids are stored as ints (see below), so they must be converted before
            # joining; joining them directly raised a TypeError instead of this ValueError
            preview = ','.join(str(gene_id) for gene_id in self._gene_ids[:10])
            raise ValueError(
                f"There some data still in the engine, the first 10 genes are: {preview} "
                "clean your engine from previous data using the function, clean_engine and try again."
            )
        print(
            f"Getting the number of peptide per protein ..., started at: {time.ctime()}"
        )
        peptides_per_protein = exp.get_peptides_per_protein()
        # the first column of the table holds the protein (uniprot) identifiers
        list_proteins = peptides_per_protein.iloc[:, 0].to_list()
        if num_proteins != -1:
            list_proteins = list_proteins[:num_proteins]
        print(
            f"Map uniprot to Entrez gene ids ..., starting at: {time.ctime()}")
        # the second column of the mapping table holds the Entrez gene ids
        mapped_ids = map_from_uniprot_to_Entrez_Gene(list_proteins).iloc[:, 1]
        self._gene_ids = [int(gene_id) for gene_id in mapped_ids.to_list()]
        print(f"{len(self._gene_ids)} Genes have been correctly loaded")
Example #5
0
def simulate_random_experiment(alleles: List[str],
                               path2fasta: str,
                               tissue_name: str = 'TEST_TISSUE',
                               num_pep: int = 10,
                               num_prot: int = 5,
                               proband_name: str = None) -> Experiment:
    """ Simulate a random experiment object

    :param alleles: a list of alleles names.
    :type alleles: List[str]
    :param path2fasta: The path to the fasta file used both to simulate the identification table \
    and to build the sequence database
    :type path2fasta: str
    :param tissue_name: the name of the tissue, defaults to 'TEST_TISSUE'
    :type tissue_name: str, optional
    :param num_pep: the number of peptides in the table, defaults to 10
    :type num_pep: int, optional
    :param num_prot: number of proteins, defaults to 5
    :type num_prot: int, optional
    :param proband_name: The name of the Proband, defaults to None in which case a random \
    12-character name is generated
    :type proband_name: str, optional
    :return: A simulated experimental object
    :rtype: Experiment
    """
    if proband_name is None:
        proband_name = generate_random_name(12)

    proband: Proband = Proband(name=proband_name)
    hla_set: HLASet = HLASet(alleles)
    # num_prot was previously ignored: num_pep was passed for both counts.
    # NOTE(review): assumes the simulator's third positional parameter is the
    # number of proteins -- confirm against its signature.
    ident_table: pd.DataFrame = simulate_an_experimental_ident_table_from_fasta(
        path2fasta, num_pep, num_prot)
    # to be upgraded to the new version of the Tissue class
    tissue: Tissue = Tissue(tissue_name,
                            simulate_an_expression_table(num_transcripts=1000),
                            simulate_an_expression_table(num_transcripts=100))
    database: SeqDB = SeqDB(path2fasta)
    return Experiment(proband=proband,
                      hla_set=hla_set,
                      tissue=tissue,
                      database=database,
                      ident_table=ident_table)
Example #6
0
def compute_jaccard_index(exp1:Experiment,exp2:Experiment, level:str='peptide')->float:
    """Compute the Jaccard index between two samples

    Args:
        exp1 (Experiment): The first experimental instance
        exp2 (Experiment): The second experimental instance
        level (str): The level of computing the overlap between samples, can be any of peptide or protein

    Raises:
        ValueError: if level is neither 'peptide' nor 'protein'

    Returns:
        float: The size of the intersection divided by the size of the union of the peptides (or proteins)
            of the two experiments. When both experiments are empty the union is empty and 0.0 is
            returned instead of failing with a ZeroDivisionError.
    """
    if level not in ('peptide','protein'):
        raise ValueError(f"Level: {level} is not supported, currently only level, peptide and protein are supported")
    # query each experiment once; the getters must return set-like objects
    if level=='peptide':
        items_one,items_two=exp1.get_peptides(),exp2.get_peptides()
    else:
        items_one,items_two=exp1.get_proteins(),exp2.get_proteins()
    union_size=len(items_one.union(items_two))
    if union_size==0:
        # nothing was observed in either experiment, so there is no overlap to measure
        return 0.0
    return len(items_one.intersection(items_two))/union_size
Example #7
0
    def __init__(
            self,
            filepath: str,
            path2fasta: str,
            fileformat: str = 'idXML',
            tissue_name: str = 'total PMBC',
            proband_name: str = 'Default Proband',
            hla_set: List[str] = ['DRB1*15:01', 'DRB1*15:01']) -> None:
        """A Wrapper class for constructing an experimental dataset using user defined parameters\
            The class takes care of initializing all classes and functions, providing an easy-to-use interface\
            for working with immunopeptidomics data
        Args:
            filepath (str): the path to load the input file, for example an idXML or an identification table
            path2fasta (str): the path to load the Fasta database
            fileformat (str, optional): type of input format, can be any of idXML, pepXML, mzTab or csv.\
                 Defaults to 'idXML'.
            tissue_name (str, optional): The name of the tissue. NOTE(review): currently not used, the tissue\
                 is always created with the name 'small intestine' -- confirm whether this is intended.\
                 Defaults to 'total PMBC'.
            proband_name (str, optional): the name of the proband from whom the data was obtained. Defaults to 'Default Proband'.
            hla_set (List[str], optional): A list of HLA alleles. NOTE(review): currently not used, the HLASet\
                 is always created from the placeholder allele 'HLA-DRB1*15:01' -- confirm whether this is intended.\
                 Defaults to ['DRB1*15:01','DRB1*15:01'].

        Raises:
            ValueError: if filepath or path2fasta does not exist, if fileformat is not supported, or if parsing\
                 the input file fails.
            RuntimeError: if creating the HLASet, the Tissue or the Experiment instance fails.
            IOError: if loading the fasta database fails.
        """
        ## Checking that the input is correct
        if not os.path.exists(filepath):
            raise ValueError(
                f"The provided path for the identification file : {filepath} does not exist!!"
            )
        if not os.path.exists(path2fasta):
            raise ValueError(
                f"The path to the proivded fasta file: {path2fasta}, does not exists!!!"
            )
        # a single definition keeps the check and the error message in agreement; the message
        # used to advertise 'IdTable' although the value that is actually accepted is 'csv'
        supported_formats = ('idXML', 'pepXML', 'csv', 'mzTab')
        if fileformat not in supported_formats:
            raise ValueError(
                f"Unknow input format, the provided format: {fileformat} is not supported, currently supported values are: {', '.join(supported_formats)}"
            )
        # define the data
        self._proband = Proband(name=proband_name)  # the name of the proband
        # every wrapped error below is chained with `from exp` so the original traceback is kept
        try:
            self._hLASet = HLASet(
                hlas=['HLA-DRB1*15:01']
            )  # just a place holder to represent the HLA allele, an instance of class HLASet
        except Exception as exp:
            raise RuntimeError(
                f"The following error was Encountered while creating an HLASet: \n{str(exp)}\n"
            ) from exp
        try:
            self._seqBD = SeqDB(path2fasta=path2fasta)
        except Exception as exp:
            raise IOError(
                f"While loading the fasta database the following error was Encountered : \n{str(exp)}\n"
            ) from exp
        self._expresson_profile = GeneExpressionDB(
        )  # use the data on the human protein atlas @https://www.proteinatlas.org/about/download --> Normal tissue data
        self._protein_locations = CellularLocationDB(
        )  # use the data on the human protein atlas @https://www.proteinatlas.org/about/download --> Subcellular location data
        try:
            self._tissue = Tissue(name='small intestine',
                                  main_exp_value=self._expresson_profile,
                                  main_location=self._protein_locations
                                  )  # create the tissue instance
        except Exception as exp:
            raise RuntimeError(
                f"While creating a tissue instance, the following error was Encountered: \n{str(exp)}\n"
            ) from exp
        try:
            if fileformat == 'idXML':
                input_table = parse_xml_based_format_to_identification_table(
                    path2XML_file=filepath,
                    path2fastaDB=path2fasta,
                    is_idXML=True)
            elif fileformat == 'pepXML':
                input_table = parse_xml_based_format_to_identification_table(
                    path2XML_file=filepath,
                    path2fastaDB=path2fasta,
                    is_idXML=False)
            elif fileformat == 'mzTab':
                input_table = parse_mzTab_to_identification_table(
                    path2mzTab=filepath, path2fastaDB=path2fasta)
            else:  # 'csv', the only remaining supported format
                input_table = parse_text_table(filepath, path2fasta)
        except Exception as exp:
            raise ValueError(
                f"Loading the input table has caused to the following error: \n{str(exp)}\n"
            ) from exp
        # constructing the experiments
        try:
            self._exp = Experiment(self._proband, self._hLASet, self._tissue,
                                   self._seqBD, input_table)
        except Exception as exp:
            raise RuntimeError(
                f"Generating an Experiment instance has caused to the following error: \n{str(exp)}\n"
            ) from exp
        self._cashed_results = dict()
Example #8
0
# sentinel stored for proteins whose expression could not be resolved
_UNMAPPED_EXPRESSION = -1


def _collect_expression_values(exp: Experiment, proteins: List[str],
                               prot2Ense: pd.DataFrame) -> List[float]:
    """Return one expression value per protein, looked up in the tissue of exp.

    For every protein the rows of prot2Ense whose first column equals the protein
    are selected. With exactly one row, the expression of its 'Gene-ID' is used.
    Otherwise the expression of every gene in the second column is looked up and
    the mean of the resolvable ones is used. Proteins for which nothing could be
    resolved get the value -1.
    """
    profile = exp.get_tissue().get_expression_profile()

    def _lookup(gene_id):
        # a KeyError from the expression profile marks the gene as unmapped
        try:
            return profile.get_gene_id_expression(gene_id=gene_id)
        except KeyError:
            return _UNMAPPED_EXPRESSION

    values: List[float] = []
    for prot in proteins:
        matches: pd.DataFrame = prot2Ense.loc[prot2Ense.iloc[:, 0] == prot]
        if matches.shape[0] == 1:  # we got only one match
            values.append(_lookup(matches['Gene-ID'].tolist()[0]))
            continue
        # zero or several matches: average over the genes that could be resolved
        resolved: List[float] = [
            value
            for value in (_lookup(gene) for gene in matches.iloc[:, 1].tolist())
            if value != _UNMAPPED_EXPRESSION
        ]
        values.append(np.mean(resolved) if resolved else _UNMAPPED_EXPRESSION)
    return values


def compute_expression_correlation(exp1: Experiment,
                                   exp2: Experiment) -> float:
    """compute the correlation in the gene expression between two experiments by constructing a union
    of all the proteins expressed in the first and second experiments, extract the gene expression
    of these genes and then compute the correlation using SciPy stat module.

    Proteins whose expression could not be resolved in either of the two experiments are
    excluded before the correlation is computed.

    :param exp1: The first experimental object
    :type exp1: Experiment
    :param exp2: The second experimental object
    :type exp2: Experiment
    :return: the Pearson correlation coefficient of the gene expression of the proteins inferred in the provided pair of experiment
    :rtype: float
    """
    # build the union of the proteins of both experiments
    protein_exp1: Set[str] = set(exp1.get_proteins())
    protein_exp2: Set[str] = set(exp2.get_proteins())
    unique_proteins = list(protein_exp1.union(protein_exp2))
    # map the proteins to gene ids
    prot2Ense: pd.DataFrame = map_from_uniprot_gene(unique_proteins)
    # the same protein list is evaluated against the tissue of each experiment
    paired_expression: pd.DataFrame = pd.DataFrame({
        'exp1': _collect_expression_values(exp1, unique_proteins, prot2Ense),
        'exp2': _collect_expression_values(exp2, unique_proteins, prot2Ense)
    })
    # keep only the proteins that were mapped in both experiments
    mapped_in_both = ((paired_expression['exp1'] != _UNMAPPED_EXPRESSION)
                      & (paired_expression['exp2'] != _UNMAPPED_EXPRESSION))
    paired_expression = paired_expression.loc[mapped_in_both]
    # compute the correlation, pearsonr returns the coefficient first
    return pearsonr(paired_expression['exp1'], paired_expression['exp2'])[0]
Example #9
0
def create_experiment(table_format, tissue_name, hla_alleles, n_clicks):
    """Build the module-level ``experiment`` from the uploaded files and describe the outcome.

    The function reads the module-level names PEPTIDE_TABLE_PATH, FASTA_DATABASE_PATH,
    GENE_EXPRESSION_TABLE and PROTEIN_LOC_TABLE and, on success, binds the created
    Experiment to the global name ``experiment``.

    Args:
        table_format: format of the identification table; 'pepXML', 'idXML' and 'mzTab'
            select the matching parser, any other value is parsed as a comma separated table.
        tissue_name: name given to the created Tissue instance.
        hla_alleles: allele names separated by ';'.
        n_clicks: click counter; nothing is done when it is None, zero or negative.

    Returns:
        None when n_clicks does not indicate a click, otherwise a string: either
        ``str(experiment)`` on success or a message describing the step that failed.
    """
    global experiment
    # a missing counter (None) is treated like "not clicked" instead of failing on `None > 0`
    if not n_clicks or n_clicks <= 0:
        return None
    if PEPTIDE_TABLE_PATH is None:
        return "ERROR: The peptide identification file has not been uploaded"
    if FASTA_DATABASE_PATH is None:
        return "ERROR: The sequence database identification has not been uploaded"
    # try to load the peptide table
    try:
        if table_format == 'pepXML':
            table_pep = inFunc.parse_xml_based_format_to_identification_table(
                path2XML_file=PEPTIDE_TABLE_PATH,
                path2fastaDB=FASTA_DATABASE_PATH,
                is_idXML=False)
        elif table_format == 'idXML':
            table_pep = inFunc.parse_xml_based_format_to_identification_table(
                path2XML_file=PEPTIDE_TABLE_PATH,
                path2fastaDB=FASTA_DATABASE_PATH,
                is_idXML=True)
        elif table_format == 'mzTab':
            table_pep = inFunc.parse_mzTab_to_identification_table(
                path2mzTab=PEPTIDE_TABLE_PATH,
                path2fastaDB=FASTA_DATABASE_PATH,
            )
        else:
            table_pep = inFunc.parse_text_table(
                path2file=PEPTIDE_TABLE_PATH,
                path2fastaDB=FASTA_DATABASE_PATH,
                sep=',')
    except Exception as exp:
        return f'ERROR:: While parsing the identification table, the following error was encountered: {exp} '
    # create a proband
    proband = Proband(name='UI_PROBAND')
    # create a sequence database
    try:
        seqs = SeqDB(path2fasta=FASTA_DATABASE_PATH)
    except Exception as exp:
        return f'ERROR:: While creating the sequence database: the following error was encountered; {exp}'
    # create the OrgDB; the message used to blame the sequence database (copy-paste slip)
    try:
        org_db = OrganismDB(FASTA_DATABASE_PATH)
    except Exception as exp:
        return f'ERROR:: While creating the organism database: the following error was encountered; {exp}'
    # create the expression profile, falling back to the online table when none was uploaded
    if GENE_EXPRESSION_TABLE is None:
        try:
            expresson_profile = GeneExpressionDB(
                path2data=
                'https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip',
                sep='\t')
        except Exception as exp:
            return f'While Downloading the online table the following error was encountered: {exp}'
    else:
        try:
            expresson_profile = GeneExpressionDB(
                path2data=GENE_EXPRESSION_TABLE, sep='\t')
        except Exception as exp:
            return f'While parsing the expression table, the following error was encountered: {exp}'
    # create the location table, falling back to the online table when none was uploaded
    if PROTEIN_LOC_TABLE is None:
        try:
            protein_locations = CellularLocationDB(
                path2data=
                'https://www.proteinatlas.org/download/subcellular_location.tsv.zip',
                sep='\t')
        except Exception as exp:
            return f'While Downloading the protein sub-cellular location table, the following error was encountered: {exp}'
    else:
        try:
            protein_locations = CellularLocationDB(
                path2data=PROTEIN_LOC_TABLE, sep='\t')
        except Exception as exp:
            return f'While parsing the location table, the following error was encountered: {exp}'
    # the remaining steps report failures as messages too, like every step above
    try:
        tissue = Tissue(name=tissue_name,
                        main_exp_value=expresson_profile,
                        main_location=protein_locations)
    except Exception as exp:
        return f'ERROR:: While creating the tissue instance, the following error was encountered: {exp}'
    try:
        hlas = HLASet(hlas=hla_alleles.split(';'))
    except Exception as exp:
        return f'ERROR:: While creating the HLA set, the following error was encountered: {exp}'
    # create the experiment object
    try:
        experiment = Experiment(proband=proband,
                                hla_set=hlas,
                                tissue=tissue,
                                database=seqs,
                                ident_table=table_pep)
    except Exception as exp:
        return f'while creating an experimental object,the following error was encounter {exp}'
    # annotate the experiment
    try:
        experiment.annotate_proteins(org_db)
    except Exception as exp:
        return f'ERROR:: While annotating the proteins of the experiment, the following error was encountered: {exp}'
    # return the experiment output
    return str(experiment)