Ejemplo n.º 1
0
def testAAS():
    """
    @tests:
     AASequence.__init__
     AASequence.__add__
     AASequence.__radd__
     AASequence.__iadd__
     AASequence.getCTerminalModification
     AASequence.getNTerminalModification
     AASequence.setCTerminalModification
     AASequence.setModification
     AASequence.setNTerminalModification
     AASequence.setStringSequence
     AASequence.toString
     AASequence.toUnmodifiedString
    """
    # Smoke test of the AASequence binding. The docstring above is kept
    # verbatim because its "@tests:" list looks machine-readable.
    seq = pyopenms.AASequence()

    # The concatenation operators only have to be callable on an empty
    # sequence; their results are not inspected.
    seq + seq
    seq += seq

    # Merely touching the attribute checks that a docstring is exposed.
    seq.__doc__

    # A freshly parsed, unmodified peptide reports empty terminal
    # modifications.
    seq = pyopenms.AASequence("DFPIANGER")
    c_term = seq.getCTerminalModification()
    assert c_term == ""
    n_term = seq.getNTerminalModification()
    assert n_term == ""

    # Resetting both terminal modifications and the sequence itself must
    # leave an object that renders as the empty string either way.
    seq.setCTerminalModification("")
    seq.setNTerminalModification("")
    seq.setStringSequence("")
    rendered = seq.toString()
    assert rendered == ""
    rendered_unmodified = seq.toUnmodifiedString()
    assert rendered_unmodified == ""
Ejemplo n.º 2
0
def testPeptideHit():
    """
    @tests:
     PeptideHit.__init__
     PeptideHit.addProteinAccession
     PeptideHit.clearMetaInfo
     PeptideHit.getAAAfter
     PeptideHit.getAABefore
     PeptideHit.getKeys
     PeptideHit.getMetaValue
     PeptideHit.getProteinAccessions
     PeptideHit.getRank
     PeptideHit.getScore
     PeptideHit.getSequence
     PeptideHit.isMetaEmpty
     PeptideHit.metaValueExists
     PeptideHit.removeMetaValue
     PeptideHit.setAAAfter
     PeptideHit.setAABefore
     PeptideHit.setCharge
     PeptideHit.setMetaValue
     PeptideHit.setProteinAccessions
     PeptideHit.setRank
     PeptideHit.setScore
     PeptideHit.setSequence
     PeptideHit.__eq__
     PeptideHit.__ge__
     PeptideHit.__gt__
     PeptideHit.__le__
     PeptideHit.__lt__
     PeptideHit.__ne__
    """
    # Exercises the PeptideHit binding. The docstring above is kept
    # verbatim because its "@tests:" list looks machine-readable.

    # A default-constructed hit is equal to itself under both == and !=.
    blank_hit = pyopenms.PeptideHit()
    assert blank_hit == blank_hit
    assert not blank_hit != blank_hit

    # Constructor arguments: score 1.0, rank 1, a third value of 0
    # (presumably the charge -- confirm against the binding) and the
    # peptide sequence.
    hit = pyopenms.PeptideHit(1.0, 1, 0, pyopenms.AASequence("A"))
    _testMetaInfoInterface(hit)

    hit.addProteinAccession("A")
    accessions = hit.getProteinAccessions()
    assert accessions == ["A"]

    # The constructor values must be readable through the getters.
    assert hit.getScore() == 1.0
    assert hit.getRank() == 1
    initial_sequence = hit.getSequence()
    assert initial_sequence.toString() == "A"

    # Every setter must round-trip through its matching getter.
    hit.setScore(2.0)
    assert hit.getScore() == 2.0
    hit.setRank(30)
    assert hit.getRank() == 30
    hit.setSequence(pyopenms.AASequence("AAA"))
    updated_sequence = hit.getSequence()
    assert updated_sequence.toString() == "AAA"

    # Flanking residues before and after the peptide.
    hit.setAABefore("B")
    assert hit.getAABefore() == "B"
    hit.setAAAfter("C")
    assert hit.getAAAfter() == "C"

    # Self-comparison must still hold once the hit has been mutated.
    assert hit == hit
    assert not hit != hit
Ejemplo n.º 3
0
def get_mass_diff(seq1, seq2):
    """
    Return the absolute monoisotopic mass difference of two peptides.

    Both arguments are peptide strings that ``oms.AASequence`` can parse.
    The weights are requested with ``getMonoWeight(0, 0)`` (full peptide,
    charge = 0), and the absolute value of their difference is returned.
    """
    # Evaluated left to right, so seq1 is parsed before seq2.
    weights = [
        oms.AASequence(peptide).getMonoWeight(0, 0)
        for peptide in (seq1, seq2)
    ]
    delta = weights[0] - weights[1]
    return np.abs(delta)
Ejemplo n.º 4
0
def annotate(spectra, identifications):
    """Attach peptide and modification information to identified spectra.

    Spectra are matched to identifications by their 1-based position in
    ``spectra``; spectra without an entry in ``identifications`` are left
    untouched. Matching spectra are updated in place:

    * ``spectrum.peptide`` -- the unmodified peptide sequence as text
    * ``spectrum.modifications`` -- dict mapping the 0-based residue index
      to the monoisotopic mass delta of that residue's modification

    Args:
        spectra: iterable of spectrum objects accepting ``peptide`` and
            ``modifications`` attributes.
        identifications: mapping from 1-based spectrum index to a peptide
            string that ``pyopenms.AASequence.fromString`` can parse.

    Returns:
        The very same ``spectra`` object that was passed in.
    """
    # TODO: idXML currently yields index-based spectrum references instead
    # of title-based ones, hence the lookup by position.
    for spectrum_index, spectrum in enumerate(spectra, 1):
        if spectrum_index not in identifications:
            # TODO: decide what should happen when no id was found.
            continue

        # fromString is a static factory; no throwaway instance is needed.
        sequence = pyopenms.AASequence.fromString(
            identifications[spectrum_index])

        modifications = {}
        for residue_index in range(sequence.size()):
            residue = sequence.getResidue(residue_index)
            if residue.isModified():
                modification = residue.getModification()
                modifications[residue_index] = modification.getDiffMonoMass()

        # Older pyopenms releases return bytes here, newer ones return str;
        # decode only when there is something to decode.
        peptide = sequence.toUnmodifiedString()
        if isinstance(peptide, bytes):
            peptide = peptide.decode()

        spectrum.peptide = peptide
        spectrum.modifications = modifications
    return spectra
Ejemplo n.º 5
0
    def simplisticBinnedScoring(self, phit, spectrum):
        """Simplistic phospho-scoring of a spectrum against a peptide hit.

        All "(Phospho)" annotations are stripped from the hit's sequence.
        Then every S, T or Y residue is tried in turn as the single
        phosphorylation site: a theoretical b/y-ion spectrum (charge 1) is
        generated for that candidate, binned, and compared with the binned
        experimental spectrum.

        Args:
            phit: peptide hit whose ``getSequence().toString()`` yields the
                (possibly phosphorylated) peptide string.
            spectrum: experimental spectrum accepted by ``self.binSpectrum``.

        Returns:
            ``[score, AASequence]`` for the best-scoring candidate; on ties
            the candidate closest to the N-terminus wins. If the sequence
            has no S/T/Y residue, ``[-1, pyopenms.AASequence()]`` is
            returned, the same "no result" value that ``score`` uses.
        """
        seq = phit.getSequence().toString().replace("(Phospho)", "")
        spectrum_b = self.binSpectrum(spectrum)
        charge = 1

        # Resolve the ion-type enum once; only this lookup differs between
        # OpenMS versions, so only this lookup is guarded.
        try:
            y_ion = pyopenms.Residue.ResidueType.YIon
            b_ion = pyopenms.Residue.ResidueType.BIon
        except AttributeError:
            # OpenMS 1.11 exposes the enum at module level.
            y_ion = pyopenms.ResidueType.YIon
            b_ion = pyopenms.ResidueType.BIon

        # The generator does not depend on the candidate, so build it once.
        spectrum_generator = pyopenms.TheoreticalSpectrumGenerator()

        possibilities = []
        # Iterate over all possible phosphosites
        for m in re.finditer("[STY]", seq):
            # Insert the annotation directly after the matched residue; the
            # text before and after the insertion point is kept exactly once.
            site_end = m.start() + 1
            new_sequence = seq[:site_end] + "(Phospho)" + seq[site_end:]
            new_aaseq = pyopenms.AASequence(new_sequence)

            # Generate theoretical spectrum (y ions first, then b ions)
            rs = pyopenms.RichMSSpectrum()
            spectrum_generator.addPeaks(rs, new_aaseq, y_ion, charge)
            spectrum_generator.addPeaks(rs, new_aaseq, b_ion, charge)
            theor_b = self.binSpectrum(convertToMSSpectrum(rs))

            # Compare theoretical spectrum to experimental spectrum
            comp_score = self.compare_binnedSpectra(spectrum_b, theor_b)
            possibilities.append([comp_score, new_aaseq])

        if not possibilities:
            # No candidate site at all: report "no result" instead of
            # failing with an IndexError.
            return [-1, pyopenms.AASequence()]

        # max() returns the first maximal entry, which matches what a
        # stable descending sort followed by [0] would select.
        return max(possibilities, key=lambda entry: entry[0])
Ejemplo n.º 6
0
    def test_spectrum(self):
        """Check the DIA b/y-ion scores with and without a phospho site."""
        # Four of the naked b/y ions plus one modified b ion and one
        # modified y ion (79.9657 is added for the "+ P" peaks).
        mz = [
            350.17164,  # b
            421.20875,  # b
            421.20875 + 79.9657,  # b + P
            547.26291,  # y
            646.33133,  # y
            809.39466 + 79.9657,  # y + P
        ]
        # NOTE(review): seven intensities are supplied for six m/z values;
        # kept exactly as found -- confirm whether this is intentional.
        intensity = [100] * 7

        spectrum = pyopenms.Spectrum()
        spectrum.setMZArray(mz)
        spectrum.setIntensityArray(intensity)

        # The window is large enough that none of the peaks falls out.
        diascoring = pyopenms.DIAScoring()
        diascoring.set_dia_parameters(0.05, False, 30, 50, 4, 4)

        peptide = pyopenms.AASequence("SYVAWDR")
        charge = 1

        # Unmodified peptide: the score arguments start at zero and the
        # results come back as a pair.
        unmodified_scores = diascoring.dia_by_ion_score(
            spectrum, peptide, charge, 0.0, 0.0)
        bseries_score, yseries_score = unmodified_scores

        self.assertAlmostEqual(bseries_score, 2.0)
        self.assertAlmostEqual(yseries_score, 2.0)

        # Now add a modification to the sequence: phosphorylate the Y.
        peptide.setModification(1, "Phospho")
        modified_scores = diascoring.dia_by_ion_score(
            spectrum, peptide, 1, 0, 0)
        bseries_score, yseries_score = modified_scores

        self.assertAlmostEqual(bseries_score, 1.0)
        self.assertAlmostEqual(yseries_score, 3.0)
Ejemplo n.º 7
0
def testPeptideIdentification():
    """
    @tests:
     PeptideIdentification.__init__
     PeptideIdentification.assignRanks
     PeptideIdentification.clearMetaInfo
     PeptideIdentification.empty
     PeptideIdentification.getHits
     PeptideIdentification.getIdentifier
     PeptideIdentification.getKeys
     PeptideIdentification.getMetaValue
     PeptideIdentification.getNonReferencingHits
     PeptideIdentification.getReferencingHits
     PeptideIdentification.getScoreType
     PeptideIdentification.getSignificanceThreshold
     PeptideIdentification.insertHit
     PeptideIdentification.isHigherScoreBetter
     PeptideIdentification.isMetaEmpty
     PeptideIdentification.metaValueExists
     PeptideIdentification.removeMetaValue
     PeptideIdentification.setHigherScoreBetter
     PeptideIdentification.setHits
     PeptideIdentification.setIdentifier
     PeptideIdentification.setMetaValue
     PeptideIdentification.setScoreType
     PeptideIdentification.sort
     PeptideIdentification.__eq__
     PeptideIdentification.__ge__
     PeptideIdentification.__gt__
     PeptideIdentification.__le__
     PeptideIdentification.__lt__
     PeptideIdentification.__ne__
     """
    # Exercises the PeptideIdentification binding. The docstring above is
    # kept verbatim because its "@tests:" list looks machine-readable.

    def check_is_inserted_hit(candidate):
        # The only hit ever stored has sequence "A", score 1.0 and rank 1.
        assert candidate.getSequence().toString() == "A"
        assert candidate.getScore() == 1.0
        assert candidate.getRank() == 1

    pep_id = pyopenms.PeptideIdentification()
    _testMetaInfoInterface(pep_id)
    assert pep_id == pep_id
    assert not pep_id != pep_id

    # insertHit and setHits must both leave exactly one equal hit behind;
    # the single-element unpacking enforces the "exactly one" part.
    peptide_hit = pyopenms.PeptideHit(1.0, 1, 0, pyopenms.AASequence("A"))
    pep_id.insertHit(peptide_hit)
    (stored_hit,) = pep_id.getHits()
    assert stored_hit == peptide_hit

    pep_id.setHits([peptide_hit])
    (stored_hit,) = pep_id.getHits()
    assert stored_hit == peptide_hit

    # Type checks on the simple accessors, plus calls that only need to
    # succeed without raising.
    assert isinstance(pep_id.getSignificanceThreshold(), float)
    assert isinstance(pep_id.getScoreType(), str)
    pep_id.setScoreType("A")
    assert isinstance(pep_id.isHigherScoreBetter(), int)
    assert isinstance(pep_id.getIdentifier(), str)
    pep_id.setIdentifier("id")
    pep_id.assignRanks()
    pep_id.sort()
    assert not pep_id.empty()

    # Lookup by a single accession string; results are written into the
    # list that is passed in.
    found = []
    pep_id.getReferencingHits("A", found)
    assert found == []
    pep_id.getNonReferencingHits("A", found)
    (only_hit,) = found
    check_is_inserted_hit(only_hit)

    # Lookup by a list of accession strings.
    found = []
    pep_id.getReferencingHits(["A"], found)
    assert found == []
    pep_id.getNonReferencingHits(["A"], found)
    (only_hit,) = found
    check_is_inserted_hit(only_hit)

    # Lookup by a list of ProteinHit objects. The output list is
    # deliberately NOT reset before the first call, as in the original
    # test sequence.
    protein_hit = pyopenms.ProteinHit()
    pep_id.getReferencingHits([protein_hit], found)
    (only_hit,) = found
    check_is_inserted_hit(only_hit)
    found = []
    pep_id.getNonReferencingHits([protein_hit], found)
    (only_hit,) = found
    check_is_inserted_hit(only_hit)
Ejemplo n.º 8
0
def main():
    """
    Command-line entry point: cross-check OpenMS results against BICEPS.

    After parsing the command line, the ``SAAV_candidates*.csv`` files
    found under ``--in_oms`` and the ``*.txt.mut`` files found under
    ``--in_bic`` are each sorted and then paired by position. For every
    pair, three files are written next to each other (prefix: ``--out_aux``
    followed by the base name of the OpenMS file):

    * ``<name>_peptide_identifications.csv`` -- OpenMS rows that were also
      found by BICEPS and whose mass difference is >= ``--min_diff``
    * ``<name>_provean.csv`` -- written by ``write_to_provean_format``
    * ``<name>_SNPs_canprovar.csv`` -- variants that intersect the
      CanProVar table

    Finally ``SNPs_overview.csv`` with one summary row per file pair is
    written. Progress and per-file counts are printed to stdout.

    NOTE(review): this function uses Python 2 syntax (print statements,
    indexing ``dict.keys()``).
    NOTE(review): all paths are built by plain string concatenation, so the
    directory arguments presumably need a trailing path separator --
    confirm with the callers.
    NOTE(review): ``--out_aux`` defaults to ""; in that case
    ``os.makedirs("")`` is attempted -- confirm whether the option should
    be required.
    """
    # NOTE(review): ``prog`` and the usage text inside the description name
    # different scripts, and the usage text mentions --infile/--outfile,
    # which are not among the options defined below.
    parser = argparse.ArgumentParser(
        prog='bic_CSV.py',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent('''\
         ---------------------------------------------------------------------
         | Description:                                                      |
         ---------------------------------------------------------------------
         Command line tool to convert BICEPS output to a .csv like format.
         The default seperator are tabs ("\\t").

         Explanations for all columns can be found on the github page
         (http://buotex.github.io/BICEPS/doc/html/index.html).

         ---------------------------------------------------------------------
         | Usage:                                                            |
         ---------------------------------------------------------------------
         python BICEPS_TO_CSV.py --infile input.txt --outfile output.csv


         =====================================================================

                 '''))

    # Required inputs.
    parser.add_argument(
        '--in_oms',
        metavar='in_oms',
        type=str,
        help='OpenMS output directory of the TextExporter node.',
        required=True)
    parser.add_argument(
        '--in_bic',
        metavar='in_bic',
        type=str,
        help=
        'Input file/directory of the processed biceps results  (.mut files)',
        required=True)
    parser.add_argument('--in_db',
                        metavar='in_db',
                        type=str,
                        help='protein database used for the --in_bic file',
                        required=True)

    # Optional inputs; the string options default to "".
    parser.add_argument('--CPV_tab',
                        metavar='CPV_tab',
                        type=str,
                        help='Table derived from CanProVar.',
                        default="")
    parser.add_argument('--map_tab',
                        metavar='map_tab',
                        type=str,
                        help='Ensemble-Uniprot mapping data',
                        default="")
    parser.add_argument('--out_aux',
                        metavar='out_aux',
                        type=str,
                        help='Outputfile for auxillary data files',
                        default="")
    parser.add_argument('--min_diff',
                        metavar='min_diff',
                        type=float,
                        help='minimum mass difference between a AA mutation',
                        default=0.5)

    args = parser.parse_args()
    in_oms = args.in_oms
    in_bic = args.in_bic
    in_db = args.in_db
    CPV_tab = args.CPV_tab
    map_tab = args.map_tab
    out_ax = args.out_aux
    min_diff = args.min_diff

    # Echo the effective configuration.
    print ""
    print "###################################################################"
    print "# in_oms: %s" % in_oms
    print "# in_bic: %s" % in_bic
    print "# in_db: %s" % in_db
    print "#"
    print "# out dir: %s" % out_ax
    print "###################################################################"
    print ""

    # Create the output location on first use.
    if not os.path.exists(out_ax):
        print "%s created!" % out_ax
        os.makedirs(out_ax)

    #==========================================================================
    #  start analysis workflow
    #==========================================================================


#        in_oms = "/home/sven/data/BICEPS/HeLa/OMS_results2/TOPPAS_out/011-TextExporter-out/"
#        in_bic = "/home/sven/data/BICEPS/HeLa/results/"
#        in_db = '/home/sven/data/BICEPS/fasta/HS_reviewed_19092014.fasta'
#        CPV_tab = "/home/sven/data/BICEPS/var_dbs/Ensembl54_homo_cancer_dbSNP_variation_protein.fasta"
#        map_tab = "/home/sven/data/BICEPS/var_dbs/ensemble_uniprot_mapping_canprovar.tab"
#        out_ax = "/home/sven/data/BICEPS/HCT/results/HCT1/out/"

    # Load the reference data once, before looping over the result files.
    print "load fasta files...",
    canprovar_df = CanProVar_to_table(CPV_tab, out_ax)
    mapping_dictionary = uniprot_mapping(map_tab)
    uniprot_dic = BICl.read_uniprot_to_dic(in_db, mode="seq")
    print "DONE!"
    # Both lists are sorted and later paired by position with zip(); the
    # file sets presumably have to correspond one to one -- confirm.
    oms_files = sorted(glob.glob(in_oms + "SAAV_candidates*.csv"))
    bic_files = sorted(glob.glob(in_bic + "*.txt.mut"))

    # Accumulators across all file pairs. fdr_c, basename, snps_db and
    # bic_snps feed the final overview table; snp_id, snp_protein and
    # ensemble_protein_list are filled but not read again in this function.
    fdr_c = []
    basename = []
    snp_id = []
    snp_protein = []
    snps_db = []
    ensemble_protein_list = []
    bic_snps = []

    # Not used further in this function.
    oms_in_vardb = []
    print "CSV\tDB\tOMS_in_BIC\tOMS_input\tOMS_out\tOMS_unique"
    for oms_in, bic_in in zip(oms_files, bic_files):
        db_lists = []
        # data structures to store the mutations that are confirmed
        validated_identifier = []
        validated_sequence = []
        validated_tag = []
        validated_identifier_ens = []
        # One entry per OpenMS row, in row order; these three lists become
        # data frame columns further down.
        accepted_oms = []
        mass_diff = []
        temp_snps = []
        basename_oms = os.path.basename(oms_in)
        basename_bic = os.path.basename(bic_in)
        seq_dic, bic_mut_dic, bic_mut_tag_dic = read_biceps_log(bic_in)
        list_peps = []
        # Counter shared by both checks below: it is incremented for every
        # OpenMS row confirmed by BICEPS and again for every CanProVar
        # intersection.
        c = 0
        print basename_oms,
        print basename_bic,
        oms_results = pd.read_csv(oms_in, sep="\t")
        # Derived columns: the sequence without modifications and the
        # UniProt identifier with the "_MUT" suffix removed.
        oms_results["unmod_sequence"] = [
            oms.AASequence(seqi).toUnmodifiedString()
            for seqi in oms_results["sequence"]
        ]
        oms_results["single_uniprot"] = [
            BICl.get_uniprot(acci).replace("_MUT", "")
            for acci in oms_results["accessions"]
        ]

        for seqi, uniproti in zip(oms_results["unmod_sequence"],
                                  oms_results["accessions"]):
            # Accessions are ';'-separated; only entries containing at
            # least two '|' characters are considered.
            uniprot_ids_all = [
                BICl.get_uniprot(i).replace("_MUT", "")
                for i in uniproti.split(";") if i.count("|") >= 2
            ]
            # test if openms id in biceps results
            if seqi in bic_mut_dic:
                # Proteins reported by both tools for this peptide.
                intersection_oms = np.unique(
                    np.intersect1d(uniprot_ids_all, [
                        uniprot_ids_seqi
                        for uniprot_ids_seqi in bic_mut_dic[seqi].keys()
                    ]))
                if len(intersection_oms) >= 1:
                    c += 1
                    accepted_oms.append(True)

                    for item in intersection_oms:
                        tag = create_mutation_tag(item, seq_dic[seqi], seqi,
                                                  uniprot_dic)
                        validated_identifier.append(item)
                        validated_sequence.append(seqi)
                        validated_tag.append(tag)
                        # Proteins without a mapping entry get the
                        # placeholder "N.A".
                        # NOTE(review): the bare except hides every other
                        # error as well.
                        try:
                            validated_identifier_ens.append(
                                mapping_dictionary[item])
                        except:
                            validated_identifier_ens.append("N.A")
                    list_peps.append(seq_dic[seqi].keys())
                    # Mass difference against the first key of
                    # seq_dic[seqi]; indexing keys() requires Python 2.
                    mass_diff.append(
                        BICp.get_mass_diff(seqi, seq_dic[seqi].keys()[0]))
                else:
                    accepted_oms.append(False)
                    list_peps.append("")
                    mass_diff.append(0)
            else:
                # Peptide unknown to BICEPS: append placeholders so that the
                # per-row lists stay aligned with the data frame.
                accepted_oms.append(False)
                list_peps.append("")
                mass_diff.append(0)

            # test if openms variation DB
            for uniprot_idi in uniprot_ids_all:
                if uniprot_idi in bic_mut_tag_dic:
                    pass
                else:
                    # probably a mutation with 2 residues
                    continue
                if uniprot_idi in mapping_dictionary:
                    # The first mapping entry holds a comma-separated list
                    # of identifiers that are used as canprovar_df index
                    # labels.
                    ensemble_ids_temp = mapping_dictionary[uniprot_idi][
                        0].split(",")
                    for e_id in ensemble_ids_temp:
                        try:
                            # several SNPs: take the whole column as array
                            gt_snps = canprovar_df.loc[e_id][
                                "native_id"].values
                        except:
                            # Fallback when ``.values`` is unavailable --
                            # presumably a single matching row; the value
                            # is wrapped in a one-element list.
                            # NOTE(review): the length check below cannot
                            # trigger for a one-element list literal.
                            gt_snps = [canprovar_df.loc[e_id]["native_id"]]
                            if len(gt_snps) != 1:
                                sys.exit("Wrong with the number of snps...")
                        # compare to BICEPS results
                        intersection = np.intersect1d(
                            adjust_mutation_position(
                                bic_mut_tag_dic[uniprot_idi]), gt_snps)

                        if len(intersection) != 0:
                            db_lists.append(
                                (e_id, ";".join(intersection), uniprot_idi))
                            temp_snps.extend(np.hstack(intersection))
                            snp_id.extend(intersection)
                            snp_protein.append(uniprot_idi)
                            ensemble_protein_list.append(e_id)
                            c += 1
        # Per-file bookkeeping for the overview table.
        print c,
        snps_db.append(np.unique(temp_snps))
        fdr_c.append(c)
        basename.append(basename_oms)
        bic_snps.append(len(bic_mut_dic))

        oms_results["isBIC"] = accepted_oms
        oms_results["orig_seq"] = list_peps
        oms_results["mass_diff"] = mass_diff
        print oms_results.shape[0],
        # Keep rows confirmed by BICEPS whose mass difference reaches the
        # configured minimum.
        oms_filtered = oms_results[(oms_results["isBIC"] == True)
                                   & (oms_results["mass_diff"] >= min_diff)]
        print oms_filtered.shape[0],
        oms_filtered.to_csv(out_ax + basename_oms +
                            "_peptide_identifications.csv")
        print len(np.unique(oms_filtered["unmod_sequence"]))

        # create Provean data
        validated_df = pd.DataFrame()
        validated_df["identifier_uni"] = validated_identifier
        # Values of length 1 are replaced by their only element; everything
        # else (including the "N.A" placeholder) is kept unchanged.
        validated_df["identifier_ens"] = [
            i[0] if len(i) == 1 else i for i in validated_identifier_ens
        ]
        validated_df["sequence"] = validated_sequence
        validated_df["tag"] = validated_tag
        # NOTE(review): DataFrame.sort was removed from newer pandas
        # releases in favour of sort_values -- confirm the targeted version.
        validated_df.sort("identifier_uni", inplace=True)
        validated_df.drop_duplicates(inplace=True)
        write_to_provean_format(validated_df,
                                out_ax + basename_oms + "_provean.csv",
                                mass_filter=min_diff)

        # database variants
        df_vaars = pd.DataFrame()
        df_vaars["ensembl_id"] = [i[0] for i in db_lists]
        df_vaars["mutation_tags"] = [i[1] for i in db_lists]
        df_vaars["uniprot"] = [i[2] for i in db_lists]
        df_vaars = df_vaars.drop_duplicates()
        df_vaars.to_csv(out_ax + basename_oms + "_SNPs_canprovar.csv",
                        sep="\t")

    # summary data frame for all files
    # NOTE(review): zip() above stops at the shorter file list, while the
    # "OMS" and "BIC" columns use the full lists -- the column lengths only
    # agree when both globs return the same number of files.
    res_df = pd.DataFrame()
    res_df["SNPs_OMS"] = fdr_c
    res_df["SNPs_DB"] = [len(i) for i in snps_db]
    res_df["SNPs_BIC"] = bic_snps
    res_df["OMS_base"] = basename
    res_df["OMS"] = oms_files
    res_df["BIC"] = bic_files
    res_df.to_csv(out_ax + "SNPs_overview.csv", sep="\t")
Ejemplo n.º 9
0
 def score(self, phit, spectrum):
     """Score a peptide hit that carries exactly one phospho annotation.

     The number of "Phospho" occurrences in the hit's sequence string is
     counted. With exactly one occurrence the call is forwarded to
     ``simplisticBinnedScoring``; with any other count the placeholder
     result ``[-1, pyopenms.AASequence()]`` is returned.
     """
     sequence_text = phit.getSequence().toString()
     phospho_count = sequence_text.count("Phospho")
     if phospho_count == 1:
         return self.simplisticBinnedScoring(phit, spectrum)
     # Zero or several sites: nothing is scored.
     return [-1, pyopenms.AASequence()]