def testAAS():
    """
    @tests:
     AASequence.__init__
     AASequence.__add__
     AASequence.__radd__
     AASequence.__iadd__
     AASequence.getCTerminalModification
     AASequence.getNTerminalModification
     AASequence.setCTerminalModification
     AASequence.setModification
     AASequence.setNTerminalModification
     AASequence.setStringSequence
     AASequence.toString
     AASequence.toUnmodifiedString
    """
    # Smoke test: the operators and the docstring attribute must be usable on
    # a default-constructed sequence (results are intentionally discarded).
    empty_seq = pyopenms.AASequence()
    empty_seq + empty_seq
    empty_seq += empty_seq
    empty_seq.__doc__

    # A sequence built from a plain string reports empty terminal
    # modifications.
    peptide = pyopenms.AASequence("DFPIANGER")
    for read_modification in (peptide.getCTerminalModification,
                              peptide.getNTerminalModification):
        assert read_modification() == ""

    # Reset both termini and the residue string to the empty string ...
    for assign_empty in (peptide.setCTerminalModification,
                         peptide.setNTerminalModification,
                         peptide.setStringSequence):
        assign_empty("")

    # ... after which both string renderings must be empty as well.
    for render in (peptide.toString, peptide.toUnmodifiedString):
        assert render() == ""
def testPeptideHit():
    """
    @tests:
     PeptideHit.__init__
     PeptideHit.addProteinAccession
     PeptideHit.clearMetaInfo
     PeptideHit.getAAAfter
     PeptideHit.getAABefore
     PeptideHit.getKeys
     PeptideHit.getMetaValue
     PeptideHit.getProteinAccessions
     PeptideHit.getRank
     PeptideHit.getScore
     PeptideHit.getSequence
     PeptideHit.isMetaEmpty
     PeptideHit.metaValueExists
     PeptideHit.removeMetaValue
     PeptideHit.setAAAfter
     PeptideHit.setAABefore
     PeptideHit.setCharge
     PeptideHit.setMetaValue
     PeptideHit.setProteinAccessions
     PeptideHit.setRank
     PeptideHit.setScore
     PeptideHit.setSequence
     PeptideHit.__eq__
     PeptideHit.__ge__
     PeptideHit.__gt__
     PeptideHit.__le__
     PeptideHit.__lt__
     PeptideHit.__ne__
    """
    # A default-constructed hit compares equal to itself.
    default_hit = pyopenms.PeptideHit()
    assert default_hit == default_hit
    assert not default_hit != default_hit

    # Fully specified hit: score 1.0, rank 1, third argument 0, sequence "A".
    hit = pyopenms.PeptideHit(1.0, 1, 0, pyopenms.AASequence("A"))
    _testMetaInfoInterface(hit)

    # Values handed to the constructor / adder are readable again.
    hit.addProteinAccession("A")
    assert hit.getProteinAccessions() == ["A"]
    assert hit.getScore() == 1.0
    assert hit.getRank() == 1
    assert hit.getSequence().toString() == "A"

    # Every setter is immediately followed by a check of its getter.
    hit.setScore(2.0)
    assert hit.getScore() == 2.0

    hit.setRank(30)
    assert hit.getRank() == 30

    hit.setSequence(pyopenms.AASequence("AAA"))
    assert hit.getSequence().toString() == "AAA"

    hit.setAABefore('B')
    assert hit.getAABefore() == "B"

    hit.setAAAfter('C')
    assert hit.getAAAfter() == 'C'

    # Self-equality still holds after all the mutations above.
    assert hit == hit
    assert not hit != hit
def get_mass_diff(seq1, seq2):
    """
    Return the absolute difference of the monoisotopic weights of two
    peptide sequences.

    Both sequences are parsed with ``oms.AASequence`` and weighed with
    ``getMonoWeight(0, 0)`` (full peptide, charge = 0).
    """
    mono_weights = [
        oms.AASequence(peptide).getMonoWeight(0, 0)
        for peptide in (seq1, seq2)
    ]
    return np.abs(mono_weights[0] - mono_weights[1])
def annotate(spectra, identifications):
    """Attach peptide and modification information to identified spectra.

    Spectra are numbered starting at 1 in iteration order; a spectrum is
    annotated only when its number is a key of ``identifications``.
    Spectra without an identification are left untouched.

    Parameters
    ----------
    spectra : iterable
        Spectrum objects; annotated ones receive the attributes ``peptide``
        (unmodified sequence string) and ``modifications`` (dict mapping the
        0-based residue index to the modification's mono-isotopic mass
        difference).
    identifications : mapping
        Maps the 1-based spectrum number to a peptide string that
        ``pyopenms.AASequence.fromString`` can parse.

    Returns
    -------
    The ``spectra`` argument itself (objects are modified in place).
    """
    # TODO: Currently idXML returns index based spectrum reference instead of
    # title based spectrum reference
    for spectrum_index, spectrum in enumerate(spectra, 1):
        if spectrum_index not in identifications:
            # TODO: what if no ids was found?
            continue

        sequence = pyopenms.AASequence.fromString(
            identifications[spectrum_index])

        modifications = {}
        for residue_index in range(sequence.size()):
            residue = sequence.getResidue(residue_index)
            if residue.isModified():
                modifications[residue_index] = (
                    residue.getModification().getDiffMonoMass())

        # Depending on the pyOpenMS version the string comes back as bytes
        # or as str; only bytes need (and support) decoding.
        peptide = sequence.toUnmodifiedString()
        if isinstance(peptide, bytes):
            peptide = peptide.decode()

        spectrum.peptide = peptide
        spectrum.modifications = modifications
    return spectra
def simplisticBinnedScoring(self, phit, spectrum):
    """Simplistic phospho-scoring of a spectrum against a peptide hit.

    This function enumerates all possible locations for the phosphorylation
    (every S, T or Y of the unmodified sequence) and computes a similarity
    score between the binned experimental spectrum and the binned
    theoretical b/y-ion spectrum (charge 1) for each possibility.

    Parameters
    ----------
    phit : peptide hit whose ``getSequence().toString()`` yields the
        (possibly phosphorylated) sequence string.
    spectrum : experimental spectrum, passed to ``self.binSpectrum``.

    Returns
    -------
    list
        ``[score, AASequence]`` of the best scoring candidate; the first
        candidate wins on ties.  If the sequence has no S/T/Y residue the
        same "no result" value as in ``score`` is returned:
        ``[-1, pyopenms.AASequence()]``.
    """
    seq = phit.getSequence().toString().replace("(Phospho)", "")

    site_positions = [m.start() for m in re.finditer("[STY]", seq)]
    if not site_positions:
        # Nothing to enumerate; avoid an IndexError on the empty list below.
        return [-1, pyopenms.AASequence()]

    spectrum_b = self.binSpectrum(spectrum)
    charge = 1

    # The location of the residue type enum differs between releases.
    try:
        residue_types = pyopenms.Residue.ResidueType
    except AttributeError:
        # 1.11
        residue_types = pyopenms.ResidueType

    spectrum_generator = pyopenms.TheoreticalSpectrumGenerator()
    possibilities = []

    # Iterate over all possible phosphosites
    for position in site_positions:
        # Insert the tag directly behind the candidate residue and continue
        # with the residue that follows it.
        new_sequence = (seq[:position + 1] + "(Phospho)"
                        + seq[position + 1:])
        new_aaseq = pyopenms.AASequence(new_sequence)

        # Generate theoretical spectrum
        rs = pyopenms.RichMSSpectrum()
        spectrum_generator.addPeaks(rs, new_aaseq, residue_types.YIon, charge)
        spectrum_generator.addPeaks(rs, new_aaseq, residue_types.BIon, charge)
        theor_b = self.binSpectrum(convertToMSSpectrum(rs))

        # Compare theoretical spectrum to experimental spectrum
        comp_score = self.compare_binnedSpectra(spectrum_b, theor_b)
        possibilities.append([comp_score, new_aaseq])

    # Return the best scoring result; max() keeps the first one on ties,
    # exactly like a stable descending sort followed by [0].
    return max(possibilities, key=lambda candidate: candidate[0])
def test_spectrum(self):
    """Check the b/y ion series scores of DIAScoring on a synthetic spectrum.

    The spectrum holds peaks for naked b and y ions of "SYVAWDR" plus one
    phospho-shifted b ion and one phospho-shifted y ion.  The scores are
    asserted for the unmodified peptide and again after the residue at
    index 1 has been set to "Phospho".
    """
    phospho_shift = 79.9657

    # four of the naked b/y ions
    # as well as one of the modified b and y ions ion each
    mz = [
        350.17164,                  # b
        421.20875,                  # b
        421.20875 + phospho_shift,  # b + P
        547.26291,                  # y
        646.33133,                  # y
        809.39466 + phospho_shift,  # y + P
    ]
    # NOTE(review): seven intensities are supplied for six m/z values --
    # confirm whether the extra entry is intentional.
    intensity = [100] * 7

    spectrum = pyopenms.Spectrum()
    spectrum.setMZArray(mz)
    spectrum.setIntensityArray(intensity)

    diascoring = pyopenms.DIAScoring()
    # here we use a large enough window so that none of our peaks falls out
    diascoring.set_dia_parameters(0.05, False, 30, 50, 4, 4)

    peptide = pyopenms.AASequence("SYVAWDR")
    charge = 1

    b_score, y_score = diascoring.dia_by_ion_score(
        spectrum, peptide, charge, 0.0, 0.0)
    self.assertAlmostEqual(b_score, 2.0)
    self.assertAlmostEqual(y_score, 2.0)

    # now add a modification to the sequence
    peptide.setModification(1, "Phospho")  # modify the Y

    b_score, y_score = diascoring.dia_by_ion_score(
        spectrum, peptide, 1, 0, 0)
    self.assertAlmostEqual(b_score, 1.0)
    self.assertAlmostEqual(y_score, 3.0)
def testPeptideIdentification():
    """
    @tests:
     PeptideIdentification.__init__
     PeptideIdentification.assignRanks
     PeptideIdentification.clearMetaInfo
     PeptideIdentification.empty
     PeptideIdentification.getHits
     PeptideIdentification.getIdentifier
     PeptideIdentification.getKeys
     PeptideIdentification.getMetaValue
     PeptideIdentification.getNonReferencingHits
     PeptideIdentification.getReferencingHits
     PeptideIdentification.getScoreType
     PeptideIdentification.getSignificanceThreshold
     PeptideIdentification.insertHit
     PeptideIdentification.isHigherScoreBetter
     PeptideIdentification.isMetaEmpty
     PeptideIdentification.metaValueExists
     PeptideIdentification.removeMetaValue
     PeptideIdentification.setHigherScoreBetter
     PeptideIdentification.setHits
     PeptideIdentification.setIdentifier
     PeptideIdentification.setMetaValue
     PeptideIdentification.setScoreType
     PeptideIdentification.sort
     PeptideIdentification.__eq__
     PeptideIdentification.__ge__
     PeptideIdentification.__gt__
     PeptideIdentification.__le__
     PeptideIdentification.__lt__
     PeptideIdentification.__ne__
    """

    def expect_single_reference_hit(hits):
        # Exactly one hit is expected, carrying the values of the peptide
        # hit inserted below (sequence "A", score 1.0, rank 1).
        only_hit, = hits
        assert only_hit.getSequence().toString() == "A"
        assert only_hit.getScore() == 1.0
        assert only_hit.getRank() == 1

    pep_id = pyopenms.PeptideIdentification()
    _testMetaInfoInterface(pep_id)
    assert pep_id == pep_id
    assert not pep_id != pep_id

    # insertHit and setHits must both leave exactly the given hit behind.
    peptide_hit = pyopenms.PeptideHit(1.0, 1, 0, pyopenms.AASequence("A"))
    pep_id.insertHit(peptide_hit)
    stored_hit, = pep_id.getHits()
    assert stored_hit == peptide_hit

    pep_id.setHits([peptide_hit])
    stored_hit, = pep_id.getHits()
    assert stored_hit == peptide_hit

    # Type checks on the simple accessors, interleaved with their setters.
    assert isinstance(pep_id.getSignificanceThreshold(), float)
    assert isinstance(pep_id.getScoreType(), str)
    pep_id.setScoreType("A")
    assert isinstance(pep_id.isHigherScoreBetter(), int)
    assert isinstance(pep_id.getIdentifier(), str)
    pep_id.setIdentifier("id")
    pep_id.assignRanks()
    pep_id.sort()
    assert not pep_id.empty()

    # Query with a single accession string, then with a list of accessions;
    # each round starts from a fresh result list.
    for accession_query in ("A", ["A"]):
        rv = []
        pep_id.getReferencingHits(accession_query, rv)
        assert rv == []
        pep_id.getNonReferencingHits(accession_query, rv)
        expect_single_reference_hit(rv)

    # Query with a list of ProteinHit objects.  rv is deliberately NOT reset
    # before the first call here: it still holds the result of the previous
    # round when it is passed in.
    protein_hit = pyopenms.ProteinHit()
    pep_id.getReferencingHits([protein_hit], rv)
    expect_single_reference_hit(rv)

    rv = []
    pep_id.getNonReferencingHits([protein_hit], rv)
    expect_single_reference_hit(rv)
def main():
    """Cross-check OpenMS SAAV candidates with BICEPS results and CanProVar.

    Command line arguments (see ``--help``): ``--in_oms``, ``--in_bic`` and
    ``--in_db`` are required; ``--CPV_tab``, ``--map_tab``, ``--out_aux`` and
    ``--min_diff`` are optional.

    The sorted ``SAAV_candidates*.csv`` files found in ``--in_oms`` are paired
    in order with the sorted ``*.txt.mut`` files found in ``--in_bic``.  For
    every pair the following files are written below ``--out_aux``:

    * ``<oms file>_peptide_identifications.csv`` -- OpenMS rows that are also
      present in the BICEPS results and whose mass difference is at least
      ``--min_diff``
    * ``<oms file>_provean.csv`` -- written by ``write_to_provean_format``
    * ``<oms file>_SNPs_canprovar.csv`` -- tab separated mutation tags that
      were also found in the CanProVar table

    Finally a tab separated per-file summary ``SNPs_overview.csv`` is
    written.  Progress is reported on stdout, one line per file pair.

    The process exits via ``sys.exit`` when the number of OpenMS and BICEPS
    files differs or when no SNP is returned for an Ensembl id.
    """
    parser = argparse.ArgumentParser(
        prog='bic_CSV.py',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent('''\
            ---------------------------------------------------------------------
            | Description:                                                      |
            ---------------------------------------------------------------------
            Command line tool to convert BICEPS output to a .csv like format.
            The default seperator are tabs ("\\t"). Explanations for all columns can
            be found on the github page
            (http://buotex.github.io/BICEPS/doc/html/index.html).
            ---------------------------------------------------------------------
            | Usage:                                                            |
            ---------------------------------------------------------------------
            python BICEPS_TO_CSV.py --infile input.txt --outfile output.csv
            =====================================================================
            '''))
    parser.add_argument(
        '--in_oms', metavar='in_oms', type=str,
        help='OpenMS output directory of the TextExporter node.',
        required=True)
    parser.add_argument(
        '--in_bic', metavar='in_bic', type=str,
        help='Input file/directory of the processed biceps results (.mut files)',
        required=True)
    parser.add_argument(
        '--in_db', metavar='in_db', type=str,
        help='protein database used for the --in_bic file',
        required=True)
    parser.add_argument(
        '--CPV_tab', metavar='CPV_tab', type=str,
        help='Table derived from CanProVar.', default="")
    parser.add_argument(
        '--map_tab', metavar='map_tab', type=str,
        help='Ensemble-Uniprot mapping data', default="")
    parser.add_argument(
        '--out_aux', metavar='out_aux', type=str,
        help='Outputfile for auxillary data files', default="")
    parser.add_argument(
        '--min_diff', metavar='min_diff', type=float,
        help='minimum mass difference between a AA mutation', default=0.5)
    args = parser.parse_args()

    in_oms = args.in_oms
    in_bic = args.in_bic
    in_db = args.in_db
    CPV_tab = args.CPV_tab
    map_tab = args.map_tab
    out_ax = args.out_aux
    min_diff = args.min_diff

    print("")
    print("###################################################################")
    print("# in_oms: %s" % in_oms)
    print("# in_bic: %s" % in_bic)
    print("# in_db: %s" % in_db)
    print("#")
    print("# out dir: %s" % out_ax)
    print("###################################################################")
    print("")

    # An empty --out_aux (the default) means "current directory"; calling
    # os.makedirs("") would raise.
    if out_ax and not os.path.exists(out_ax):
        print("%s created!" % out_ax)
        os.makedirs(out_ax)

    # ======================================================================
    # start analysis workflow
    # ======================================================================
    sys.stdout.write("load fasta files... ")
    sys.stdout.flush()
    canprovar_df = CanProVar_to_table(CPV_tab, out_ax)
    mapping_dictionary = uniprot_mapping(map_tab)
    uniprot_dic = BICl.read_uniprot_to_dic(in_db, mode="seq")
    print("DONE!")

    # os.path.join makes the directory arguments work with or without a
    # trailing path separator.
    oms_files = sorted(glob.glob(os.path.join(in_oms, "SAAV_candidates*.csv")))
    bic_files = sorted(glob.glob(os.path.join(in_bic, "*.txt.mut")))
    if len(oms_files) != len(bic_files):
        # The files are paired by position; a mismatch would pair the wrong
        # files and break the summary table at the very end of the run.
        sys.exit("Number of OpenMS files (%d) and BICEPS files (%d) differs."
                 % (len(oms_files), len(bic_files)))

    # per-file summary columns
    fdr_c = []
    basename = []
    snps_db = []
    bic_snps = []

    print("CSV\tDB\tOMS_in_BIC\tOMS_input\tOMS_out\tOMS_unique")
    for oms_in, bic_in in zip(oms_files, bic_files):
        db_lists = []
        # data structures to store the mutations that are confirmed
        validated_identifier = []
        validated_sequence = []
        validated_tag = []
        validated_identifier_ens = []
        # one entry per row of the OpenMS table
        accepted_oms = []
        list_peps = []
        mass_diff = []
        temp_snps = []
        c = 0

        basename_oms = os.path.basename(oms_in)
        basename_bic = os.path.basename(bic_in)
        seq_dic, bic_mut_dic, bic_mut_tag_dic = read_biceps_log(bic_in)

        sys.stdout.write("%s %s " % (basename_oms, basename_bic))
        sys.stdout.flush()

        oms_results = pd.read_csv(oms_in, sep="\t")
        oms_results["unmod_sequence"] = [
            oms.AASequence(seqi).toUnmodifiedString()
            for seqi in oms_results["sequence"]
        ]
        oms_results["single_uniprot"] = [
            BICl.get_uniprot(acci).replace("_MUT", "")
            for acci in oms_results["accessions"]
        ]

        for seqi, uniproti in zip(oms_results["unmod_sequence"],
                                  oms_results["accessions"]):
            uniprot_ids_all = [
                BICl.get_uniprot(i).replace("_MUT", "")
                for i in uniproti.split(";") if i.count("|") >= 2
            ]

            # test if openms id in biceps results
            if seqi in bic_mut_dic:
                # np.intersect1d already returns sorted, unique values
                intersection_oms = np.intersect1d(
                    uniprot_ids_all, list(bic_mut_dic[seqi]))
            else:
                intersection_oms = []

            if len(intersection_oms) >= 1:
                c += 1
                accepted_oms.append(True)
                for item in intersection_oms:
                    tag = create_mutation_tag(item, seq_dic[seqi], seqi,
                                              uniprot_dic)
                    validated_identifier.append(item)
                    validated_sequence.append(seqi)
                    validated_tag.append(tag)
                    try:
                        validated_identifier_ens.append(
                            mapping_dictionary[item])
                    except KeyError:
                        validated_identifier_ens.append("N.A")
                original_peptides = list(seq_dic[seqi])
                list_peps.append(original_peptides)
                mass_diff.append(
                    BICp.get_mass_diff(seqi, original_peptides[0]))
            else:
                accepted_oms.append(False)
                list_peps.append("")
                mass_diff.append(0)

            # test if openms variation DB
            for uniprot_idi in uniprot_ids_all:
                if uniprot_idi not in bic_mut_tag_dic:
                    # probably a mutation with 2 residues
                    continue
                if uniprot_idi not in mapping_dictionary:
                    continue
                ensemble_ids_temp = mapping_dictionary[uniprot_idi][0].split(",")
                for e_id in ensemble_ids_temp:
                    native_ids = canprovar_df.loc[e_id]["native_id"]
                    try:
                        # several table rows for this id: get all snps
                        gt_snps = native_ids.values
                    except AttributeError:
                        # a single row yields a scalar: wrap the one snp
                        gt_snps = [native_ids]
                    # Both branches above are valid results, so only an
                    # empty answer is an error.
                    if len(gt_snps) == 0:
                        sys.exit("Wrong with the number of snps...")

                    # compare to BICEPS results
                    intersection = np.intersect1d(
                        adjust_mutation_position(bic_mut_tag_dic[uniprot_idi]),
                        gt_snps)
                    if len(intersection) != 0:
                        db_lists.append(
                            (e_id, ";".join(intersection), uniprot_idi))
                        temp_snps.extend(np.hstack(intersection))
                        # NOTE(review): this database hit is added to the
                        # same counter as the accepted OpenMS rows -- confirm
                        # that mixing both counts is intended.
                        c += 1

        sys.stdout.write("%s " % c)
        snps_db.append(np.unique(temp_snps))
        fdr_c.append(c)
        basename.append(basename_oms)
        bic_snps.append(len(bic_mut_dic))

        oms_results["isBIC"] = accepted_oms
        oms_results["orig_seq"] = list_peps
        oms_results["mass_diff"] = mass_diff
        sys.stdout.write("%s " % oms_results.shape[0])

        oms_filtered = oms_results[
            oms_results["isBIC"].astype(bool)
            & (oms_results["mass_diff"] >= min_diff)]
        sys.stdout.write("%s " % oms_filtered.shape[0])
        oms_filtered.to_csv(os.path.join(
            out_ax, basename_oms + "_peptide_identifications.csv"))
        print(len(np.unique(oms_filtered["unmod_sequence"])))

        # create Provean data
        validated_df = pd.DataFrame()
        validated_df["identifier_uni"] = validated_identifier
        validated_df["identifier_ens"] = [
            i[0] if len(i) == 1 else i for i in validated_identifier_ens
        ]
        validated_df["sequence"] = validated_sequence
        validated_df["tag"] = validated_tag
        # DataFrame.sort() no longer exists in current pandas releases
        validated_df.sort_values("identifier_uni", inplace=True)
        validated_df.drop_duplicates(inplace=True)
        write_to_provean_format(
            validated_df,
            os.path.join(out_ax, basename_oms + "_provean.csv"),
            mass_filter=min_diff)

        # database variants
        df_vaars = pd.DataFrame()
        df_vaars["ensembl_id"] = [i[0] for i in db_lists]
        df_vaars["mutation_tags"] = [i[1] for i in db_lists]
        df_vaars["uniprot"] = [i[2] for i in db_lists]
        df_vaars = df_vaars.drop_duplicates()
        df_vaars.to_csv(
            os.path.join(out_ax, basename_oms + "_SNPs_canprovar.csv"),
            sep="\t")

    # summary data frame for all files
    res_df = pd.DataFrame()
    res_df["SNPs_OMS"] = fdr_c
    res_df["SNPs_DB"] = [len(i) for i in snps_db]
    res_df["SNPs_BIC"] = bic_snps
    res_df["OMS_base"] = basename
    res_df["OMS"] = oms_files
    res_df["BIC"] = bic_files
    res_df.to_csv(os.path.join(out_ax, "SNPs_overview.csv"), sep="\t")
def score(self, phit, spectrum):
    """Score a peptide hit that carries exactly one "Phospho" tag.

    Hits whose sequence string contains exactly one occurrence of "Phospho"
    are handed to ``self.simplisticBinnedScoring`` and its result is
    returned.  Every other hit yields ``[-1, pyopenms.AASequence()]``.
    """
    sequence_string = phit.getSequence().toString()
    if sequence_string.count("Phospho") == 1:
        return self.simplisticBinnedScoring(phit, spectrum)
    return [-1, pyopenms.AASequence()]