class TestMultiProcessingLog(unittest.TestCase):
    """Smoke tests for a ``ProteinDB`` loaded from the bundled ``smallDB.fasta``.

    NOTE(review): despite the class name, nothing here touches
    multiprocessing or logging; the name is kept so existing test
    selectors keep working.
    """

    def setUp(self):
        """Load ``../data/smallDB.fasta`` (relative to this file) and fetch entry P31946."""
        self.dirname = os.path.dirname(os.path.abspath(__file__))
        self.fastafile = os.path.join(self.dirname, "..", "data", 'smallDB.fasta')
        self.db = ProteinDB()
        self.db.readFasta(self.fastafile)
        self.P31946 = self.db.proteinDictionary['P31946']

    def test_readFasta(self):
        """Copy every parsed field of P31946 onto the test case and print it.

        NOTE(review): there are no assertions; this only fails when one of
        the attributes is missing from the parsed protein.
        """
        fields = (
            'code1',
            'code2',
            'modres',
            'ncbi_tax_id',
            'description',
            'sequence',
        )
        # Read everything first, then print, so a missing attribute fails
        # before any output is produced (same order as before).
        for name in fields:
            setattr(self, name, getattr(self.P31946, name))
        for name in fields:
            # Single-argument print() behaves identically on Python 2 and 3.
            print(getattr(self, name))

    def test_get_proteins_containing_peptide(self):
        # TODO: test with some peptides present and not present. Also shared peptides.
        pass

    def test_pseudoreverseDB(self):
        # TODO: pseudo-reverse and check a couple of reversed proteins.
        # Pseudo-reverse the DB twice, and check the DB is equal.
        pass
def writeDigestFile(fastafile, enzyme, minPepLength=5, missedCleavages=0):
    """Digest every protein of a FASTA file and write the peptides as TSV.

    The output path is ``fastafile`` with its extension replaced by
    ``_digest.tsv``. The file holds a header row
    (``protein_code``, ``sequence``) followed by one row per peptide,
    made of the protein's ``code2`` and the peptide.

    Args:
        fastafile: path handed to ``ProteinDB.readFasta``.
        enzyme: enzyme definition forwarded unchanged to ``protein.digest``.
        minPepLength: forwarded to ``protein.digest`` as ``minLength``.
        missedCleavages: forwarded to ``protein.digest`` as ``missedCleavages``.

    Side effects:
        Prints a progress line every 5000 proteins. If the output file
        cannot be opened, prints a message and exits with status 1.
    """
    digest = os.path.splitext(fastafile)[0] + '_digest.tsv'

    db = ProteinDB()
    db.readFasta(fastafile)

    try:
        # NOTE(review): the Python 3 csv docs recommend newline=''; it is
        # omitted so this function also stays runnable on Python 2.
        handle = open(digest, 'w')
    except (IOError, OSError):
        # The message previously referenced an undefined name (massfile),
        # which turned every open failure into a NameError.
        print("something went wrong while trying to write the file : %s" % (digest,))
        sys.exit(1)

    # The context manager guarantees the rows are flushed and the file closed.
    with handle:
        writer = csv.writer(handle, dialect='excel-tab')
        writer.writerow(['protein_code', 'sequence'])

        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins stored" % protein_cnt)
            peptides = protein.digest(
                enzyme,
                minLength=minPepLength,
                missedCleavages=missedCleavages,
            )
            for peptide in peptides:
                writer.writerow([protein.code2, peptide])
def main(argv):
    """Swap ``code1`` and ``code2`` of every protein and write a new FASTA file.

    ``argv[0]`` is the input FASTA path; the result is written to the same
    path with ``.new`` appended. When one of the two codes is empty it is
    first filled with the other one, so both end up identical.
    """
    source_path = argv[0]
    target_path = source_path + ".new"

    # Read fasta file
    database = ProteinDB()
    database.readFasta(source_path)

    for entry in database.proteinDictionary.values():
        first, second = entry.code1, entry.code2
        # An empty code borrows the value of its sibling before the swap.
        if not len(first):
            first = second
        if not len(second):
            second = first
        entry.code1, entry.code2 = second, first

    database.writeFastaFile(target_path, chunksize=-1, format="sp")
class TestUnitProtein(unittest.TestCase):
    """Unit tests for a single protein entry (P31946) read from ``smallDB.fasta``."""

    def setUp(self):
        """Load the test database and define the enzymes and expected peptides."""
        # FASTA record used by these tests:
        # >sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3
        # MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS
        # WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY
        # LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY
        # YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD
        # AGEGEN
        self.dirname = os.path.dirname(os.path.abspath(__file__))
        self.fastafile = os.path.join(self.dirname, "..", "data", 'smallDB.fasta')
        self.db = ProteinDB()
        self.db.readFasta(self.fastafile)
        self.P31946 = self.db.proteinDictionary['P31946']

        # Enzyme definitions in the dictionary form passed to protein.digest().
        self.trypsin = {'terminus': 'C', 'cleave': ['K', 'R'], 'exceptions': ['KP', 'RP']}
        self.Lys_N = {'terminus': 'N', 'cleave': ['K'], 'exceptions': []}

        # One peptide expected from each enzyme (and not from the other).
        self.pep_tryp1 = 'TAFDEAIAELDTLNEESYK'
        self.pep_Lys_N = 'KGDYFRYLSEVASGDN'

    def test_proteinWeight(self):
        # TODO: not implemented yet.
        pass

    def test_digest(self):
        """Each enzyme must yield its own reference peptide and not the other's."""
        self.P31946_peptides_trypsin = self.P31946.digest(self.trypsin)
        self.assertIn(
            self.pep_tryp1, self.P31946_peptides_trypsin,
            "A tryptic peptide has not been found after tryptic digestion of a protein!")
        self.assertNotIn(
            self.pep_Lys_N, self.P31946_peptides_trypsin,
            "A non tryptic peptide has been found after tryptic digestion of a protein!")

        # The two messages below used to say "tryptic digestion", which
        # pointed at the wrong enzyme when the Lys-N half of the test failed.
        self.P31946_peptides_Lys_N = self.P31946.digest(self.Lys_N)
        self.assertNotIn(
            self.pep_tryp1, self.P31946_peptides_Lys_N,
            "A tryptic peptide has been found after Lys-N digestion of a protein!")
        self.assertIn(
            self.pep_Lys_N, self.P31946_peptides_Lys_N,
            "A Lys-N peptide has not been found after Lys-N digestion of a protein!")
def writeDigestFile(fastafile, enzyme, minPepLength=5, missedCleavages=0):
    """Digest every protein of a FASTA file and write the peptides as TSV.

    The output path is ``fastafile`` with its extension replaced by
    ``_digest.tsv``. The file holds a header row
    (``protein_code``, ``sequence``) followed by one row per peptide,
    made of the protein's ``code2`` and the peptide.

    Args:
        fastafile: path handed to ``ProteinDB.readFasta``.
        enzyme: enzyme definition forwarded unchanged to ``protein.digest``.
        minPepLength: forwarded to ``protein.digest`` as ``minLength``.
        missedCleavages: forwarded to ``protein.digest`` as ``missedCleavages``.

    Side effects:
        Prints a progress line every 5000 proteins. If the output file
        cannot be opened, prints a message and exits with status 1.
    """
    digest = os.path.splitext(fastafile)[0] + '_digest.tsv'

    db = ProteinDB()
    db.readFasta(fastafile)

    try:
        handle = open(digest, 'w')
    except (IOError, OSError):
        # The message previously referenced an undefined name (massfile),
        # which turned every open failure into a NameError.
        print("something went wrong while trying to write the file : %s" % (digest,))
        sys.exit(1)

    # The context manager guarantees the rows are flushed and the file closed.
    with handle:
        writer = csv.writer(handle, dialect='excel-tab')
        writer.writerow(['protein_code', 'sequence'])

        # .values() and the single-argument print() calls run unchanged on
        # both Python 2 and Python 3.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins stored" % protein_cnt)
            peptides = protein.digest(
                enzyme,
                minLength=minPepLength,
                missedCleavages=missedCleavages,
            )
            for peptide in peptides:
                writer.writerow([protein.code2, peptide])
def setUp(self):
    """Load ``smallDB.fasta`` and prepare enzymes and reference peptides."""
    # FASTA record the fixture relies on:
    # >sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3
    # MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS
    # WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY
    # LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY
    # YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD
    # AGEGEN
    this_module = os.path.abspath(__file__)
    self.dirname = os.path.dirname(this_module)
    self.fastafile = os.path.join(self.dirname, "..", "data", 'smallDB.fasta')

    self.db = ProteinDB()
    self.db.readFasta(self.fastafile)
    proteins = self.db.proteinDictionary
    self.P31946 = proteins['P31946']

    # Enzyme definitions handed to protein.digest() by the tests.
    self.trypsin = {
        'terminus': 'C',
        'cleave': ['K', 'R'],
        'exceptions': ['KP', 'RP'],
    }
    self.Lys_N = {
        'terminus': 'N',
        'cleave': ['K'],
        'exceptions': [],
    }

    # Reference peptides, one per enzyme.
    self.pep_tryp1 = 'TAFDEAIAELDTLNEESYK'
    self.pep_Lys_N = 'KGDYFRYLSEVASGDN'
def main(argv):
    """Exchange the two protein codes of a FASTA file and save it as ``<input>.new``.

    ``argv[0]`` is the FASTA file to read. For every protein, an empty
    ``code1`` or ``code2`` is replaced by the other code, after which the
    two values are swapped. The database is then written with
    ``chunksize=-1`` and ``format="sp"``.
    """
    input_fasta = argv[0]
    output_fasta = input_fasta + ".new"

    # Read fasta file
    proteins_db = ProteinDB()
    proteins_db.readFasta(input_fasta)

    for record in proteins_db.proteinDictionary.values():
        primary = record.code1
        secondary = record.code2
        # Fill an empty code from its counterpart, code1 first.
        primary = secondary if len(primary) == 0 else primary
        secondary = primary if len(secondary) == 0 else secondary
        record.code1 = secondary
        record.code2 = primary

    proteins_db.writeFastaFile(output_fasta, chunksize=-1, format="sp")
def writeMassFile(fastafile, enzyme, abundances, dynrange, minPepLength=5, peptidesID=0, minPepProteinID=1):
    """Write a per-protein mass table and, optionally, a peptide intensity simulation.

    Two output paths are derived from ``fastafile`` (extension stripped):

    * ``<base>_mass.tsv`` (always written): one row per protein with its
      ``code2``, ``proteinWeight()``, number of digested peptides and the
      abundance looked up in ``abundances`` (``-1`` when absent).
    * ``<base>_simulation.tsv`` (only when ``dynrange`` is truthy): one row
      per (protein, peptide) pair for proteins with a non-negative
      abundance. The estimated intensity is the peptide's
      ``randomPepdist()`` value times the summed abundances of all
      proteins sharing that peptide.

    Afterwards the simulated peptides are ranked by decreasing intensity
    and consumed until ``peptidesID`` peptides have been counted; the
    number of proteins reaching / not reaching ``minPepProteinID``
    peptides is printed.

    Args:
        fastafile: path handed to ``ProteinDB.readFasta``.
        enzyme: enzyme definition forwarded unchanged to ``protein.digest``.
        abundances: container indexed by protein ``code2``; must support
            ``in``, ``len()`` and item lookup.
        dynrange: truthy to produce the simulation file.
        minPepLength: forwarded to ``protein.digest`` as ``minLength``.
        peptidesID: peptide budget for the identification count.
        minPepProteinID: peptides a protein needs to be counted as identified.

    Side effects:
        Prints progress every 5000 proteins and the final summary. Prints
        a message and exits with status 1 if an output file cannot be opened.
    """
    base = os.path.splitext(fastafile)[0]
    massfile = base + '_mass.tsv'
    dynfile = base + '_simulation.tsv'
    headers = ['protein_code', 'weight', 'num_of_peptides']
    headers_dyn = [
        'protein_code',
        'num_of_peptides',
        'peptide',
        'dyn_range',
        'protein_abundance',
        'pep_estimated_intensity',
        'peptide_shared_with_detected_proteins',
    ]
    # peptide sequence -> [simulated detectability, {protein: abundance}]
    peptidelib = {}

    db = ProteinDB()
    db.readFasta(fastafile)

    try:
        mass_handle = open(massfile, 'w')
    except (IOError, OSError):
        print("something went wrong while trying to write the file : %s" % (massfile,))
        sys.exit(1)

    dyn_handle = None
    writer_dyn = None
    all_peptides = []
    # try/finally closes (and therefore flushes) both files on every exit
    # path, including the sys.exit() below; they used to be left open.
    try:
        writer = csv.writer(mass_handle, dialect='excel-tab')
        if dynrange:
            try:
                dyn_handle = open(dynfile, 'w')
            except (IOError, OSError):
                print("something went wrong while trying to write the file : %s" % (dynfile,))
                sys.exit(1)
            writer_dyn = csv.writer(dyn_handle, dialect='excel-tab')

        # NOTE(review): the abundance column is written for every row even
        # when `abundances` is empty and the header omits it; kept as-is so
        # the file layout does not change. Confirm which side is intended.
        if len(abundances) > 0:
            headers.append('protein_abundance')
        writer.writerow(headers)
        if dynrange:
            writer_dyn.writerow(headers_dyn)

        # Pass 1: mass table, plus the peptide library when simulating.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins stored" % protein_cnt)
            peptides = protein.digest(enzyme, minLength=minPepLength)
            this_abundance = -1
            if protein.code2 in abundances:
                this_abundance = abundances[protein.code2]
            writer.writerow([protein.code2, protein.proteinWeight(), len(peptides), this_abundance])
            if dynrange and this_abundance > 0:
                for pep in peptides:
                    if pep not in peptidelib:
                        # Draw the detectability only once per peptide.
                        peptidelib[pep] = [randomPepdist(), {}]
                    peptidelib[pep][1][protein] = float(this_abundance)

        # Pass 2: one simulation row per (protein, peptide) pair.
        # peptidelib is only filled when dynrange is truthy, so writer_dyn
        # is never reached while it is still None.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins simulated" % protein_cnt)
            if protein.code2 not in abundances:
                continue
            this_abundance = abundances[protein.code2]
            if this_abundance < 0:
                continue
            peptides = protein.digest(enzyme, minLength=minPepLength)
            for pep in peptides:
                if pep not in peptidelib:
                    continue
                detectability, sharing = peptidelib[pep]
                intensity = detectability * sum(sharing.values())
                proteins_txt = ""
                if len(sharing) > 1:
                    proteins_txt = ",".join(str(other.code2) for other in sharing)
                wr_row = [protein.code2, len(peptides), pep, detectability,
                          this_abundance, intensity, proteins_txt]
                all_peptides.append(wr_row)
                writer_dyn.writerow(wr_row)
    finally:
        mass_handle.close()
        if dyn_handle is not None:
            dyn_handle.close()

    # Sort the peptides by estimated intensity (highest first, stable).
    all_peptides.sort(key=lambda row: -row[5])

    # Walk down the ranking until `peptidesID` peptides have been counted.
    # A peptide only counts once its protein has reached minPepProteinID
    # peptides; the peptide that reaches the threshold counts for all of
    # them at once, so the earlier ones are not lost.
    protein_peptides = {}
    pep2_cnt = 0
    for this_protcode, _, this_sequence, _, _, this_intensity, _ in all_peptides:
        if pep2_cnt >= peptidesID:
            break
        seen = protein_peptides.setdefault(this_protcode, {})
        seen[this_sequence] = this_intensity
        if len(seen) == minPepProteinID:
            pep2_cnt += minPepProteinID
        elif len(seen) > minPepProteinID:
            pep2_cnt += 1

    # Count proteins that reached the threshold, and those that did not.
    protein_2peps = sum(1 for seen in protein_peptides.values() if len(seen) >= minPepProteinID)
    protein_1pep = len(protein_peptides) - protein_2peps

    print("Number of proteins identified with at least %s peptides (given %s identified peptides) : %s" % (minPepProteinID, peptidesID, protein_2peps))
    print("Number of proteins identified with less than %s peptides (given %s identified peptides) : %s" % (minPepProteinID, peptidesID, protein_1pep))
def setUp(self):
    """Read ``../data/smallDB.fasta`` (relative to this file) and keep entry P31946."""
    this_module = os.path.abspath(__file__)
    self.dirname = os.path.dirname(this_module)
    self.fastafile = os.path.join(self.dirname, "..", "data", 'smallDB.fasta')

    self.db = ProteinDB()
    self.db.readFasta(self.fastafile)

    proteins = self.db.proteinDictionary
    self.P31946 = proteins['P31946']
def writeMassFile(fastafile, enzyme, abundances, dynrange, minPepLength=5, peptidesID=0, minPepProteinID=1):
    """Write a per-protein mass table and, optionally, a peptide intensity simulation.

    Two output paths are derived from ``fastafile`` (extension stripped):

    * ``<base>_mass.tsv`` (always written): one row per protein with its
      ``code2``, ``proteinWeight()``, number of digested peptides and the
      abundance looked up in ``abundances`` (``-1`` when absent).
    * ``<base>_simulation.tsv`` (only when ``dynrange`` is truthy): one row
      per (protein, peptide) pair for proteins with a non-negative
      abundance. The estimated intensity is the peptide's
      ``randomPepdist()`` value times the summed abundances of all
      proteins sharing that peptide.

    Afterwards the simulated peptides are ranked by decreasing intensity
    and consumed until ``peptidesID`` peptides have been counted; the
    number of proteins reaching / not reaching ``minPepProteinID``
    peptides is printed.

    Args:
        fastafile: path handed to ``ProteinDB.readFasta``.
        enzyme: enzyme definition forwarded unchanged to ``protein.digest``.
        abundances: container indexed by protein ``code2``; must support
            ``in``, ``len()`` and item lookup.
        dynrange: truthy to produce the simulation file.
        minPepLength: forwarded to ``protein.digest`` as ``minLength``.
        peptidesID: peptide budget for the identification count.
        minPepProteinID: peptides a protein needs to be counted as identified.

    Side effects:
        Prints progress every 5000 proteins and the final summary. Prints
        a message and exits with status 1 if an output file cannot be opened.
    """
    base = os.path.splitext(fastafile)[0]
    massfile = base + '_mass.tsv'
    dynfile = base + '_simulation.tsv'
    headers = ['protein_code', 'weight', 'num_of_peptides']
    headers_dyn = [
        'protein_code',
        'num_of_peptides',
        'peptide',
        'dyn_range',
        'protein_abundance',
        'pep_estimated_intensity',
        'peptide_shared_with_detected_proteins',
    ]
    # peptide sequence -> [simulated detectability, {protein: abundance}]
    peptidelib = {}

    db = ProteinDB()
    db.readFasta(fastafile)

    # Every print below is a single-argument print(...) and the dict
    # iteration uses .values(), so the function runs on Python 2 and 3.
    try:
        mass_handle = open(massfile, 'w')
    except (IOError, OSError):
        print("something went wrong while trying to write the file : %s" % (massfile,))
        sys.exit(1)

    dyn_handle = None
    writer_dyn = None
    all_peptides = []
    # try/finally closes (and therefore flushes) both files on every exit
    # path, including the sys.exit() below; they used to be left open.
    try:
        writer = csv.writer(mass_handle, dialect='excel-tab')
        if dynrange:
            try:
                dyn_handle = open(dynfile, 'w')
            except (IOError, OSError):
                print("something went wrong while trying to write the file : %s" % (dynfile,))
                sys.exit(1)
            writer_dyn = csv.writer(dyn_handle, dialect='excel-tab')

        # NOTE(review): the abundance column is written for every row even
        # when `abundances` is empty and the header omits it; kept as-is so
        # the file layout does not change. Confirm which side is intended.
        if len(abundances) > 0:
            headers.append('protein_abundance')
        writer.writerow(headers)
        if dynrange:
            writer_dyn.writerow(headers_dyn)

        # Pass 1: mass table, plus the peptide library when simulating.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins stored" % protein_cnt)
            peptides = protein.digest(enzyme, minLength=minPepLength)
            this_abundance = -1
            if protein.code2 in abundances:
                this_abundance = abundances[protein.code2]
            writer.writerow([protein.code2, protein.proteinWeight(), len(peptides), this_abundance])
            if dynrange and this_abundance > 0:
                for pep in peptides:
                    if pep not in peptidelib:
                        # Draw the detectability only once per peptide.
                        peptidelib[pep] = [randomPepdist(), {}]
                    peptidelib[pep][1][protein] = float(this_abundance)

        # Pass 2: one simulation row per (protein, peptide) pair.
        # peptidelib is only filled when dynrange is truthy, so writer_dyn
        # is never reached while it is still None.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins simulated" % protein_cnt)
            if protein.code2 not in abundances:
                continue
            this_abundance = abundances[protein.code2]
            if this_abundance < 0:
                continue
            peptides = protein.digest(enzyme, minLength=minPepLength)
            for pep in peptides:
                if pep not in peptidelib:
                    continue
                detectability, sharing = peptidelib[pep]
                intensity = detectability * sum(sharing.values())
                proteins_txt = ""
                if len(sharing) > 1:
                    proteins_txt = ",".join(str(other.code2) for other in sharing)
                wr_row = [protein.code2, len(peptides), pep, detectability,
                          this_abundance, intensity, proteins_txt]
                all_peptides.append(wr_row)
                writer_dyn.writerow(wr_row)
    finally:
        mass_handle.close()
        if dyn_handle is not None:
            dyn_handle.close()

    # Sort the peptides by estimated intensity (highest first, stable).
    all_peptides.sort(key=lambda row: -row[5])

    # Walk down the ranking until `peptidesID` peptides have been counted.
    # A peptide only counts once its protein has reached minPepProteinID
    # peptides; the peptide that reaches the threshold counts for all of
    # them at once, so the earlier ones are not lost.
    protein_peptides = {}
    pep2_cnt = 0
    for this_protcode, _, this_sequence, _, _, this_intensity, _ in all_peptides:
        if pep2_cnt >= peptidesID:
            break
        seen = protein_peptides.setdefault(this_protcode, {})
        seen[this_sequence] = this_intensity
        if len(seen) == minPepProteinID:
            pep2_cnt += minPepProteinID
        elif len(seen) > minPepProteinID:
            pep2_cnt += 1

    # Count proteins that reached the threshold, and those that did not.
    protein_2peps = sum(1 for seen in protein_peptides.values() if len(seen) >= minPepProteinID)
    protein_1pep = len(protein_peptides) - protein_2peps

    print("Number of proteins identified with at least %s peptides (given %s identified peptides) : %s" % (minPepProteinID, peptidesID, protein_2peps))
    print("Number of proteins identified with less than %s peptides (given %s identified peptides) : %s" % (minPepProteinID, peptidesID, protein_1pep))