Esempio n. 1
0
class TestMultiProcessingLog(unittest.TestCase):
    """Smoke tests for loading a FASTA file into a ``ProteinDB``.

    The fixture is ``../data/smallDB.fasta`` relative to this test module.

    NOTE(review): the class name mentions a multiprocessing log, but the
    tests below only exercise ``ProteinDB`` -- confirm the name is intended.
    """

    def setUp(self):
        """Read the fixture FASTA file and look up the ``P31946`` entry."""
        self.dirname = os.path.dirname(os.path.abspath(__file__))
        self.fastafile = os.path.join(self.dirname, "..", "data", 'smallDB.fasta')
        self.db = ProteinDB()
        self.db.readFasta(self.fastafile)
        self.P31946 = self.db.proteinDictionary['P31946']

    def test_readFasta(self):
        """Read the parsed fields of ``P31946`` and print them.

        NOTE(review): nothing is asserted; the test only fails when one of
        the attributes is missing on the protein object.
        """
        self.code1 = self.P31946.code1
        self.code2 = self.P31946.code2
        self.modres = self.P31946.modres
        self.ncbi_tax_id = self.P31946.ncbi_tax_id
        self.description = self.P31946.description
        self.sequence = self.P31946.sequence

        # Single-argument print(...) gives the same output whether it is
        # parsed as the Python 2 print statement or the Python 3 function.
        print(self.code1)
        print(self.code2)
        print(self.modres)
        print(self.ncbi_tax_id)
        print(self.description)
        print(self.sequence)

    def test_get_proteins_containing_peptide(self):
        """Placeholder, not implemented yet."""
        # TODO: test with some peptides present and not present.
        #       Also shared peptides.
        pass

    def test_pseudoreverseDB(self):
        """Placeholder, not implemented yet."""
        # TODO: pseudo-reverse and check a couple of reversed proteins.
        #       Pseudo-reverse the DB twice, and check the DB is equal.
        pass
Esempio n. 2
0
def writeDigestFile(fastafile, enzyme, minPepLength=5, missedCleavages=0):
    """Digest every protein of a FASTA file and write the peptides as TSV.

    The output path is the FASTA path with its extension replaced by
    ``_digest.tsv``. The file uses the csv ``excel-tab`` dialect and holds a
    header row (``protein_code``, ``sequence``) followed by one row per
    peptide: ``protein.code2`` and the peptide itself.

    Args:
        fastafile: path of the FASTA file, loaded with ``ProteinDB.readFasta``.
        enzyme: enzyme definition, passed unchanged to ``protein.digest``.
        minPepLength: forwarded to ``protein.digest`` as ``minLength``.
        missedCleavages: forwarded to ``protein.digest``.

    Exits the process with status 1, after printing a message, when the
    output file cannot be opened.
    """
    digest = os.path.splitext(fastafile)[0] + '_digest.tsv'
    headers = ['protein_code', 'sequence']

    db = ProteinDB()
    db.readFasta(fastafile)

    try:
        outfile = open(digest, 'w')
    except (IOError, OSError):
        # Report the path that actually failed to open.
        print("something went wrong while trying to write the file : %s" % digest)
        sys.exit(1)

    # The context manager guarantees the rows are flushed and the handle is
    # released even if a digest call raises half-way through.
    with outfile:
        writer = csv.writer(outfile, dialect='excel-tab')
        writer.writerow(headers)
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            # Progress report every 5000 proteins.
            if protein_cnt % 5000 == 0:
                print("%s proteins stored" % protein_cnt)
            peptides = protein.digest(enzyme,
                                      minLength=minPepLength,
                                      missedCleavages=missedCleavages)
            for peptide in peptides:
                writer.writerow([protein.code2, peptide])
def main(argv):
    """Swap ``code1`` and ``code2`` of every protein and save a new FASTA.

    ``argv[0]`` is the input FASTA path; the result is written to that same
    path with ``.new`` appended, using ``chunksize=-1`` and ``format="sp"``.
    """
    source_path = argv[0]
    target_path = source_path + ".new"

    # Load the input database.
    database = ProteinDB()
    database.readFasta(source_path)

    for entry in database.proteinDictionary.values():
        first, second = entry.code1, entry.code2
        # An empty code borrows the value of the other one before the swap.
        if not len(first):
            first = second
        if not len(second):
            second = first
        entry.code1, entry.code2 = second, first

    database.writeFastaFile(target_path, chunksize=-1, format="sp")
Esempio n. 4
0
class TestUnitProtein(unittest.TestCase):
    """Tests for a single protein entry (``P31946``) of the fixture database."""

    def setUp(self):
        """Load the fixture FASTA file and define the enzymes and peptides used."""
        # Fixture entry as it appears in the FASTA file:
        #>sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3
        #MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS
        #WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY
        #LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY
        #YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD
        #AGEGEN
        self.dirname = os.path.dirname(os.path.abspath(__file__))
        self.fastafile = os.path.join(self.dirname, "..", "data", 'smallDB.fasta')
        self.db = ProteinDB()
        self.db.readFasta(self.fastafile)
        self.P31946 = self.db.proteinDictionary['P31946']
        # Enzyme definitions handed to ``digest``.
        # NOTE(review): presumably 'terminus' is the side of the cleavage
        # residue that gets cut and 'exceptions' are motifs that block
        # cleavage -- confirm against the digest implementation.
        self.trypsin = {'terminus': 'C', 'cleave': ['K', 'R'], 'exceptions': ['KP', 'RP']}
        self.Lys_N = {'terminus': 'N', 'cleave': ['K'], 'exceptions': []}
        # One peptide expected from each enzyme; each must be absent from
        # the other enzyme's digest.
        self.pep_tryp1 = 'TAFDEAIAELDTLNEESYK'
        self.pep_Lys_N = 'KGDYFRYLSEVASGDN'

    def test_proteinWeight(self):
        """Placeholder, not implemented yet."""
        pass

    def test_digest(self):
        """Check that each enzyme yields its own peptide and not the other's."""
        self.P31946_peptides_trypsin = self.P31946.digest(self.trypsin)

        self.assertIn(self.pep_tryp1, self.P31946_peptides_trypsin,
                      "A tryptic peptide has not been found after tryptic digestion of a protein!")
        self.assertNotIn(self.pep_Lys_N, self.P31946_peptides_trypsin,
                         "A non tryptic peptide has been found after tryptic digestion of a protein!")

        self.P31946_peptides_Lys_N = self.P31946.digest(self.Lys_N)
        # These two checks run against the Lys-N digest, so the failure
        # messages must name Lys-N rather than trypsin.
        self.assertNotIn(self.pep_tryp1, self.P31946_peptides_Lys_N,
                         "A tryptic peptide has been found after Lys-N digestion of a protein!")
        self.assertIn(self.pep_Lys_N, self.P31946_peptides_Lys_N,
                      "A Lys-N peptide has not been found after Lys-N digestion of a protein!")
def writeDigestFile(fastafile, enzyme, minPepLength=5, missedCleavages=0):
    """Digest every protein of a FASTA file and write the peptides as TSV.

    The output path is the FASTA path with its extension replaced by
    ``_digest.tsv``. The file uses the csv ``excel-tab`` dialect and holds a
    header row (``protein_code``, ``sequence``) followed by one row per
    peptide: ``protein.code2`` and the peptide itself.

    Args:
        fastafile: path of the FASTA file, loaded with ``ProteinDB.readFasta``.
        enzyme: enzyme definition, passed unchanged to ``protein.digest``.
        minPepLength: forwarded to ``protein.digest`` as ``minLength``.
        missedCleavages: forwarded to ``protein.digest``.

    Exits the process with status 1, after printing a message, when the
    output file cannot be opened.
    """
    digest = os.path.splitext(fastafile)[0] + '_digest.tsv'
    headers = ['protein_code', 'sequence']

    db = ProteinDB()
    db.readFasta(fastafile)

    try:
        outfile = open(digest, 'w')
    except (IOError, OSError):
        # Report the path that actually failed to open. A %-formatted,
        # single-argument print(...) emits the same text on Python 2 and 3.
        print("something went wrong while trying to write the file : %s" % digest)
        sys.exit(1)

    # Closing the handle deterministically makes sure every row reaches disk.
    with outfile:
        writer = csv.writer(outfile, dialect='excel-tab')
        writer.writerow(headers)
        # .values() exists on both Python 2 and 3 dictionaries; the key was
        # never used.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            # Progress report every 5000 proteins.
            if protein_cnt % 5000 == 0:
                print("%s proteins stored" % protein_cnt)
            peptides = protein.digest(enzyme,
                                      minLength=minPepLength,
                                      missedCleavages=missedCleavages)
            for peptide in peptides:
                writer.writerow([protein.code2, peptide])
Esempio n. 6
0
 def setUp(self):
     """Load the fixture database and define the enzymes and peptides used."""
     # Fixture entry as stored in the FASTA file:
     #>sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3
     #MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS
     #WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY
     #LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY
     #YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD
     #AGEGEN
     module_dir = os.path.dirname(os.path.abspath(__file__))
     self.dirname = module_dir
     self.fastafile = os.path.join(module_dir, "..", "data", 'smallDB.fasta')

     database = ProteinDB()
     self.db = database
     database.readFasta(self.fastafile)
     self.P31946 = database.proteinDictionary['P31946']

     # Enzyme definitions and one reference peptide for each of them.
     self.trypsin = {
         'terminus': 'C',
         'cleave': ['K', 'R'],
         'exceptions': ['KP', 'RP'],
     }
     self.Lys_N = {
         'terminus': 'N',
         'cleave': ['K'],
         'exceptions': [],
     }
     self.pep_tryp1 = 'TAFDEAIAELDTLNEESYK'
     self.pep_Lys_N = 'KGDYFRYLSEVASGDN'
Esempio n. 7
0
def main(argv):
    """Exchange the two codes of each protein and write the result to disk.

    The input FASTA path is taken from ``argv[0]``; the output goes to the
    same path with a ``.new`` suffix, written with ``chunksize=-1`` and
    ``format="sp"``.
    """
    fasta_in = argv[0]
    fasta_out = fasta_in + ".new"

    # Read the FASTA file.
    proteins = ProteinDB()
    proteins.readFasta(fasta_in)

    for record in proteins.proteinDictionary.values():
        primary, secondary = record.code1, record.code2
        # Fill an empty code from its counterpart before exchanging them.
        if not len(primary):
            primary = secondary
        if not len(secondary):
            secondary = primary
        record.code1, record.code2 = secondary, primary

    proteins.writeFastaFile(fasta_out, chunksize=-1, format="sp")
def writeMassFile(fastafile, enzyme, abundances, dynrange, minPepLength=5, peptidesID=0, minPepProteinID=1):
    """Write per-protein mass data and, optionally, simulated peptide intensities.

    Output paths are derived from ``fastafile`` with its extension removed:

    * ``<base>_mass.tsv`` -- one row per protein: ``protein.code2``,
      ``protein.proteinWeight()``, the number of digested peptides and, when
      ``protein.code2`` is a key of ``abundances``, its abundance.
    * ``<base>_simulation.tsv`` -- opened only when ``dynrange`` is truthy;
      one row per peptide with its ``randomPepdist()`` value and an
      estimated intensity (that value times the summed abundance of every
      protein sharing the peptide).

    After writing, the peptide rows are ranked by estimated intensity and
    walked from the top until ``peptidesID`` peptides have been counted; two
    summary lines about proteins reaching ``minPepProteinID`` peptides are
    then printed.

    Args:
        fastafile: path of the FASTA file, loaded with ``ProteinDB.readFasta``.
        enzyme: enzyme definition, passed unchanged to ``protein.digest``.
        abundances: mapping from ``protein.code2`` to abundance.
            NOTE(review): values are compared with ``0`` and passed to
            ``float()``; presumably numeric -- confirm against the caller.
        dynrange: truthy to produce the simulation file.
        minPepLength: forwarded to ``protein.digest`` as ``minLength``.
        peptidesID: peptide count at which the ranking walk stops.
        minPepProteinID: number of distinct peptides a protein needs to be
            reported in the "at least" summary line.

    Exits the process with status 1, after printing a message, when an
    output file cannot be opened.
    """
    base = os.path.splitext(fastafile)[0]
    massfile = base + '_mass.tsv'
    dynfile = base + '_simulation.tsv'
    headers = ['protein_code', 'weight', 'num_of_peptides']
    headers_dyn = ['protein_code', 'num_of_peptides', 'peptide', 'dyn_range',
                   'protein_abundance', 'pep_estimated_intensity',
                   'peptide_shared_with_detected_proteins']
    # peptide sequence -> [detectability, {protein object: abundance, ...}]
    peptidelib = {}

    db = ProteinDB()
    db.readFasta(fastafile)

    try:
        mass_handle = open(massfile, 'w')
    except (IOError, OSError):
        print("something went wrong while trying to write the file : %s" % massfile)
        sys.exit(1)
    dyn_handle = None
    if dynrange:
        try:
            dyn_handle = open(dynfile, 'w')
        except (IOError, OSError):
            mass_handle.close()
            print("something went wrong while trying to write the file : %s" % dynfile)
            sys.exit(1)

    all_peptides = []
    # try/finally closes both handles even if a digest call raises, so rows
    # already written are flushed to disk.
    try:
        writer = csv.writer(mass_handle, dialect='excel-tab')
        writer_dyn = None
        if dyn_handle is not None:
            writer_dyn = csv.writer(dyn_handle, dialect='excel-tab')

        if len(abundances) > 0:
            headers.append('protein_abundance')
        writer.writerow(headers)
        if dynrange:
            writer_dyn.writerow(headers_dyn)

        # Pass 1: write the mass table and collect, per peptide, the
        # proteins (with positive abundance) that contain it.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins stored" % protein_cnt)
            peptides = protein.digest(enzyme, minLength=minPepLength)
            wr_row = [protein.code2, protein.proteinWeight(), len(peptides)]
            this_abundance = -1
            if protein.code2 in abundances:
                this_abundance = abundances[protein.code2]
                wr_row.append(this_abundance)
            writer.writerow(wr_row)
            if dynrange and this_abundance > 0:
                for pep in peptides:
                    if pep not in peptidelib:
                        # Detectability is drawn once, on first sight.
                        peptidelib[pep] = [randomPepdist(), {protein: float(this_abundance)}]
                    else:
                        peptidelib[pep][1][protein] = float(this_abundance)

        # Pass 2: write one simulation row per known peptide of every
        # protein with a non-negative abundance.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins simulated" % protein_cnt)
            if protein.code2 not in abundances:
                continue
            this_abundance = abundances[protein.code2]
            if this_abundance < 0:
                continue
            peptides = protein.digest(enzyme, minLength=minPepLength)
            for pep in peptides:
                # peptidelib is only filled when dynrange is truthy, so
                # writer_dyn is always set whenever this point is passed.
                if pep not in peptidelib:
                    continue
                detectability, sharing = peptidelib[pep]
                intensity = detectability * sum(sharing.values())
                proteins_txt = ""
                if len(sharing) > 1:
                    proteins_txt = ",".join([str(i.code2) for i in sharing])
                wr_row = [protein.code2, len(peptides), pep, detectability,
                          this_abundance, intensity, proteins_txt]
                all_peptides.append(wr_row)
                writer_dyn.writerow(wr_row)
    finally:
        mass_handle.close()
        if dyn_handle is not None:
            dyn_handle.close()

    # Sort the peptides by estimated intensity, highest first
    # (column 5 of each row).
    all_peptides.sort(key=lambda row: -row[5])

    # protein code -> {peptide sequence: intensity}
    protein_peptides = {}
    # Counts peptides belonging to proteins that reached minPepProteinID
    # peptides: the peptide that reaches the threshold adds minPepProteinID
    # (so the earlier ones are included), each later one adds 1.
    # NOTE(review): the first peptide seen for a protein adds nothing, so
    # with minPepProteinID == 1 it is not counted -- confirm this is intended.
    pep2_cnt = 0
    for this_peptide in all_peptides:
        if pep2_cnt >= peptidesID:
            break
        this_protcode = this_peptide[0]
        this_sequence = this_peptide[2]
        this_intensity = this_peptide[5]
        if this_protcode not in protein_peptides:
            protein_peptides[this_protcode] = {this_sequence: this_intensity}
        else:
            protein_peptides[this_protcode][this_sequence] = this_intensity
            if len(protein_peptides[this_protcode]) == minPepProteinID:
                pep2_cnt += minPepProteinID
            elif len(protein_peptides[this_protcode]) > minPepProteinID:
                pep2_cnt += 1

    # Split the proteins seen during the walk by whether they reached
    # minPepProteinID distinct peptides.
    protein_2peps = 0
    protein_1pep = 0
    for found in protein_peptides.values():
        if len(found) >= minPepProteinID:
            protein_2peps += 1
        else:
            protein_1pep += 1

    print("Number of proteins identified with at least %s peptides (given %s identified peptides) : %s" % (minPepProteinID, peptidesID, protein_2peps))
    print("Number of proteins identified with less than %s peptides (given %s identified peptides) : %s" % (minPepProteinID, peptidesID, protein_1pep))
Esempio n. 9
0
 def setUp(self):
     """Read the fixture FASTA file and keep a handle on the P31946 entry."""
     module_dir = os.path.dirname(os.path.abspath(__file__))
     self.dirname = module_dir
     self.fastafile = os.path.join(module_dir, "..", "data", 'smallDB.fasta')

     database = ProteinDB()
     self.db = database
     database.readFasta(self.fastafile)
     self.P31946 = database.proteinDictionary['P31946']
def writeMassFile(fastafile,
                  enzyme,
                  abundances,
                  dynrange,
                  minPepLength=5,
                  peptidesID=0,
                  minPepProteinID=1):
    """Write per-protein mass data and, optionally, simulated peptide intensities.

    Output paths are derived from ``fastafile`` with its extension removed:

    * ``<base>_mass.tsv`` -- one row per protein: ``protein.code2``,
      ``protein.proteinWeight()``, the number of digested peptides and, when
      ``protein.code2`` is a key of ``abundances``, its abundance.
    * ``<base>_simulation.tsv`` -- opened only when ``dynrange`` is truthy;
      one row per peptide with its ``randomPepdist()`` value and an
      estimated intensity (that value times the summed abundance of every
      protein sharing the peptide).

    After writing, the peptide rows are ranked by estimated intensity and
    walked from the top until ``peptidesID`` peptides have been counted; two
    summary lines about proteins reaching ``minPepProteinID`` peptides are
    then printed.

    Args:
        fastafile: path of the FASTA file, loaded with ``ProteinDB.readFasta``.
        enzyme: enzyme definition, passed unchanged to ``protein.digest``.
        abundances: mapping from ``protein.code2`` to abundance.
            NOTE(review): values are compared with ``0`` and passed to
            ``float()``; presumably numeric -- confirm against the caller.
        dynrange: truthy to produce the simulation file.
        minPepLength: forwarded to ``protein.digest`` as ``minLength``.
        peptidesID: peptide count at which the ranking walk stops.
        minPepProteinID: number of distinct peptides a protein needs to be
            reported in the "at least" summary line.

    Exits the process with status 1, after printing a message, when an
    output file cannot be opened.
    """
    base = os.path.splitext(fastafile)[0]
    massfile = base + '_mass.tsv'
    dynfile = base + '_simulation.tsv'
    headers = ['protein_code', 'weight', 'num_of_peptides']
    headers_dyn = [
        'protein_code', 'num_of_peptides', 'peptide', 'dyn_range',
        'protein_abundance', 'pep_estimated_intensity',
        'peptide_shared_with_detected_proteins'
    ]
    # peptide sequence -> [detectability, {protein object: abundance, ...}]
    peptidelib = {}

    db = ProteinDB()
    db.readFasta(fastafile)

    # %-formatted, single-argument print(...) calls emit the same text on
    # Python 2 and Python 3.
    try:
        mass_handle = open(massfile, 'w')
    except (IOError, OSError):
        print("something went wrong while trying to write the file : %s" % massfile)
        sys.exit(1)
    dyn_handle = None
    if dynrange:
        try:
            dyn_handle = open(dynfile, 'w')
        except (IOError, OSError):
            mass_handle.close()
            print("something went wrong while trying to write the file : %s" % dynfile)
            sys.exit(1)

    all_peptides = []
    # try/finally closes both handles even if a digest call raises, so rows
    # already written are flushed to disk.
    try:
        writer = csv.writer(mass_handle, dialect='excel-tab')
        writer_dyn = None
        if dyn_handle is not None:
            writer_dyn = csv.writer(dyn_handle, dialect='excel-tab')

        if len(abundances) > 0:
            headers.append('protein_abundance')
        writer.writerow(headers)
        if dynrange:
            writer_dyn.writerow(headers_dyn)

        # Pass 1: write the mass table and collect, per peptide, the
        # proteins (with positive abundance) that contain it.
        # .values() works on both Python 2 and 3; the key was never used.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins stored" % protein_cnt)
            peptides = protein.digest(enzyme, minLength=minPepLength)
            wr_row = [protein.code2, protein.proteinWeight(), len(peptides)]
            this_abundance = -1
            if protein.code2 in abundances:
                this_abundance = abundances[protein.code2]
                wr_row.append(this_abundance)
            writer.writerow(wr_row)
            if dynrange and this_abundance > 0:
                for pep in peptides:
                    if pep not in peptidelib:
                        # Detectability is drawn once, on first sight.
                        peptidelib[pep] = [
                            randomPepdist(),
                            {protein: float(this_abundance)},
                        ]
                    else:
                        peptidelib[pep][1][protein] = float(this_abundance)

        # Pass 2: write one simulation row per known peptide of every
        # protein with a non-negative abundance.
        for protein_cnt, protein in enumerate(db.proteinDictionary.values(), 1):
            if protein_cnt % 5000 == 0:
                print("%s proteins simulated" % protein_cnt)
            if protein.code2 not in abundances:
                continue
            this_abundance = abundances[protein.code2]
            if this_abundance < 0:
                continue
            peptides = protein.digest(enzyme, minLength=minPepLength)
            for pep in peptides:
                # peptidelib is only filled when dynrange is truthy, so
                # writer_dyn is always set whenever this point is passed.
                if pep not in peptidelib:
                    continue
                detectability, sharing = peptidelib[pep]
                intensity = detectability * sum(sharing.values())
                proteins_txt = ""
                if len(sharing) > 1:
                    proteins_txt = ",".join([str(i.code2) for i in sharing])
                wr_row = [
                    protein.code2,
                    len(peptides), pep, detectability, this_abundance,
                    intensity, proteins_txt
                ]
                all_peptides.append(wr_row)
                writer_dyn.writerow(wr_row)
    finally:
        mass_handle.close()
        if dyn_handle is not None:
            dyn_handle.close()

    # Sort the peptides by estimated intensity, highest first
    # (column 5 of each row).
    all_peptides.sort(key=lambda row: -row[5])

    # protein code -> {peptide sequence: intensity}
    protein_peptides = {}
    # Counts peptides belonging to proteins that reached minPepProteinID
    # peptides: the peptide that reaches the threshold adds minPepProteinID
    # (so the earlier ones are included), each later one adds 1.
    # NOTE(review): the first peptide seen for a protein adds nothing, so
    # with minPepProteinID == 1 it is not counted -- confirm this is intended.
    pep2_cnt = 0
    for this_peptide in all_peptides:
        if pep2_cnt >= peptidesID:
            break
        this_protcode = this_peptide[0]
        this_sequence = this_peptide[2]
        this_intensity = this_peptide[5]
        if this_protcode not in protein_peptides:
            protein_peptides[this_protcode] = {this_sequence: this_intensity}
        else:
            protein_peptides[this_protcode][this_sequence] = this_intensity
            if len(protein_peptides[this_protcode]) == minPepProteinID:
                pep2_cnt += minPepProteinID
            elif len(protein_peptides[this_protcode]) > minPepProteinID:
                pep2_cnt += 1

    # Split the proteins seen during the walk by whether they reached
    # minPepProteinID distinct peptides.
    protein_2peps = 0
    protein_1pep = 0
    for found in protein_peptides.values():
        if len(found) >= minPepProteinID:
            protein_2peps += 1
        else:
            protein_1pep += 1

    print("Number of proteins identified with at least %s peptides (given %s identified peptides) : %s" % (
        minPepProteinID, peptidesID, protein_2peps))
    print("Number of proteins identified with less than %s peptides (given %s identified peptides) : %s" % (
        minPepProteinID, peptidesID, protein_1pep))