def open(self, file=None):
    """Read a FASTA file and pass its first record to ``insert_sequence``.

    Args:
        file: Path of the FASTA file to load. When falsy,
            ``askopenfilename()`` is called to obtain a path instead.

    Returns:
        None. Nothing is inserted when no path is obtained or when the
        reader yields no records.
    """
    if not file:
        file = askopenfilename()
    if not file:
        # No path supplied and none chosen: nothing to load.
        return
    genes = quick_FASTA_reader(file)
    if not genes:
        # The file produced no records; genes[0] would raise IndexError.
        return
    self.insert_sequence(genes[0])
def test_quick_fasta_reader(self):
    """Check quick_FASTA_reader against SeqIO on the "Fasta/f002" file.

    Both readers must return three records, and each
    (name, sequence) tuple from quick_FASTA_reader must equal the
    (description, sequence string) of the matching SeqIO record.
    """
    path = "Fasta/f002"

    quick_records = quick_FASTA_reader(path)
    self.assertEqual(len(quick_records), 3)

    parsed_records = list(SeqIO.parse(path, "fasta"))
    self.assertEqual(len(parsed_records), 3)

    for quick_record, parsed in zip(quick_records, parsed_records):
        expected = (parsed.description, str(parsed.seq))
        self.assertEqual(quick_record, expected)
def test_quick_fasta_reader(self):
    """Compare quick_FASTA_reader output with SeqIO for "Fasta/f002".

    The file is read with both parsers; each must yield three records and
    the records must agree pairwise on title line and sequence text.
    """
    fasta_path = "Fasta/f002"

    # Plain (title, sequence) tuples from the quick reader.
    from_reader = quick_FASTA_reader(fasta_path)
    self.assertEqual(len(from_reader), 3)

    # SeqRecord objects from the full parser, for reference.
    from_seqio = list(SeqIO.parse(fasta_path, "fasta"))
    self.assertEqual(len(from_seqio), 3)

    for pair, record in zip(from_reader, from_seqio):
        self.assertEqual(pair, (record.description, str(record.seq)))
def TruncationRXLRGT_RS(Input, Output, *args, **kwargs):
    """Truncate RxLR sequences after their RxLR(-EER) motif.

    Every record of the FASTA file ``Input`` is searched for the motif.
    When the motif is found, the part of the record that follows the end
    of the match is kept; records without a strict RxLR motif are ignored.
    The kept records are written to ``Output`` in FASTA format.

    Args:
        Input: Name of the input FASTA file containing RxLR sequences.
        Output: Name of the FASTA file to write the truncated records to.
        *args: Accepted for call compatibility; not used.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        None. One progress line per matching record and a final count are
        printed to standard output.
    """
    import re
    from Bio import SeqIO

    # RxLR followed by an EER-like motif ([ED][ED][KR]) within 60 residues,
    # or by a DDK-like alternative ([QD][QD]K) within 40 residues; if neither
    # is present, fall back to the bare RxLR so truncation happens right
    # after it.  The lazy quantifiers ({,60}? / {,40}?) pick the shortest gap,
    # so when two EER motifs exist the first one is used.
    regex = re.compile('R.LR.{,60}?[ED][ED][KR]|R.LR.{,40}?[QD][QD]K|R.LR')

    truncated = []
    # Search and slice each record in a single pass so a motif position can
    # never be applied to a different record, and close the input file
    # deterministically.
    with open(Input) as input_handle:
        for record in SeqIO.parse(input_handle, 'fasta'):
            # Upper-case the text so lower-case input is matched as well;
            # this does not change residue positions.
            sequence_text = str(record.seq).upper()
            match = regex.search(sequence_text)
            if match is None:
                # Not an RxLR effector under this definition: skip it rather
                # than writing the whole sequence out untruncated.
                continue
            motif = match.group()
            span = match.span()
            print("Name: %s, Size: %s, Motif: %s, End: %s"
                  % (record.description, len(sequence_text), motif, span))
            # Keep everything after the end of the matched motif.
            truncated.append(record[span[1]:])

    with open(Output, 'w') as output_handle:
        SeqIO.write(truncated, output_handle, 'fasta')

    print("%s RXLR truncated" % (len(truncated)))
from Bio.SeqUtils.CheckSum import crc32, crc64, gcg, seguid
from Bio.SeqUtils.lcc import lcc_simp, lcc_mult
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq, MutableSeq
from Bio.Alphabet import single_letter_alphabet
from Bio import SeqIO

######################
# quick_FASTA_reader #
######################
# Cross-check quick_FASTA_reader against SeqIO on the same file: both must
# return three records that agree on title line and sequence text.
dna_fasta_filename = "Fasta/f002"
tuple_records = quick_FASTA_reader(dna_fasta_filename)
assert len(tuple_records) == 3
# Read through a context manager so the file handle is closed as soon as
# parsing is done instead of being left to the garbage collector.
with open(dna_fasta_filename) as fasta_handle:
    seq_records = list(SeqIO.parse(fasta_handle, "fasta"))
assert len(seq_records) == 3
for tuple_record, seq_record in zip(tuple_records, seq_records):
    # str() gives the sequence text without the deprecated Seq.tostring().
    assert tuple_record == (seq_record.description, str(seq_record.seq))
    print("%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq)))

##############
# CodonUsage #
##############
print("")
print("Codon Adaption Index (CAI)")
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
from Bio.SeqUtils.CheckSum import crc32, crc64, gcg, seguid
from Bio.SeqUtils.lcc import lcc_simp, lcc_mult
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq, MutableSeq
from Bio.Alphabet import single_letter_alphabet
from Bio import SeqIO

######################
# quick_FASTA_reader #
######################
# The quick reader and SeqIO are run over the same file; each must give
# three records, and the records must match pairwise.
dna_fasta_filename = "Fasta/f002"
tuple_records = quick_FASTA_reader(dna_fasta_filename)
assert len(tuple_records) == 3
# The handle is opened in a with-block so it is always closed after parsing.
with open(dna_fasta_filename) as fasta_handle:
    seq_records = list(SeqIO.parse(fasta_handle, "fasta"))
assert len(seq_records) == 3
for tuple_record, seq_record in zip(tuple_records, seq_records):
    # Compare against str(seq) rather than the deprecated Seq.tostring().
    assert tuple_record == (seq_record.description, str(seq_record.seq))
    print("%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq)))

##############
# CodonUsage #
##############
print("")
print("Codon Adaption Index (CAI)")
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
key3='%s_%s'%(key1, key2) dictPotentialOverlap[key3]=deltastart, deltastop, key2, RxLRdict[key1][3], int(RxLRdict[key1][0]), int(RxLRdict[key1][1]), int(GFFdict[key2][3]) else: continue #This section above compare the location of the predicted protein set and the ones from the predicted RxLR set. ##need to check the value in the dictPotentialOverlap dictPotentialOverlap['PcRxLR515']='none', 'none', 'none', 'none', 'none', 'none', 39353 dictPotentialOverlap['PcRxLR516']='none', 'none', 'none', 'none', 'none', 'none', 531755 #Those two lines aboves add manually the effector PcRxLR 515 and 516 because they were not in the table loaded at line 10 for key in dictPotentialOverlap: print>>fileout, ">%s" %(key) print>>fileout, (dictPotentialOverlap.get(key)) Phyca11_dict1={} Phyca11_dict2={} iname3=raw_input('Enter path to the P. capsici predicted protein file: ') entries_2 = quick_FASTA_reader(iname3) for name, seq in entries_2: data2=name.split('|') Identifier=int(data2[2]) Phyca11_dict1[Identifier]=seq Phyca11_dict2[Identifier]=seq Newdict={} for keya in Phyca11_dict1: for keyb in dictPotentialOverlap: if int(keya)==dictPotentialOverlap[keyb][6]: try: Phyca11_PcRxLR="%s,%s"%(keya, keyb) Newdict[Phyca11_PcRxLR]=Phyca11_dict1.get(keya) Phyca11_dict2.pop(keya) except:
import random

from Bio.SeqUtils import quick_FASTA_reader

# Python 2 script (uses raw_input): builds random resamples of the sequences
# found in a user-supplied FASTA file.
file_handle = raw_input('Enter input filepath of the P. infestans and P. capsici trimmed sequences: ')

# Parse the file once; the records are reused for every resample below.
entries = quick_FASTA_reader(file_handle)
seqdict = {}
for name, seq in entries:
    seqdict[name] = seq

# Build the (name, sequence) pairs once instead of on every random draw.
seq_items = list(seqdict.items())

# First resample: draw until the target number of distinct names is reached.
# A dict cannot hold more keys than there are distinct names, so the target
# is capped; with fewer than 30000 names the uncapped loop would never end.
bootstrap_target = min(30000, len(seqdict))
bootstrapdict_1 = {}
while len(bootstrapdict_1) < bootstrap_target:
    entry = random.choice(seq_items)
    key = '%s' % (entry[0])
    value = '%s' % (entry[1])
    bootstrapdict_1[key] = value

# Second resample: one draw (with replacement) per input record.  Repeated
# draws of the same name overwrite each other in the dict.
bootstrapdict_2 = {}
for _ in entries:
    entry = random.choice(seq_items)
    key = '%s' % (entry[0])
    value = '%s' % (entry[1])
    bootstrapdict_2[key] = value

# Third resample: one draw per input record.
# NOTE(review): in the code visible here the drawn key/value are not stored
# in bootstrapdict_3 - confirm against the rest of the script.
bootstrapdict_3 = {}
for _ in entries:
    entry = random.choice(seq_items)
    key = '%s' % (entry[0])
    value = '%s' % (entry[1])
# IMPORTS from Bio.Alphabet import IUPAC from Bio.Seq import Seq ## from Bio.SeqRecord import SeqRecord ## from Bio.SeqUtils import quick_FASTA_reader from Bio import SeqIO # This part is made by looking at LP's truncate_core_aa.py import re regex = re.compile('R.LR.{,60}?[ED][ED][KR]|R.LR.{,40}?[QD][QD]K|R.LR') #define the RxLR EER motif, looks also for alternative of the EER such as DDK, if the EER is not their, the truncation will occur after the RxLR motif #the regex has been changed, {,60} alone would look for the longest sequence possible between RxLR and EER, whereas {,60}? will take the smallest distance between RxLR and EER, so if there is two EER motif, the first one will be used file_handle='D:\Databases\All_RxLR_121216_aa_simpleName.fasta' #ask for the fasta file directory, the input file should be a fasta file with simple names for each sequence entries = quick_FASTA_reader(file_handle) #open the fasta file oname= 'D:\Databases\All_RxLR_121216_aa_simpleName_EER_trunc20131014.fasta' #ask for the output file directory, the file will be created by the script reportfilepath='D:\Databases\Report_EER_trunc20131014.fasta'#create a file containing all the sequence that do not have a RxLR motif as defined by the regex reportfile=open(reportfilepath, 'w') print>>reportfile, 'This file contain the sequences that do not have a predicted RxLR motif' # make a dictionary with the positions where to split the sequences splitdict = {} for name, seq in entries: try: match = re.search(regex, seq) #search for the RxLR EER motif motif = match.group() #print the seq from the start codon to the EER span = match.span() #print the position of the RxLR EER motif print("Name: %s, Size: %s, Motif: %s, End: %s"%(name,len(seq), motif, span)) # some things to see if everything is working fine splitdict[name] = span[1] #add the position of the end of the RxLR EER motif in the dictionary except: #this line is their to avoid crash when the program encounter a non RxLR effector 
(effector that does not match with the definition given in Whisson et al 2007)