Exemple #1
0
 def open(self, file=None):
     """Read a FASTA file and hand its first record to ``insert_sequence``.

     When ``file`` is falsy the path is obtained from ``askopenfilename()``;
     if that also yields nothing (e.g. an empty result), the method returns
     without doing anything.
     """
     chosen = file or askopenfilename()
     if not chosen:
         return
     records = quick_FASTA_reader(chosen)
     # Only the first record of the file is passed on.
     self.insert_sequence(records[0])
Exemple #2
0
    def test_quick_fasta_reader(self):
        """quick_FASTA_reader must agree with SeqIO on the three records of Fasta/f002."""
        path = "Fasta/f002"

        quick_pairs = quick_FASTA_reader(path)
        self.assertEqual(len(quick_pairs), 3)

        parsed = list(SeqIO.parse(path, "fasta"))
        self.assertEqual(len(parsed), 3)

        # Each (title, sequence) tuple must match the SeqRecord's
        # description and the string form of its sequence.
        for quick_pair, record in zip(quick_pairs, parsed):
            self.assertEqual(
                quick_pair,
                (record.description, str(record.seq)),
            )
Exemple #3
0
    def test_quick_fasta_reader(self):
        """Check quick_FASTA_reader output against SeqIO.parse for Fasta/f002."""
        fasta_path = "Fasta/f002"
        expected_count = 3

        from_quick_reader = quick_FASTA_reader(fasta_path)
        self.assertEqual(len(from_quick_reader), expected_count)

        from_seqio = list(SeqIO.parse(fasta_path, "fasta"))
        self.assertEqual(len(from_seqio), expected_count)

        # Pairwise comparison: tuple == (record description, sequence as str).
        for pair, rec in zip(from_quick_reader, from_seqio):
            self.assertEqual(pair, (rec.description, str(rec.seq)))
def TruncationRXLRGT_RS(Input, Output, *args, **kwargs):
    """Write the part of each RxLR sequence that follows its RxLR(-EER) motif.

    Every sequence of the FASTA file ``Input`` is searched for an RxLR motif,
    optionally followed by an EER-like motif. For each sequence with a match,
    the record is sliced from the end of the match onwards and written to
    ``Output`` in FASTA format. Sequences without an RxLR motif are ignored.

    Args:
        Input: path of the input FASTA file. Simple one-word names are
            expected for each sequence.
        Output: path of the FASTA file to create (overwritten if it exists).
        *args, **kwargs: accepted for call compatibility; not used.

    Returns:
        None. Progress lines and a final count are printed to stdout.
    """
    import re
    from Bio.SeqUtils import quick_FASTA_reader
    from Bio import SeqIO

    # RxLR followed by an EER-like motif ([ED][ED][KR] within 60 residues, or
    # [QD][QD]K within 40 residues); falls back to the bare RxLR motif.
    # The lazy quantifiers ({,60}? / {,40}?) pick the shortest gap, so when
    # several EER-like motifs exist the first one is used.
    regex = re.compile('R.LR.{,60}?[ED][ED][KR]|R.LR.{,40}?[QD][QD]K|R.LR')

    # First pass: record, per sequence name, where the motif ends.
    entries = quick_FASTA_reader(Input)
    splitdict = {}
    for name, seq in entries:
        match = regex.search(seq)
        if match is None:
            # Not an RxLR effector by this definition: leave it out.
            continue
        motif = match.group()
        span = match.span()
        print("Name: %s, Size: %s, Motif: %s, End: %s" % (name, len(seq), motif, span))
        splitdict[name] = span[1]

    # Second pass: slice the SeqRecords at the recorded positions.
    truncated = []
    with open(Input) as file_handle:
        for s in SeqIO.parse(file_handle, 'fasta'):
            print('ok HERE')  # debug trace kept from the original
            # quick_FASTA_reader keys on the whole title line, which equals
            # the record id for simple names and the description otherwise.
            end = splitdict.get(s.id)
            if end is None:
                end = splitdict.get(s.description)
            if end is None:
                # No motif recorded. Slicing with None would silently keep
                # the whole sequence, so skip the record explicitly.
                continue
            print('ok 2')  # debug trace kept from the original
            truncated.append(s[end:])
            print(len(truncated))

    with open(Output, 'w') as OutputFile:
        SeqIO.write(truncated, OutputFile, 'fasta')
    print("%s RXLR truncated" % len(truncated))
Exemple #5
0
from Bio.SeqUtils.CheckSum import crc32, crc64, gcg, seguid
from Bio.SeqUtils.lcc import lcc_simp, lcc_mult
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq, MutableSeq
from Bio.Alphabet import single_letter_alphabet
from Bio import SeqIO


######################
# quick_FASTA_reader #
######################

dna_fasta_filename = "Fasta/f002"

# quick_FASTA_reader must return the same three records as SeqIO.
tuple_records = quick_FASTA_reader(dna_fasta_filename)
assert len(tuple_records) == 3
# Use a context manager so the input handle is closed once parsing is done.
with open(dna_fasta_filename) as fasta_handle:
    seq_records = list(SeqIO.parse(fasta_handle, "fasta"))
assert len(seq_records) == 3
for tuple_record, seq_record in zip(tuple_records, seq_records):
    # str(seq) replaces the deprecated Seq.tostring().
    assert tuple_record == (seq_record.description, str(seq_record.seq))
    print("%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq)))

##############
# CodonUsage #
##############

# print("") emits a blank line on both Python 2 and Python 3.
print("")
print("Codon Adaption Index (CAI)")
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
from Bio.SeqUtils.CheckSum import crc32, crc64, gcg, seguid
from Bio.SeqUtils.lcc import lcc_simp, lcc_mult
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq, MutableSeq
from Bio.Alphabet import single_letter_alphabet
from Bio import SeqIO


######################
# quick_FASTA_reader #
######################

dna_fasta_filename = "Fasta/f002"

# Cross-check quick_FASTA_reader against SeqIO on the same three-record file.
tuple_records = quick_FASTA_reader(dna_fasta_filename)
assert len(tuple_records) == 3
# The handle is closed as soon as the records have been read.
with open(dna_fasta_filename) as fasta_handle:
    seq_records = list(SeqIO.parse(fasta_handle, "fasta"))
assert len(seq_records) == 3
for tuple_record, seq_record in zip(tuple_records, seq_records):
    # str(seq) is used instead of the deprecated Seq.tostring().
    assert tuple_record == (seq_record.description, str(seq_record.seq))
    print("%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq)))

##############
# CodonUsage #
##############

# A parenthesised empty string prints a blank line on Python 2 and 3 alike.
print("")
print("Codon Adaption Index (CAI)")
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
Exemple #7
0
                key3='%s_%s'%(key1, key2)
                dictPotentialOverlap[key3]=deltastart, deltastop, key2, RxLRdict[key1][3], int(RxLRdict[key1][0]), int(RxLRdict[key1][1]), int(GFFdict[key2][3])
        else:
            continue
#The section above compares the locations of the predicted protein set with those of the predicted RxLR set.
##need to check the value in the dictPotentialOverlap
# Manual entries for effectors PcRxLR515 and PcRxLR516. The first six fields
# are 'none' placeholders; only the last field (index 6) carries a value.
dictPotentialOverlap['PcRxLR515']='none', 'none', 'none', 'none', 'none', 'none', 39353
dictPotentialOverlap['PcRxLR516']='none', 'none', 'none', 'none', 'none', 'none', 531755
# The two assignments above add effectors PcRxLR515 and PcRxLR516 by hand;
# per the original author's note they were missing from the table loaded
# earlier in the script.
# Dump every entry to fileout as a ">key" line followed by the printed tuple.
for key in dictPotentialOverlap:
	print>>fileout, ">%s" %(key)
	print>>fileout, (dictPotentialOverlap.get(key))
# Two dictionaries filled with the same identifier -> sequence mapping.
# NOTE(review): the second copy appears to exist so entries can be removed
# from it later without touching the first - confirm against the rest of the script.
Phyca11_dict1={}
Phyca11_dict2={}
iname3=raw_input('Enter path to the P. capsici predicted protein file: ')
entries_2 = quick_FASTA_reader(iname3)
for name, seq in entries_2:
    # The third '|'-separated field of the FASTA title is used as an integer key.
    data2=name.split('|')
    Identifier=int(data2[2])
    Phyca11_dict1[Identifier]=seq
    Phyca11_dict2[Identifier]=seq

Newdict={}
for keya in Phyca11_dict1:
    for keyb in dictPotentialOverlap:
        if int(keya)==dictPotentialOverlap[keyb][6]:
            try:
                Phyca11_PcRxLR="%s,%s"%(keya, keyb)
                Newdict[Phyca11_PcRxLR]=Phyca11_dict1.get(keya)
                Phyca11_dict2.pop(keya)
            except:
import random
from Bio.SeqUtils import quick_FASTA_reader


# Despite its name, file_handle holds the path typed by the user, not an open file.
file_handle = raw_input('Enter input filepath of the P. infestans and P. capsici trimmed sequences: ')
# Parse the FASTA file once; the result feeds both seqdict and the loops below.
entries = quick_FASTA_reader(file_handle)

# Map each sequence name to its sequence.
seqdict = {}
for name, seq in entries:
    seqdict[name] = seq

# Snapshot of the (name, sequence) pairs, built once rather than per draw.
seq_items = list(seqdict.items())

# First sample: draw at random until 30000 distinct names are collected.
# The dictionary can never hold more names than seqdict does, so the target
# is capped to avoid looping forever on inputs with fewer than 30000 names.
bootstrapdict_1 = {}
bootstrap_target = min(30000, len(seq_items))
while len(bootstrapdict_1) < bootstrap_target:
    entry = random.choice(seq_items)
    key = '%s' % (entry[0])
    value = '%s' % (entry[1])
    bootstrapdict_1[key] = value

# Second sample: one random draw per input entry. Repeated draws of the
# same name overwrite each other, so the result may be smaller than the input.
bootstrapdict_2 = {}
for name, seq in entries:
    entry = random.choice(seq_items)
    key = '%s' % (entry[0])
    value = '%s' % (entry[1])
    bootstrapdict_2[key] = value

bootstrapdict_3={}
for name,  seq in entries:
    entry=random.choice(seqdict.items())
    key='%s'%(entry[0])
    value='%s'%(entry[1])
# IMPORTS


from Bio.Alphabet import IUPAC
from Bio.Seq import Seq ##
from Bio.SeqRecord import SeqRecord ##
from Bio.SeqUtils import quick_FASTA_reader
from Bio import SeqIO

# This part is made by looking at LP's truncate_core_aa.py

import re
# RxLR followed by an EER-like motif ([ED][ED][KR] within 60 residues, or
# [QD][QD]K within 40 residues); falls back to the bare RxLR motif, in which
# case the match ends right after RxLR.
# The lazy quantifiers ({,60}? / {,40}?) take the shortest gap between RxLR
# and the EER-like motif, so when two such motifs exist the first one is used.
regex = re.compile('R.LR.{,60}?[ED][ED][KR]|R.LR.{,40}?[QD][QD]K|R.LR')

# Hard-coded Windows paths, written as raw strings so the backslashes are
# never read as escape sequences (the resulting values are unchanged).
# Input: FASTA file with simple names for each sequence.
file_handle = r'D:\Databases\All_RxLR_121216_aa_simpleName.fasta'
entries = quick_FASTA_reader(file_handle)  # list of (name, sequence) tuples
# Output path for the truncated sequences; not opened in this section.
oname = r'D:\Databases\All_RxLR_121216_aa_simpleName_EER_trunc20131014.fasta'
# Report file for the sequences that have no RxLR motif as defined by regex.
reportfilepath = r'D:\Databases\Report_EER_trunc20131014.fasta'
reportfile = open(reportfilepath, 'w')
# Header line; write() with an explicit newline works on Python 2 and 3.
reportfile.write('This file contain the sequences that do not have a predicted RxLR motif\n')

# make a dictionary with the positions where to split the sequences
splitdict = {}
for name, seq in entries:
    try:
        match = re.search(regex, seq) #search for the RxLR EER motif
        motif = match.group() #print the seq from the start codon to the EER 
        span = match.span() #print the position of the RxLR EER motif
        print("Name: %s, Size: %s, Motif: %s, End: %s"%(name,len(seq), motif, span)) # some things to see if everything is working fine
        splitdict[name] = span[1] #add the position of the end of the RxLR EER motif in the dictionary
    except: #this line is their to avoid crash when the program encounter a non RxLR effector (effector that does not match with the definition given in Whisson et al 2007)