Beispiel #1
0
def SGDData():
    """Download the SGD yeast data files and merge the chromosome FASTAs.

    Every entry of ``files`` is handed to ``downloadfiles`` together with the
    target directory ``TAMO.paths.SGDdir`` and the SGD FTP root.  Afterwards
    each ``chrNN.fsa`` file is loaded and its first record is stored under
    the key ``'chr<N>  <original id>'`` (leading zero of the chromosome
    number dropped); the combined dictionary is written to
    ``NCBI_yeast_genome.fsa`` inside the same directory.
    """
    import os

    root    = TAMO.paths.SGDdir
    urlroot = 'ftp://genome-ftp.stanford.edu/pub/yeast/data_download/'
    # Plain strings are paths below ``urlroot``; the tuple is presumably
    # (absolute URL, local file name) -- confirm against downloadfiles.
    files = ['chromosomal_feature/SGD_features.tab',
             'chromosomal_feature/dbxref.tab',
             'chromosomal_feature/chromosome_length.tab',
             'sequence/GenBank/yeast_nrpep.fasta.gz',
             'sequence/genomic_sequence/orf_protein/orf_trans_all.fasta.gz',
             ('http://yeastgfp.ucsf.edu/allOrfData.txt','Huh_Nature_2003.tab')
             ]

    chrs = '01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt'.split()

    files.extend(['sequence/NCBI_genome_source/chr%s.fsa' % x for x in chrs])

    downloadfiles(root, urlroot, files)

    from TAMO.seq import Fasta

    print("Assembling yeast genome sequence files into a single file (NCBI_yeast_genome.fsa)")
    D = {}
    for chrom in chrs:
        # os.path.join gives the same file whether or not ``root`` ends
        # with a path separator.
        records = Fasta.load(os.path.join(root, 'chr%s.fsa' % chrom))
        seq_id, seq = list(records.items())[0]
        # '01' -> '1'; '10'..'16' and 'mt' are kept as they are.
        label = chrom[1] if chrom[0] == '0' else chrom
        D['chr%s  %s' % (label, seq_id)] = seq
    # Joined the same way as the input paths so the merged file always ends
    # up inside ``root`` even when it lacks a trailing separator.
    Fasta.write(D, os.path.join(root, 'NCBI_yeast_genome.fsa'))
Beispiel #2
0
def SGDData():
    """Download the SGD yeast data files and merge the chromosome FASTAs.

    Every entry of ``files`` is handed to ``downloadfiles`` together with the
    target directory ``TAMO.paths.SGDdir`` and the SGD FTP root.  Afterwards
    each ``chrNN.fsa`` file is loaded and its first record is stored under
    the key ``'chr<N>  <original id>'`` (leading zero of the chromosome
    number dropped); the combined dictionary is written to
    ``NCBI_yeast_genome.fsa`` inside the same directory.
    """
    import os

    root    = TAMO.paths.SGDdir
    urlroot = 'ftp://genome-ftp.stanford.edu/pub/yeast/data_download/'
    # Plain strings are paths below ``urlroot``; the tuple is presumably
    # (absolute URL, local file name) -- confirm against downloadfiles.
    files = ['chromosomal_feature/SGD_features.tab',
             'chromosomal_feature/dbxref.tab',
             'chromosomal_feature/chromosome_length.tab',
             'sequence/GenBank/yeast_nrpep.fasta.gz',
             'sequence/genomic_sequence/orf_protein/orf_trans_all.fasta.gz',
             ('http://yeastgfp.ucsf.edu/allOrfData.txt','Huh_Nature_2003.tab')
             ]

    chrs = '01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt'.split()

    files.extend(['sequence/NCBI_genome_source/chr%s.fsa' % x for x in chrs])

    downloadfiles(root, urlroot, files)

    from TAMO.seq import Fasta

    print("Assembling yeast genome sequence files into a single file (NCBI_yeast_genome.fsa)")
    D = {}
    for chrom in chrs:
        # os.path.join gives the same file whether or not ``root`` ends
        # with a path separator.
        records = Fasta.load(os.path.join(root, 'chr%s.fsa' % chrom))
        seq_id, seq = list(records.items())[0]
        # '01' -> '1'; '10'..'16' and 'mt' are kept as they are.
        label = chrom[1] if chrom[0] == '0' else chrom
        D['chr%s  %s' % (label, seq_id)] = seq
    # Joined the same way as the input paths so the merged file always ends
    # up inside ``root`` even when it lacks a trailing separator.
    Fasta.write(D, os.path.join(root, 'NCBI_yeast_genome.fsa'))
Beispiel #3
0
def genomebg(infile,outfile):
    """Run the ``genomebg.linux`` executable from MDSCAN_DIR on *infile*.

    The FASTA file *infile* is loaded and rewritten to a temporary file with
    ``linelen=1000000000``; that copy is passed to the tool with ``-i`` and
    *outfile* with ``-o``.  Everything the tool prints (stdout and stderr)
    is echoed, followed by "Exited" when closing the pipe reports a
    non-zero status.  The temporary file is always removed.

    NOTE(review): the paths are interpolated into a shell command without
    quoting, so names containing spaces or shell metacharacters will break.
    """
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)
    # mkstemp creates the file atomically; mktemp only returns a name that
    # another process could grab before it is used.
    handle, tmpfsa = tempfile.mkstemp()
    os.close(handle)
    try:
        Fasta.write(fsaD, tmpfsa, linelen=1000000000)
        CMD = '%s -i %s -o %s' % (EXE, tmpfsa, outfile)
        FID = os.popen('( %s ;) 2>&1' % CMD, 'r')
        for line in FID:
            print(line)
        if FID.close(): print("Exited")
    finally:
        # Clean up even when writing the FASTA or running the tool fails.
        os.unlink(tmpfsa)
Beispiel #4
0
def genomebg(infile, outfile):
    """Run the ``genomebg.linux`` executable from MDSCAN_DIR on *infile*.

    The FASTA file *infile* is loaded and rewritten to a temporary file with
    ``linelen=1000000000``; that copy is passed to the tool with ``-i`` and
    *outfile* with ``-o``.  Everything the tool prints (stdout and stderr)
    is echoed, followed by "Exited" when closing the pipe reports a
    non-zero status.  The temporary file is always removed.

    NOTE(review): the paths are interpolated into a shell command without
    quoting, so names containing spaces or shell metacharacters will break.
    """
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)
    # mkstemp creates the file atomically; mktemp only returns a name that
    # another process could grab before it is used.
    handle, tmpfsa = tempfile.mkstemp()
    os.close(handle)
    try:
        Fasta.write(fsaD, tmpfsa, linelen=1000000000)
        CMD = '%s -i %s -o %s' % (EXE, tmpfsa, outfile)
        FID = os.popen('( %s ;) 2>&1' % CMD, 'r')
        for line in FID:
            print(line)
        if FID.close(): print("Exited")
    finally:
        # Clean up even when writing the FASTA or running the tool fails.
        os.unlink(tmpfsa)
Beispiel #5
0
    def train_final(self, model, fg, bg, N, beta):
        """Build the final motif and select the SVM with the lowest training error.

        Parameters:
            model -- key into self.models; also forwarded to self.train_model
                     when self.refine is set
            fg    -- names of the foreground probes (keys of self.all_probes)
            bg    -- background set, scored with self.get_LLRs exactly like fg
            N     -- passed as both trailing arguments of self.SMOTE
            beta  -- forwarded to self.train_model

        Returns:
            (final_motif, best_classifier, fn) where fn is the third value
            returned by SVM_test(best_classifier, train_pos, [], 1)
            (presumably the false-negative count -- confirm in SVM_test).

        When self.dump is set, the foreground probes are additionally split
        by per-probe SVM_test result and written to '<prefix>.pos.fsa' and
        '<prefix>.neg.fsa', where <prefix> is self.motif_file up to its
        first '.'.
        """
        # Upper-case each foreground probe and drop ';' and 'N' characters;
        # probes that end up empty are skipped.
        input_seqs = []
        for name in fg:
            iseq = self.all_probes[name].upper().replace(";", "").replace("N", "")
            if iseq:
                input_seqs.append(iseq)

        if self.refine:
            final_motif = self.train_model(model, input_seqs, beta)
        else:
            final_motif = self.models[model]
        train_pos = self.get_LLRs(final_motif, fg)
        train_neg = self.get_LLRs(final_motif, bg)
        over_sampled_positive = self.SMOTE([train_pos], N, N)[0]

        # Train one SVM per candidate value and keep the first one that
        # reaches the lowest training error.
        c_vals = [1.0e-10, 1.0e-4, 1.0e-3, 1.0e-2, 0.05, 0.1, 1.0, 10.0, 100.0]
        best_classifier = None
        best_err = 1.0
        for c in c_vals:
            classifier = self.SVM_train(over_sampled_positive, train_neg, c)
            train_err = self.SVM_test(classifier, over_sampled_positive, train_neg)
            # Always accept the first candidate: with a strict '<' against
            # the initial 1.0 alone, a run in which no candidate scores
            # below 1.0 would leave best_classifier as None.
            if best_classifier is None or train_err < best_err:
                best_err = train_err
                best_classifier = classifier
        (train_err, _fp, fn) = self.SVM_test(best_classifier, train_pos, [], 1)

        if self.dump:
            motif = {}
            no_motif = {}
            for name, val in zip(fg, train_pos):
                # A truthy result for a single probe sends it to the
                # 'neg' file, a falsy one to the 'pos' file.
                probe_err = self.SVM_test(best_classifier, [val], [])
                if probe_err:
                    no_motif[name] = self.all_probes[name]
                else:
                    motif[name] = self.all_probes[name]
            prefix = self.motif_file.split('.')[0]
            Fasta.write(motif, prefix + '.pos.fsa')
            Fasta.write(no_motif, prefix + '.neg.fsa')
        return (final_motif, best_classifier, fn)
Beispiel #6
0
from TAMO.seq import Fasta


# Input FASTA file and the two output files that receive the random split.
fasFile = '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.fas'
oFile1= '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.sample2.fas'
oFile2= '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.test2.fas'

# Fasta.random_split is called with the fraction 0.25 and hands back two
# dictionaries; the first goes to oFile1, the second to oFile2.
firstDic, secDic = Fasta.random_split(fasFile, 0.25)

for seqDict, outPath in ((firstDic, oFile1), (secDic, oFile2)):
    Fasta.write(seqDict, outPath)

print('done')
Beispiel #7
0
oligoType = 'control' # 'match' or 'control'
# A bare `oligoType == 'match' or 'control'` is always true because the
# non-empty string 'control' is truthy, so test membership explicitly.
if oligoType not in ('match', 'control'):
    raise ValueError('oligoType MUST be only "match" or "control".')

# Load miRNA fastas into dict.
miRNAs = Fasta.file2dict(miRNAFile)

# Create new dict for seeds.
seeds = {}

# 1) Cycle through the miRNA dict taking the 7-mers at string indices
#    [0:7] ("pos1") and [1:8] ("pos2").  The key suffix records which.
# 2) Convert to upper case and replace U with T.
# 3) If oligoType == 'match', store the reverse complement of each 7-mer
#    under a '_match_' key; otherwise store the 7-mer itself under '_ctrl_'.
for miRNA, miRNASeq in miRNAs.items():
    pos1_seed = miRNASeq[:7].upper().replace('U','T')
    pos2_seed = miRNASeq[1:8].upper().replace('U','T')

    if oligoType == 'match':
        seeds[miRNA+'_match_pos1'] = bioDefs.revComp(pos1_seed)
        seeds[miRNA+'_match_pos2'] = bioDefs.revComp(pos2_seed)
    else:
        seeds[miRNA+'_ctrl_pos1'] = pos1_seed
        seeds[miRNA+'_ctrl_pos2'] = pos2_seed

# Write out seed dict as fasta.
Fasta.write(seeds,seedFile)

print("Done.")
Beispiel #8
0
mkdirp(opts.out_dir)

# Write each random cluster to its own name list inside opts.out_dir; the
# file name is derived from the base name of args[0] with '.txt' replaced
# by 'randomGeneNames_<index>.txt'.
for i, cluster in enumerate(randClusterLists):
    oFileName = args[0].replace('.txt','randomGeneNames_%s.txt' % (i)).split('/')[-1]
    # 'with' closes the list file even if a write fails.
    with open('%s/%s' % (opts.out_dir,oFileName), 'w') as oFile:
        for name in cluster:
            oFile.write(name+'\n')
    # --- If Asked, Create Fastas ---
    if opts.make_fasta:
        # Use the names already in memory (whitespace-stripped, as the
        # re-read lines were) instead of reopening the file just written,
        # which left a second handle unclosed.
        fastas = {}
        for name in cluster:
            f = name.strip()
            fastas[f] = totalSeqs[f]
        Fasta.write(fastas,'%s/%s' % (opts.out_dir,oFileName.replace('.txt','.fas')))

print('Done.')


# Report the sorted sequence lengths of the original gene list and their average.
print("Original list:")
origLens = sorted(len(totalSeqs[g]) for g in geneNames)
print(origLens)
print('Avg: %.3f' % (average(origLens)))

print("Randomized lists:")
Beispiel #9
0
def seq_msp(fafile,seqfile,genome='mm9',convert=True,bedFrag=False):
    """Write 40 bp end sequences of fragments bounded by lower-case 'ccgg' sites.

    Scans the FASTA file *fafile* for every occurrence of 'ccgg', keeps each
    pair of neighbouring sites x < y with 40 < y - x < 250, and for each kept
    pair requests two loci from a NibDB: (x+1, x+41, '+') and
    (y-37, y+3, '-').  The fetched sequences are written to *seqfile* with
    Fasta.write under the keys '<chrom>:<x+1>+' and '<chrom>:<y+3>-'.

    Parameters:
        fafile  -- FASTA file to scan.  The name after the last '>' header
                   seen is used as chromosome for every locus and positions
                   are not reset at a header, so a single-sequence file is
                   assumed.
        seqfile -- path of the FASTA file to write.
        genome  -- 'hg18' selects the fixed human nib directory; any other
                   value uses the 'mm9' organism settings.
        convert -- when true, every 'c' in a fetched sequence is replaced
                   by 't' before it is stored.
        bedFrag -- when true, the kept fragments are also saved as a
                   tab-separated file named like *seqfile* with '.fa'
                   replaced by '_frag.bed'.

    The number of sites and the number of kept fragments are printed.
    """
    site = 'ccgg'
    overlap = len(site) - 1
    # ``hang`` carries the last three bases of the preceding lines so that a
    # site spanning a line break is still seen; ``start`` is the position of
    # hang[0], hence the initial -3.
    start = -overlap
    hang = 'N' * overlap
    match = []

    # Find ccgg positions using the FASTA file.
    with open(fafile) as fa:
        for line in fa:
            l = line.strip('\n')
            if not l:
                # Blank lines carry no sequence (indexing l[0] would fail).
                continue
            if l[0] == '>':
                ch = l[1:]
                continue
            seq = hang + l
            # str.find visits every start index, including the final 4-mer
            # of the line; range(len(seq)-4) stopped one short and missed a
            # site ending on the last base of a line.
            pos = seq.find(site)
            while pos != -1:
                match.append(start + pos)
                pos = seq.find(site, pos + 1)
            hang = seq[-overlap:]
            start += len(l)

    print(len(match))

    # Keep neighbouring site pairs whose distance is strictly between 40 and
    # 250 bp, stored as (left, right) tuples.
    FRAG = [(x, y) for x, y in zip(match[:-1], match[1:]) if 40 < y - x < 250]

    print(len(FRAG))

    # For every fragment queue one locus per end; ids and loci stay parallel.
    ids, loci = [], []
    BF = []
    for x, y in FRAG:
        if bedFrag: BF.append([ch, str(x+1), str(y+3)])
        # Left end, '+' strand.
        plus_start, plus_stop = x + 1, x + 41
        ids.append(ch + ':' + str(plus_start) + '+')
        loci.append((ch, plus_start, plus_stop, '+'))
        # Right end, '-' strand; the key uses the stop coordinate.
        minus_start, minus_stop = y - 37, y + 3
        ids.append(ch + ':' + str(minus_stop) + '-')
        loci.append((ch, minus_start, minus_stop, '-'))

    if bedFrag: np.savetxt(seqfile.replace('.fa','_frag.bed'),BF,fmt='%s',delimiter='\t')
    # NOTE(review): every genome other than 'hg18' falls back to the 'mm9'
    # settings regardless of the value passed -- confirm this is intended.
    if genome == 'hg18':
        DB = NibDB(nib_dirs='/nfs/genomes/human_gp_mar_06/')
    else:
        DB = NibDB(nib_dirs=chipsequtil.get_org_settings('mm9')['genome_dir'])
    _headers, seqs = DB.get_fasta_batch(loci)

    # Both strands are stored exactly as returned by the NibDB (no reversal
    # is applied to '-' entries), after the optional c -> t replacement.
    seq_dict = {}
    for key, seq in zip(ids, seqs):
        seq_dict[key] = seq.replace('c', 't') if convert else seq
    Fasta.write(seq_dict, seqfile)
Beispiel #10
0

#  Load the source records; the name that held the path is rebound to the
#  dictionary returned by Fasta.load.
originalFastaDict = Fasta.load(originalFastaDict)

#  Copy every requested record that exists in the source, reporting the rest.
desiredFastaDict = {}

for rec in desiredFastaList:
    if rec not in originalFastaDict:
        print(rec+' not found in source fasta list!')
        continue
    desiredFastaDict[rec] = originalFastaDict[rec]


# Hard mask if requested (lower-case a/c/g/t become 'N'); otherwise make
# sure all letters are uppercase for downstream compatibility.
for x in desiredFastaDict:
    seq = desiredFastaDict[x]
    if hardMask:
        for base in 'acgt':
            seq = seq.replace(base, 'N')
    else:
        seq = seq.upper()
    desiredFastaDict[x] = seq

#  Write selected recs to outFile

Fasta.write(desiredFastaDict, outFile)

print("Done.")
Beispiel #11
0
from gusPyCode.MDAP_proj.MDAP_defs import shuffleSeqDict
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import softMaskDict2HardMask
from time import time
from gusPyCode.defs.mosqData import promoterSeqPaths

# User Variables:
inFile   = promoterSeqPaths.Aa_2000bpUp_softMasked
outFile  = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Aedes/aedes2KBupStreamTSS.UnMasked.geneStrand.shuffledSeqs.1.fas'
hardMask = None


d = Fasta.load(inFile)

# Unless hardMask is set, upper-case every sequence before shuffling.
# NOTE(review): a truthy hardMask leaves the sequences untouched here.
if not hardMask:
    for name in list(d):
        d[name] = d[name].upper()

# Time only the shuffling step.
t1 = time()
sD = shuffleSeqDict(d)
t2 = time()

Fasta.write(sD, outFile)

elapsedMin = (float(t2) - t1) / 60
print('Shuffling took %.2f min.' % elapsedMin)
Beispiel #12
0
import sys
import time
from TAMO.seq import Fasta

"""Takes a fastaFilePath and a fraction between 0 and 1.  Returns two fasta
files containing random sequences from fastaFilePath split randomly into files
of size 'fraction' and 1-'fraction'.  Files named as: fastaFilePath_fraction.Date_Time.fas"""

def strip_fas_suffix(name):
    """Return *name* without a trailing '.fas' extension (unchanged if absent).

    str.rstrip('.fas') removes any run of the characters '.', 'f', 'a', 's'
    from the end, so 'genes.fas' would become 'gene'; only the exact suffix
    is removed here.
    """
    suffix = '.fas'
    if name.endswith(suffix):
        return name[:-len(suffix)]
    return name


def make_version_id(ctime_string=None):
    """Return a '<Mon><day>_<HH-MM-SS>' tag from a time.ctime()-style string.

    time.ctime() pads days 1-9 with an extra space, so splitting on a single
    ' ' yields an empty field and shifts the others; splitting on any run of
    whitespace keeps month, day and time in fixed positions.
    """
    if ctime_string is None:
        ctime_string = time.ctime()
    fields = ctime_string.split()
    return '%s%s_%s' % (fields[1], fields[2], fields[3].replace(':','-'))


def main(argv=None):
    """Randomly split a FASTA file into two files by a given fraction.

    Expects ``argv[1]`` to be the FASTA file path and ``argv[2]`` a fraction
    between 0 and 1 (defaults to sys.argv).  Both are passed to
    Fasta.random_split, and the two returned dictionaries are written to the
    current directory as '<basename>_<fraction>_<versionID>.fas' and
    '<basename>_<1-fraction>_<versionID>.fas'.  Exits with the usage message
    when the arguments are missing or invalid.
    """
    if argv is None:
        argv = sys.argv
    usage = 'usage = %s fastaFilePath fraction<0 to 1>' % (argv[0].split('/')[-1])

    # Explicit checks instead of assert, which is stripped under -O.
    if len(argv[1:]) != 2:
        sys.exit(usage)
    try:
        frac = float(argv[2])
    except ValueError:
        sys.exit(usage)
    if not 0 <= frac <= 1:
        sys.exit(usage)

    filePath  = argv[1]
    versionID = make_version_id()

    dict1,dict2 = Fasta.random_split(filePath,frac)

    baseName = strip_fas_suffix(filePath.split('/')[-1])
    out1 = '%s_%s_%s.fas' % (baseName,frac,versionID)
    out2 = '%s_%s_%s.fas' % (baseName,1-frac,versionID)

    Fasta.write(dict1,out1,linelen=100)
    Fasta.write(dict2,out2,linelen=100)


if __name__ == '__main__':
    main()