def SGDData():
    """Download the SGD yeast data files and assemble one genome FASTA.

    The files listed below are fetched into ``TAMO.paths.SGDdir`` through
    ``downloadfiles``.  The per-chromosome ``chrNN.fsa`` files are then
    loaded one by one and written back out as a single
    ``NCBI_yeast_genome.fsa`` in the same directory.  Each record is keyed
    ``'chr<label> <original id>'``, where a leading zero is dropped from the
    chromosome label (``'01'`` -> ``'1'``; ``'10'`` and ``'mt'`` are kept).
    """
    # Local import: only this function needs path joining.
    import os

    root = TAMO.paths.SGDdir
    urlroot = 'ftp://genome-ftp.stanford.edu/pub/yeast/data_download/'
    files = ['chromosomal_feature/SGD_features.tab',
             'chromosomal_feature/dbxref.tab',
             'chromosomal_feature/chromosome_length.tab',
             'sequence/GenBank/yeast_nrpep.fasta.gz',
             'sequence/genomic_sequence/orf_protein/orf_trans_all.fasta.gz',
             # NOTE(review): presumably a (url, local filename) pair handled
             # specially by downloadfiles -- confirm against that helper.
             ('http://yeastgfp.ucsf.edu/allOrfData.txt', 'Huh_Nature_2003.tab')
             ]

    chrs = '01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt'.split()
    files.extend(['sequence/NCBI_genome_source/chr%s.fsa' % x for x in chrs])

    downloadfiles(root, urlroot, files)

    from TAMO.seq import Fasta

    print("Assembling yeast genome sequence files into a single file (NCBI_yeast_genome.fsa)")
    genome = {}
    for chrom in chrs:
        # os.path.join gives the same path whether or not SGDdir carries a
        # trailing separator; the input and output paths used to disagree.
        records = Fasta.load(os.path.join(root, 'chr%s.fsa' % chrom))
        # Only the first record of each chromosome file is used.
        seq_id, seq = list(records.items())[0]
        label = chrom[1] if chrom[0] == '0' else chrom
        genome['chr%s %s' % (label, seq_id)] = seq
    Fasta.write(genome, os.path.join(root, 'NCBI_yeast_genome.fsa'))
def genomebg(infile, outfile):
    """Run the ``genomebg.linux`` executable on the sequences in *infile*.

    The FASTA file *infile* is loaded and rewritten to a temporary file with
    a very large line length (presumably so every sequence sits on a single
    line for the external tool -- confirm).  ``genomebg.linux`` from
    ``MDSCAN_DIR`` is then invoked with ``-i <tmp> -o <outfile>``.  The
    tool's combined stdout/stderr is echoed line by line, and ``Exited`` is
    printed when the pipe reports a non-zero close status.

    The temporary file is always removed, even if loading, writing or the
    subprocess raises.
    """
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)

    # mkstemp creates the file atomically; mktemp only returned a name and
    # left a window in which another process could claim it.
    fd, tmpfsa = tempfile.mkstemp()
    os.close(fd)
    try:
        Fasta.write(fsaD, tmpfsa, linelen=1000000000)
        CMD = '%s -i %s -o %s' % (EXE, tmpfsa, outfile)
        # NOTE(review): paths are interpolated into a shell command unquoted;
        # names containing spaces or shell metacharacters will misbehave.
        FID = os.popen('( %s ;) 2>&1' % CMD, 'r')
        for line in FID.readlines():
            print(line)
        if FID.close():
            print("Exited")
    finally:
        os.unlink(tmpfsa)
def genomebg(infile, outfile):
    """Run the ``genomebg.linux`` executable on the sequences in *infile*.

    The FASTA file *infile* is loaded and rewritten to a temporary file with
    a very large line length (presumably so every sequence sits on a single
    line for the external tool -- confirm).  ``genomebg.linux`` from
    ``MDSCAN_DIR`` is then invoked with ``-i <tmp> -o <outfile>``.  The
    tool's combined stdout/stderr is echoed line by line, and ``Exited`` is
    printed when the pipe reports a non-zero close status.

    The temporary file is always removed, even if loading, writing or the
    subprocess raises.
    """
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)

    # mkstemp creates the file atomically; mktemp only returned a name and
    # left a window in which another process could claim it.
    fd, tmpfsa = tempfile.mkstemp()
    os.close(fd)
    try:
        Fasta.write(fsaD, tmpfsa, linelen=1000000000)
        CMD = '%s -i %s -o %s' % (EXE, tmpfsa, outfile)
        # NOTE(review): paths are interpolated into a shell command unquoted;
        # names containing spaces or shell metacharacters will misbehave.
        FID = os.popen('( %s ;) 2>&1' % CMD, 'r')
        for line in FID.readlines():
            print(line)
        if FID.close():
            print("Exited")
    finally:
        os.unlink(tmpfsa)
def train_final(self, model, fg, bg, N, beta):
    """Fit the final motif for *model* and pick an SVM classifier for it.

    The foreground probes named in *fg* are upper-cased and stripped of
    ``;`` and ``N`` characters; empty results are dropped.  When
    ``self.refine`` is set the motif is retrained on those sequences via
    ``self.train_model``; otherwise ``self.models[model]`` is used as is.
    Scores from ``self.get_LLRs`` for *fg* and *bg* are used to train one
    SVM per candidate ``c`` value (the *fg* scores are first passed through
    ``self.SMOTE``), keeping the classifier with the lowest training error
    below 1.0.  With ``self.dump`` set, the foreground probes are split
    into ``<prefix>.pos.fsa`` / ``<prefix>.neg.fsa`` according to the chosen
    classifier, where ``<prefix>`` is ``self.motif_file`` up to its first
    dot.

    Returns the tuple ``(final_motif, best_classifier, fn)`` where ``fn`` is
    the third value returned by ``self.SVM_test(best_classifier, train_pos,
    [], 1)`` (presumably the false-negative count -- confirm).
    """
    # Clean up the foreground sequences.
    input_seqs = []
    for probe_name in fg:
        cleaned = self.all_probes[probe_name].upper()
        cleaned = cleaned.replace(";", "").replace("N", "")
        if cleaned:
            input_seqs.append(cleaned)

    final_motif = (self.train_model(model, input_seqs, beta)
                   if self.refine else self.models[model])

    train_pos = self.get_LLRs(final_motif, fg)
    train_neg = self.get_LLRs(final_motif, bg)
    over_sampled_positive = self.SMOTE([train_pos], N, N)[0]

    # Train SVM to classify our training set
    best_classifier, best_err = None, 1.0
    for c in (1.0e-10, 1.0e-4, 1.0e-3, 1.0e-2, 0.05, 0.1, 1.0, 10.0, 100.0):
        candidate = self.SVM_train(over_sampled_positive, train_neg, c)
        candidate_err = self.SVM_test(candidate, over_sampled_positive, train_neg)
        if candidate_err < best_err:
            best_err, best_classifier = candidate_err, candidate

    _, _, fn = self.SVM_test(best_classifier, train_pos, [], 1)

    if self.dump:
        motif, no_motif = {}, {}
        for name, val in zip(fg, train_pos):
            misclassified = self.SVM_test(best_classifier, [val], [])
            bucket = no_motif if misclassified else motif
            bucket[name] = self.all_probes[name]
        prefix = self.motif_file.split('.')[0]
        Fasta.write(motif, prefix + '.pos.fsa')
        Fasta.write(no_motif, prefix + '.neg.fsa')

    return (final_motif, best_classifier, fn)
from TAMO.seq import Fasta

# Split one FASTA file into two random subsets with Fasta.random_split
# (second argument 0.25) and write each subset to its own file.
fasFile = '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.fas'
oFile1 = '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.sample2.fas'
oFile2 = '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.test2.fas'

firstDic, secDic = Fasta.random_split(fasFile, 0.25)

# Write the two halves in the same order as before: first, then second.
for seqDict, outPath in ((firstDic, oFile1), (secDic, oFile2)):
    Fasta.write(seqDict, outPath)

print('done')
oligoType = 'control'  # 'match' or 'control'
# Membership test: the former `oligoType == 'match' or 'control'` was always
# truthy (a non-empty string), so the check could never fail.
assert oligoType in ('match', 'control'), 'oligoType MUST be only "match" or "control".'

# Load miRNA fastas into dict.
miRNAs = Fasta.file2dict(miRNAFile)

# Create new dict for seeds.
seeds = {}

# 1) Cycle through miRNA dict taking 7mers starting at pos 1
#    and then pos2.  Adapt key to reflect which.
# 2) Convert to all uppers and convert U's to T's
# 3) If oligoType == 'match', rvcmp each 7mer and adapt key
#    to reflect which.
for miRNA in miRNAs:
    pos1_seed = miRNAs[miRNA][:7].upper().replace('U', 'T')
    pos2_seed = miRNAs[miRNA][1:8].upper().replace('U', 'T')
    if oligoType == 'match':
        seeds[miRNA + '_match_pos1'] = bioDefs.revComp(pos1_seed)
        seeds[miRNA + '_match_pos2'] = bioDefs.revComp(pos2_seed)
    else:
        seeds[miRNA + '_ctrl_pos1'] = pos1_seed
        seeds[miRNA + '_ctrl_pos2'] = pos2_seed

# Write out seed dict as fasta.
Fasta.write(seeds, seedFile)

print("Done.")
# NOTE(review): this fragment was recovered from whitespace-collapsed text;
# the nesting below (in particular `del(oFile)` sitting inside the outer
# loop) is the most plausible reading -- confirm against the original file.

# Create the output directory via the mkdirp helper before writing into it.
mkdirp(opts.out_dir)

# Write one gene-name file per randomized cluster list, one name per line.
for i in range(len(randClusterLists)):
    # File name: args[0] with '.txt' replaced by 'randomGeneNames_<i>.txt',
    # reduced to its last '/'-separated component.
    oFileName = args[0].replace('.txt','randomGeneNames_%s.txt' % (i)).split('/')[-1]
    oFile = open('%s/%s' % (opts.out_dir,oFileName), 'w')
    for name in randClusterLists[i]:
        oFile.write(name+'\n')
    oFile.close()

    # --- If Asked, Create Fastas ---
    if opts.make_fasta:
        # Names are re-read from the file just written (stripped of
        # surrounding whitespace) and looked up in totalSeqs.
        fNames = map(lambda l: l.strip(),open('%s/%s' % (opts.out_dir,oFileName), 'rU').readlines())
        fastas = {}
        for f in fNames:
            fastas[f] = totalSeqs[f]
        # Same file name with '.txt' swapped for '.fas'.
        Fasta.write(fastas,'%s/%s' % (opts.out_dir,oFileName.replace('.txt','.fas')))
    del(oFile)

print 'Done.'

# Report the sorted sequence lengths of the original gene list and their mean.
print "Original list:"
origLens = []
for g in geneNames:
    origLens.append(len(totalSeqs[g]))
origLens.sort()
print origLens
print 'Avg: %.3f' % (average(origLens))
print "Randomized lists:"
def seq_msp(fafile, seqfile, genome='mm9', convert=True, bedFrag=False):
    """Write the 40 bp end sequences of CCGG-CCGG fragments to a FASTA file.

    *fafile* is scanned for lowercase ``ccgg`` occurrences.  Every pair of
    consecutive sites whose distance ``d`` satisfies ``40 < d < 250`` forms
    a fragment.  For each fragment two loci are fetched from a ``NibDB``:
    a ``+`` read covering ``x+1 .. x+41`` and a ``-`` read covering
    ``y-37 .. y+3`` (``x``/``y`` being the two site offsets).  The result is
    written to *seqfile* with keys ``<chr>:<position><strand>``.

    Parameters
    ----------
    fafile : path of the FASTA file to scan.  NOTE(review): only the last
        ``>`` header seen is used as chromosome name and offsets are never
        reset, so the file is assumed to hold a single sequence -- confirm.
    seqfile : output FASTA path.
    genome : ``'hg18'`` selects a hard-coded human nib directory; any other
        value uses the ``mm9`` settings from ``chipsequtil``.
    convert : when true, every ``c`` in the fetched sequences is replaced by
        ``t`` before writing.
    bedFrag : when true, the fragments are also saved as a tab-separated
        file named *seqfile* with ``.fa`` replaced by ``_frag.bed``.
    """
    # `start` is the offset of seq[0] below; it begins at -3 because the
    # first line is prefixed with a 3-character dummy overhang.
    start = -3
    hang = 'NNN'
    match = []

    # find CCGG positions using Fasta file
    with open(fafile) as fa:
        for line in fa:
            l = line.strip('\n')
            if not l:
                # Blank line: nothing to scan (and l[0] would raise).
                continue
            if l[0] == '>':
                ch = l[1:]
                continue
            if not l.strip('N'):
                # All-N line: cannot contain a site, so skip the scan but
                # keep the offset and the 3-character overhang in step.
                hang = (hang + l)[-3:]
                start += len(l)
                continue
            # Prefix the previous line's last 3 characters so sites that
            # straddle a line break are found.
            seq = hang + l
            # str.find visits every start index, including the last 4-mer of
            # the line, which the former range(len(seq)-4) scan skipped.
            i = seq.find('ccgg')
            while i != -1:
                match.append(start + i)
                i = seq.find('ccgg', i + 1)
            hang = seq[-3:]
            start += len(l)
    print(len(match))

    # Keep neighbouring sites whose distance is strictly between 40 and 250.
    # NOTE(review): the original comment said "40-220bp" while the code
    # tested d<250; the code's bounds are kept -- confirm which was intended.
    FRAG = [(x, y) for x, y in zip(match[:-1], match[1:]) if 40 < y - x < 250]
    print(len(FRAG))

    # Build one '+' and one '-' locus per fragment, keyed chr:position(strand).
    ids, loci = [], []
    BF = []
    for x, y in FRAG:
        if bedFrag:
            BF.append([ch, str(x + 1), str(y + 3)])
        # read anchored at the first site (x)
        ids.append(ch + ':' + str(x + 1) + '+')
        loci.append((ch, x + 1, x + 41, '+'))
        # read anchored at the second site (y); keyed by its stop coordinate
        ids.append(ch + ':' + str(y + 3) + '-')
        loci.append((ch, y - 37, y + 3, '-'))

    if bedFrag:
        np.savetxt(seqfile.replace('.fa', '_frag.bed'), BF, fmt='%s', delimiter='\t')

    if genome == 'hg18':
        DB = NibDB(nib_dirs='/nfs/genomes/human_gp_mar_06/')
    else:
        DB = NibDB(nib_dirs=chipsequtil.get_org_settings('mm9')['genome_dir'])
    _fa_ids, seqs = DB.get_fasta_batch(loci)

    # Both strands are stored exactly as returned by the database (the
    # original had a disabled reversal for '-' reads; neither branch
    # reversed anything).
    seq_dict = {}
    for key, seq in zip(ids, seqs):
        seq_dict[key] = seq.replace('c', 't') if convert else seq
    Fasta.write(seq_dict, seqfile)
# Load the source FASTA; the name holding the path is rebound to the
# loaded records.
originalFastaDict = Fasta.load(originalFastaDict)

# Copy every requested record that exists in the source; report the rest.
desiredFastaDict = {}
for rec in desiredFastaList:
    if rec not in originalFastaDict:
        print(rec + ' not found in source fasta list!')
        continue
    desiredFastaDict[rec] = originalFastaDict[rec]

if hardMask:
    # Hard mask: every lowercase a/c/g/t becomes 'N'.
    for recName in desiredFastaDict:
        maskedSeq = desiredFastaDict[recName]
        for base in ('a', 'c', 'g', 't'):
            maskedSeq = maskedSeq.replace(base, 'N')
        desiredFastaDict[recName] = maskedSeq
else:
    # No masking: upper-case everything for downstream compatibility.
    for recName in desiredFastaDict:
        desiredFastaDict[recName] = desiredFastaDict[recName].upper()

# Write selected recs to outFile
Fasta.write(desiredFastaDict, outFile)

print("Done.")
from gusPyCode.MDAP_proj.MDAP_defs import shuffleSeqDict
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import softMaskDict2HardMask
from time import time
from gusPyCode.defs.mosqData import promoterSeqPaths

# Load a FASTA file, optionally upper-case it, shuffle it with
# shuffleSeqDict, write the result and report how long the shuffle took.

# User Variables:
inFile = promoterSeqPaths.Aa_2000bpUp_softMasked
outFile = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Aedes/aedes2KBupStreamTSS.UnMasked.geneStrand.shuffledSeqs.1.fas'
hardMask = None

d = Fasta.load(inFile)
#d = {1:'AACTGCANACTGACNNNACTGATGNNN'}

# With hardMask unset the sequences are upper-cased in place.
if not hardMask:
    for seqName, seqText in list(d.items()):
        d[seqName] = seqText.upper()

# Time only the shuffle itself, not the file I/O around it.
t1 = time()
sD = shuffleSeqDict(d)
t2 = time()

Fasta.write(sD, outFile)

elapsedMin = (float(t2) - t1) / 60
print('Shuffling took %.2f min.' % elapsedMin)
"""Takes a fastaFilePath and a fraction between 0 and 1.  Returns two fasta files
containing random sequences from fastaFilePath split randomly into files of size
'fraction' and 1-'fraction'.  Files named as: fastaFilePath_fraction.Date_Time.fas"""

import sys
import time
from TAMO.seq import Fasta

# Same usage message for both argument checks.
usage = 'usage = %s fastaFilePath fraction<0 to 1>' % (sys.argv[0].split('/')[-1])

assert len(sys.argv[1:]) == 2, usage
assert float(sys.argv[2]) <= 1 and float(sys.argv[2]) >= 0, usage

filePath = sys.argv[1]
frac = float(sys.argv[2])

# Version tag '<Mon><day>_<HH-MM-SS>'.  split() without an argument collapses
# the double space ctime() emits before single-digit days; split(' ') left an
# empty field there and shifted the day into the time slot.
versionID = time.ctime().split()
versionID = '%s%s_%s' % (versionID[1], versionID[2], versionID[3].replace(':', '-'))

dict1, dict2 = Fasta.random_split(filePath, frac)

# Drop only a literal '.fas' suffix.  rstrip('.fas') strips any trailing run
# of the characters '.', 'f', 'a', 's' (e.g. 'genes.fas' -> 'gene').
baseName = filePath.split('/')[-1]
if baseName.endswith('.fas'):
    baseName = baseName[:-len('.fas')]

out1 = '%s_%s_%s.fas' % (baseName, frac, versionID)
out2 = '%s_%s_%s.fas' % (baseName, 1 - frac, versionID)

Fasta.write(dict1, out1, linelen=100)
Fasta.write(dict2, out2, linelen=100)