def Run2(flist): aps = AllPossStrings('ACGT', 7) NF = len(flist) # the number of files for i in range(NF): print 'Working on ', flist[i] # get the data dna = fasta.Fasta('bacteria/' + flist[i]) # count the number of genes NG = len(dna) # if there are multiple genes then combine if NG > 1: t = [] for j in range(NG): t.append(dna[j][1]) st = ''.join(t) del t else: st = dna[0][1] del dna # for every million bases make a plot NP = int(len(st) / 1000000) #if NP > 3: #NP = 3 for j in range(NP): print '\tPortion', j, 'of', NP ctr = Counter(aps, st[j * 1000000:j * 1000000 + 1000000]) akando.a2i(ctr).save('work/chaos' + str(i) + 'c' + str(j) + '.gif')
def design_probes(self,input): # setup some log formating so we have a record of activity time_str = strftime("%a, %d %b %Y %H:%M:%S", localtime()) print "Design probes request: %s" % time_str inseq = fasta.Fasta(input.inseq,strflag=True) # We'll put in the target Tm range in here manually (and the targetTm). # If we want to make this an adjustable option later, # we can do that here without having to refactor a bunch # of downstream code. targetRange = [-26.0, -20.0] targetTm = -23.0 results = find_probes.designV4(inseq, input.noligos, input.oligo_length, input.spacer_length, input.maskingflag, input.species, targetTm, targetRange) alignment = probe_design.alignOutput(results['masked_seq'], results['output'][-1][1],input.oligo_length) # We have to replace ending spaces with non-whitespace characters # so the XML serializing wont cut out all the work we did to # nicely align the positions of the probes and labels against inseq align = Alignment() align.raw_sequence = '/' + inseq.one_line() + '/' # masked is alignment[0] align.probe_oligos = '/' + alignment[1][0:-1] + '/' align.labels = '/' + alignment[2][0:-1] + '/' maxoligos = results['output'][-1] # scores, [1] matches [2] oligos maxoligos = probe_design.probeNames(maxoligos[2],input.probeprefix) olis = [] for i in range(0,len(maxoligos)): # for each designed oligo oli = ProbeOligo() oli.GC = maxoligos[i][0] oli.outseq = maxoligos[i][1] oli.label = maxoligos[i][2] olis.append(oli) output = FindProbesOutput() output.alignment = align output.oligos = olis print "Successful design of %d oligos" % len(olis) return output
def extract(filename): global fcount, ftotal fcount += 1 #print "\r%i/%i" % (fcount, ftotal), print "%i/%i" % (fcount, ftotal), exti = filename.rfind('.') ext = filename[exti:] path, query = os.path.split(filename[:exti]) fhits = fasta.read_from(filename) print filename, exti, ext, path, query if query in fhits: fquery = fasta.Fasta() fquery.add_seq(fhits[query]) fhits.remove(query) fhits.save_to(os.path.join(path, query + '-hits' + ext)) fquery.save_to(os.path.join(path, query + '-query' + ext))
def recombine(self, sequences):
    '''
    Join the sequences, trimming their junctions and adding insertions.

    Every '.' is stripped from each sequence (the input objects are
    modified).  At each junction the callables in self.deletions give the
    number of bases removed from the end of the left sequence and the start
    of the right one, and the callables in self.insertions give the length
    of the random A/C/G/T sequence placed between them.  Each trimmed
    sequence goes through the matching callable of self.processing.  The
    name of the result records the names, the deletions and the insertions.

    >>> str(vdj_recombination().recombine([fasta.Fasta('a', 'AATTAT'),\
                                           fasta.Fasta('b', 'GGGACACAT'),\
                                           fasta.Fasta('c', 'ATAGATATGA')]))
    '>a 0//0 b 0//0 c\\nAATTAT\\nGGGACACAT\\nATAGATATGA\\n\\n'
    >>> str(vdj_recombination(deletions=[(lambda: 2)]).recombine([fasta.Fasta('a', 'AATTAT'),\
                                                                 fasta.Fasta('b', 'GGGACACAT'),\
                                                                 fasta.Fasta('c', 'ATAGATATGA')]))
    '>a 2//2 b 2//2 c\\nAATT\\nGACAC\\nAGATATGA\\n\\n'
    '''
    total = len(sequences)
    final_index = total - 1

    # Repeat the configured callables so that every junction side and
    # every sequence has one.
    insertion_fns = self.insertions * final_index
    deletion_fns = self.deletions * (total * 2 - 2)
    processors = self.processing * total

    name_parts = []
    seq_parts = []
    for i, sequence in enumerate(sequences):
        trim_start = 0
        trim_end = 0
        inserted = ''
        sequence.seq = sequence.seq.translate(None, '.')

        if i > 0:
            # Deletion at the start of every sequence but the first.
            trim_start = deletion_fns[2 * i - 1]()
            name_parts.append('/%d ' % trim_start)

        name_parts.append(sequence.name)

        if i < final_index:
            # Deletion at the end of every sequence but the last, followed
            # by the inserted bases.
            trim_end = deletion_fns[2 * i]()
            inserted = random_sequence(['A', 'C', 'G', 'T'],
                                       insertion_fns[i]())
            name_parts.append(' %d/%s' % (trim_end, inserted))

        # A slice end of -0 would empty the sequence, so use None when
        # nothing is removed from the end.
        if trim_end > 0:
            stop = -trim_end
        else:
            stop = None

        kept = processors[i](sequence.seq[trim_start:stop])
        seq_parts.append(kept + "\n" + inserted + "\n")

    return fasta.Fasta(''.join(name_parts), ''.join(seq_parts))
#!/usr/bin/env python
"""Write a copy of each FASTA file with every 'NNN' removed.

Usage: pass one or more file names on the command line.  For each input
``<base><ext>`` the result is saved as ``<base>_N_removed<ext>``.
"""
import sys
import os
import fasta

splitext = os.path.splitext

if __name__ == '__main__':
    for filename in sys.argv[1:]:
        f = fasta.Fasta()
        f.read_from(filename)
        for s in f:
            # str.replace returns a new string; the result has to be stored
            # back, otherwise the saved file is identical to the input.
            # Assumes s.sequence is a plain, assignable string attribute.
            s.sequence = s.sequence.replace('NNN', '')
        basename, ext = splitext(filename)
        f.save_to(basename + '_N_removed' + ext)
import fasta import bowtie_search as bts import time hot = fasta.Fasta("HOTAIR.txt") tstart = time.time() hits = bts.align(hot, 16, 'humanPseudo') print "%f seconds" % (time.time() - tstart)
import re
import math

import bowtie_search
import fasta
import seq
import probe_design

# Compare the mean of bowtie_search.align_for_hits for shark_tank.fa and
# Lgr5.txt, for second-argument values 5..19, against several indexes.
lgr = fasta.Fasta('Lgr5.txt')
shark_tank = fasta.Fasta('shark_tank.fa')

# (index name, whether the per-length result line is printed).
# The 'humanMito' means are computed but not printed.
index_runs = (
    ('humanReference', True),
    ('humanPseudo', True),
    ('humanMito', False),
)

for index_name, show in index_runs:
    for i in range(5, 20):
        hts = bowtie_search.align_for_hits(shark_tank, i, index_name)
        sm1 = sum(hts) * 1.0 / len(hts)
        hts = bowtie_search.align_for_hits(lgr, i, index_name)
        sm2 = sum(hts) * 1.0 / len(hts)
        if show:
            print(i, sm1, sm2, sm1 / sm2)
# Load fasta file w/ contigs sequences. Typically, 454LargeContigs.fna contigs = dict([(x.name, x) for x in fasta.load(open(sys.argv[1]))]) # get sorted list of contigs by length lengths = [(contigs[x].length, x) for x in contigs] lengths.sort(reverse=True) cn = sys.argv[2] # output BED tracks cf = open(outdir + '/contigs.bed', 'w') gf = open(outdir + '/gaps.bed', 'w') cstart = 0 gn = 1 for length, name in lengths: print >> cf, cn, cstart, cstart + length, name, 1000, '+' cstart = cstart + length print >> gf, cn, cstart, cstart + 100, 'gap%s' % gn cstart = cstart + len(gap) gn += 1 cf.close() gf.close() chromseq = gap.join([contigs[name].seq for length, name in lengths]) build = fasta.Fasta(cn, None, chromseq) print build
sys.exit(1) if o.gff2 is not None and o.gff3 is not None: print "Specify either gff2 or gff3 but not both." sys.exit(1) if o.fas is None: print "Specify the fasta database file." sys.exit(1) if o.output is None: o.output = sys.stdout else: o.output = file(o.output, "w") fas = fasta.Fasta() fas.read_from(o.fas) if o.gff2: gff = parse_gff2.parse_gff2(o.gff2) else: gff = parse_gff3.parse_gff3(o.gff3) try: l = [ fasta.Sequence( g.reference_sequence + ' ' + g.start + ' ' + g.stop + ' ' + g.strand, fas[g.reference_sequence].sequence[int(g.start) - 1:int(g.stop)]) for g in gff ]