Example #1
0
def Run2(flist):
    aps = AllPossStrings('ACGT', 7)
    NF = len(flist)  # the number of files
    for i in range(NF):
        print 'Working on ', flist[i]
        # get the data
        dna = fasta.Fasta('bacteria/' + flist[i])
        # count the number of genes
        NG = len(dna)
        # if there are multiple genes then combine
        if NG > 1:
            t = []
            for j in range(NG):
                t.append(dna[j][1])
            st = ''.join(t)
            del t
        else:
            st = dna[0][1]
        del dna
        # for every million bases make a plot
        NP = int(len(st) / 1000000)
        #if NP > 3:
        #NP = 3
        for j in range(NP):
            print '\tPortion', j, 'of', NP
            ctr = Counter(aps, st[j * 1000000:j * 1000000 + 1000000])
            akando.a2i(ctr).save('work/chaos' + str(i) + 'c' + str(j) + '.gif')
Example #2
0
    def design_probes(self,input):
        # setup some log formating so we have a record of activity
        time_str = strftime("%a, %d %b %Y %H:%M:%S", localtime())
        print "Design probes request: %s" % time_str

        inseq = fasta.Fasta(input.inseq,strflag=True)
        
        # We'll put in the target Tm range in here manually (and the targetTm).
        # If we want to make this an adjustable option later,
        # we can do that here without having to refactor a bunch
        # of downstream code.
        targetRange = [-26.0, -20.0]
        targetTm = -23.0
        
        results = find_probes.designV4(inseq,
                                    input.noligos,  
                                    input.oligo_length,
                                    input.spacer_length,
                                    input.maskingflag,
                                    input.species,
                                    targetTm,
                                    targetRange)

        alignment = probe_design.alignOutput(results['masked_seq'],
                                     results['output'][-1][1],input.oligo_length)
        # We have to replace ending spaces with non-whitespace characters
        # so the XML serializing wont cut out all the work we did to 
        # nicely align the positions of the probes and labels against inseq
        align = Alignment() 
        align.raw_sequence = '/' + inseq.one_line() + '/'  # masked is alignment[0]
        align.probe_oligos = '/' + alignment[1][0:-1] + '/'
        align.labels = '/' + alignment[2][0:-1] + '/'

        maxoligos = results['output'][-1] # scores, [1] matches [2] oligos
        maxoligos = probe_design.probeNames(maxoligos[2],input.probeprefix)
        olis = []
        for i in range(0,len(maxoligos)):  # for each designed oligo
            oli = ProbeOligo()
            oli.GC = maxoligos[i][0]
            oli.outseq = maxoligos[i][1]
            oli.label = maxoligos[i][2]
            olis.append(oli)

        output = FindProbesOutput()
        output.alignment = align
        output.oligos = olis
        
        print "Successful design of %d oligos" % len(olis)
        return output
Example #3
0
def extract(filename):
    global fcount, ftotal
    fcount += 1
    #print "\r%i/%i" % (fcount, ftotal),
    print "%i/%i" % (fcount, ftotal),
    exti = filename.rfind('.')
    ext = filename[exti:]
    path, query = os.path.split(filename[:exti])
    fhits = fasta.read_from(filename)
    print filename, exti, ext, path, query
    if query in fhits:
        fquery = fasta.Fasta()
        fquery.add_seq(fhits[query])
        fhits.remove(query)
        fhits.save_to(os.path.join(path, query + '-hits' + ext))
        fquery.save_to(os.path.join(path, query + '-query' + ext))
Example #4
0
    def recombine(self, sequences):
        '''
        Recombine the sequences with the provided recombinations

        Dots are stripped from every ``sequence.seq`` (the input objects
        are modified in place).  At each junction between two neighbouring
        sequences, the end of the left one and the start of the right one
        are trimmed by the counts returned from ``self.deletions``, and a
        random ACGT stretch whose length comes from ``self.insertions`` is
        placed between them.  Each trimmed sequence is passed through the
        matching ``self.processing`` callable.  The returned ``fasta.Fasta``
        is named after the inputs, with the trim counts and inserted bases
        recorded between the names.

        >>> str(vdj_recombination().recombine([fasta.Fasta('a', 'AATTAT'),\
                                           fasta.Fasta('b', 'GGGACACAT'),\
                                           fasta.Fasta('c', 'ATAGATATGA')]))
        '>a 0//0 b 0//0 c\\nAATTAT\\nGGGACACAT\\nATAGATATGA\\n\\n'
        >>> str(vdj_recombination(deletions=[(lambda: 2)]).recombine([fasta.Fasta('a', 'AATTAT'),\
                                           fasta.Fasta('b', 'GGGACACAT'),\
                                           fasta.Fasta('c', 'ATAGATATGA')]))
        '>a 2//2 b 2//2 c\\nAATT\\nGACAC\\nAGATATGA\\n\\n'
        '''
        last = len(sequences) - 1
        # One insertion per junction, two deletions per junction (left end,
        # right start) and one processing step per sequence.
        insertions = self.insertions * last
        deletions = self.deletions * (2 * last)
        process = self.processing * len(sequences)

        name_parts = []
        seq_parts = []
        for i, sequence in enumerate(sequences):
            sequence.seq = sequence.seq.translate(None, '.')
            trim_start = 0
            trim_end = 0
            inserted = ''

            if i > 0:
                # Every sequence but the first loses bases at its start.
                trim_start = deletions[2 * i - 1]()
                name_parts.append('/%d ' % trim_start)
            name_parts.append(sequence.name)
            if i < last:
                # Every sequence but the last loses bases at its end and is
                # followed by a random insertion.
                trim_end = deletions[2 * i]()
                inserted = random_sequence(['A', 'C', 'G', 'T'],
                                           insertions[i]())
                name_parts.append(' %d/%s' % (trim_end, inserted))

            # A stop of -0 would slice to nothing, so "no trimming" has to
            # be expressed as None.
            stop = -trim_end if trim_end > 0 else None
            seq_parts.append(process[i](sequence.seq[trim_start:stop])
                             + "\n" + inserted + "\n")
        return fasta.Fasta(''.join(name_parts), ''.join(seq_parts))
Example #5
0
#!/usr/bin/env python

import sys
import os
import fasta

splitext = os.path.splitext

if __name__ == '__main__':
    # For every FASTA file named on the command line, remove each 'NNN'
    # triplet from every sequence and save the result next to the input as
    # <basename>_N_removed<ext>.
    for filename in sys.argv[1:]:
        f = fasta.Fasta()
        f.read_from(filename)
        for s in f:
            # str.replace returns a new string and leaves the original
            # untouched, so the result has to be stored back; previously it
            # was discarded and the output was an unmodified copy.
            # NOTE(review): assumes iterating `f` yields the record objects
            # that save_to() writes and that `.sequence` is assignable --
            # confirm against the fasta module.
            s.sequence = s.sequence.replace('NNN', '')
        basename, ext = splitext(filename)
        f.save_to(basename + '_N_removed' + ext)
Example #6
0
import fasta
import bowtie_search as bts
import time

hot = fasta.Fasta("HOTAIR.txt")
tstart = time.time()
hits = bts.align(hot, 16, 'humanPseudo')

print "%f seconds" % (time.time() - tstart)
Example #7
0
import bowtie_search
import fasta
import seq
import re
import math
import probe_design

lgr = fasta.Fasta('Lgr5.txt')
shark_tank = fasta.Fasta('shark_tank.fa')

# For each index and each value i in 5..19 (passed as the second argument
# of align_for_hits), compare the mean of the values returned for
# shark_tank (sm1) with the mean returned for lgr (sm2), and print both
# together with their ratio.
for index_name in ('humanReference', 'humanPseudo'):
    for i in range(5, 20):
        hts = bowtie_search.align_for_hits(shark_tank, i, index_name)
        sm1 = sum(hts) * 1.0 / len(hts)
        hts = bowtie_search.align_for_hits(lgr, i, index_name)
        sm2 = sum(hts) * 1.0 / len(hts)
        print(i,sm1,sm2,sm1/sm2)

# NOTE(review): the humanMito pass computes the same two means but never
# prints them -- confirm whether a print was intended here.
for i in range(5, 20):
    hts = bowtie_search.align_for_hits(shark_tank, i, 'humanMito')
    sm1 = sum(hts) * 1.0 / len(hts)
    hts = bowtie_search.align_for_hits(lgr, i, 'humanMito')
    sm2 = sum(hts) * 1.0 / len(hts)
Example #8
0
# Load fasta file w/ contigs sequences. Typically, 454LargeContigs.fna
contigs = dict([(x.name, x) for x in fasta.load(open(sys.argv[1]))])

# get sorted list of contigs by length
lengths = [(contigs[x].length, x) for x in contigs]
lengths.sort(reverse=True)

cn = sys.argv[2]

# output BED tracks
cf = open(outdir + '/contigs.bed', 'w')
gf = open(outdir + '/gaps.bed', 'w')

cstart = 0
gn = 1
for length, name in lengths:
    print >> cf, cn, cstart, cstart + length, name, 1000, '+'
    cstart = cstart + length
    print >> gf, cn, cstart, cstart + 100, 'gap%s' % gn
    cstart = cstart + len(gap)
    gn += 1

cf.close()
gf.close()

chromseq = gap.join([contigs[name].seq for length, name in lengths])
build = fasta.Fasta(cn, None, chromseq)

print build
        sys.exit(1)

    if o.gff2 is not None and o.gff3 is not None:
        print "Specify either gff2 or gff3 but not both."
        sys.exit(1)

    if o.fas is None:
        print "Specify the fasta database file."
        sys.exit(1)

    if o.output is None:
        o.output = sys.stdout
    else:
        o.output = file(o.output, "w")

    fas = fasta.Fasta()
    fas.read_from(o.fas)

    if o.gff2:
        gff = parse_gff2.parse_gff2(o.gff2)
    else:
        gff = parse_gff3.parse_gff3(o.gff3)

    try:
        l = [
            fasta.Sequence(
                g.reference_sequence + ' ' + g.start + ' ' + g.stop + ' ' +
                g.strand, fas[g.reference_sequence].sequence[int(g.start) -
                                                             1:int(g.stop)])
            for g in gff
        ]