Ejemplo n.º 1
0
    def get_offset(self,
                   ref_file='~/References/HIV-HXB2.fasta',
                   gene='protease'):
        '''Align the consensus to one gene of the reference and store the offset.

        The slice of ``ref_file`` given by ``gene_coord[gene]`` is aligned
        against ``self.cons`` through ``Alignment.needle_align`` (gap open
        10.0, gap extend 0.5). The ``start`` attribute of the parsed alignment
        is stored in ``self.offset`` and reported on stderr.

        Args:
            ref_file: path of the reference sequence file.
            gene: key used to look up ``(start, stop)`` in ``gene_coord``,
                a name defined outside this method.

        Returns:
            None. The result is available as ``self.offset``.
        '''
        from pythonlib import Alignment
        import os
        import sys

        outfile = 'ppp.tmp'
        start, stop = gene_coord[gene]
        # Restrict the reference to the gene by appending '[start:stop]'.
        usa_seq = ref_file + '[%d:%d]' % (start, stop)
        try:
            Alignment.needle_align(usa_seq,
                                   'asis:%s' % self.cons,
                                   outfile,
                                   go=10.0,
                                   ge=0.5)
            tal = Alignment.alignfile2dict([outfile], 'get_offset', 10.0, 0.5)
        finally:
            # Always clean up the scratch file, even when alignment or
            # parsing fails, so a stale 'ppp.tmp' is never left behind.
            if os.path.exists(outfile):
                os.remove(outfile)

        # First key of the parsed alignments; next(iter(...)) works on both
        # Python 2 and Python 3, unlike dict.keys()[0].
        ka = next(iter(tal))
        this = tal[ka]['asis']
        this.summary()
        self.offset = this.start
        print('Offset consensus w.r.t',
              ref_file,
              'is',
              self.offset,
              file=sys.stderr)
Ejemplo n.º 2
0
def find_closest_here(reads_file):
    '''Align reads against 'ref.fasta' and collect per-alignment statistics.

    ``needle`` is run through the shell with gap open 6.0 and gap extend 3.0,
    writing markx10 output to a temporary file that is parsed with
    ``Alignment.alignfile2dict``. Only the alignments stored under the first
    key of the parsed dictionary are summarised.

    Args:
        reads_file: path of the sequence file passed as ``-bsequence``.

    Returns:
        A tuple ``(ig, this, mm, ident_2)`` of four parallel lists with one
        entry per alignment:
            ig       internal gaps (``int_gaps``)
            this     ``ident / (stop - start + 1)``
            mm       mismatches (``mismatch``)
            ident_2  ``ident / (stop - start + 1 - int_gaps)``

    NOTE(review): an earlier version of this docstring discussed a
    ``diff_thresh`` of 0.025 (chosen because, even when aligning error-free
    reads to the original haplotypes, the differences between the best two
    identities range from 0.023 to 0.091). No threshold is applied in this
    function.
    '''
    from pythonlib import Alignment
    import tempfile
    import subprocess

    ref_file = 'ref.fasta'
    # NOTE(review): the command is run through the shell with interpolated
    # file names; do not pass untrusted paths.
    cmline_fmt = 'needle -asequence %s -bsequence %s \
              -gapopen 6.0 -gapextend 3.0 -auto -adesshow3 -out %s -aformat3 markx10'

    # The context manager guarantees the temporary file is closed (and thus
    # deleted) even if needle or the parser raises.
    with tempfile.NamedTemporaryFile() as out:
        outname = out.name
        cmline = cmline_fmt % (ref_file, reads_file, outname)
        subprocess.call(cmline, shell=True)
        # Parse the output once; the result was previously parsed twice.
        dd = Alignment.alignfile2dict([outname], 'n', 6.0, 3.0, Verbose=False)

    # First key of the parsed dictionary (portable across Python 2 and 3).
    d = dd[next(iter(dd))]

    this = []
    mm = []
    ident_2 = []
    ig = []
    for aln in d.values():
        aln.summary()
        span = aln.stop - aln.start + 1
        ig.append(aln.int_gaps)
        this.append(float(aln.ident) / span)
        mm.append(aln.mismatch)
        ident_2.append(float(aln.ident) / (span - aln.int_gaps))
    return ig, this, mm, ident_2
Ejemplo n.º 3
0
    def get_cons(self, plurality=0.1):
        '''Consensus by running EMBOSS cons

        ``cons`` is run on ``self.sup_file`` and the first FASTA record of its
        output is upper-cased and stripped of 'N'. That string is aligned
        against ``self.ref_file`` with ``Alignment.needle_align`` (gap open
        10.0, gap extend 0.5) and the two aligned rows are merged column by
        column:

        * where row b has a gap, the character of row a is used;
        * where row a has a gap, the column is dropped;
        * otherwise the character of row b is used.

        Args:
            plurality: value passed to ``cons`` as ``-plurality``.

        Returns:
            The merged sequence as a string.
        '''
        import subprocess
        import os
        from pythonlib import Alignment

        # NOTE(review): the command is run through the shell with
        # self.sup_file interpolated; do not use untrusted paths.
        cline = 'cons -sequence %s -stdout -auto' % self.sup_file
        cline += ' -plurality %f' % plurality

        proc = subprocess.Popen(cline, shell=True, bufsize=1024,
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                close_fds=True)
        try:
            records = list(SeqIO.parse(proc.stdout, 'fasta'))
        finally:
            # Release the pipes and reap the child so it is not left behind
            # as a zombie process.
            proc.stdin.close()
            proc.stdout.close()
            proc.wait()

        # str(seq) is the portable spelling of the older seq.tostring().
        sc = str(records[0].seq).upper()
        strcons = sc.replace('N', '')

        outfile = 'tmp.tmp'
        try:
            Alignment.needle_align(self.ref_file, 'asis:%s' % strcons,
                                   outfile, go=10.0, ge=0.5)
            tal = Alignment.alignfile2dict([outfile], 'ref_cons_alignment',
                                           10.0, 0.5)
        finally:
            # Remove the scratch file even when alignment or parsing fails.
            if os.path.exists(outfile):
                os.remove(outfile)

        # First key of the parsed alignments (portable across Python 2/3).
        ka = next(iter(tal))
        this = tal[ka]['asis']

        this_seq = []
        for base_a, base_b in zip(this.seq_a, this.seq_b):
            if base_b == '-':
                # Gap in row b: fall back to the character of row a.
                assert base_a != '-', 'gap-gap?'
                this_seq.append(base_a)
            elif base_a != '-':
                this_seq.append(base_b)
            # Gap in row a only: the column is skipped.
        ws = ''.join(this_seq)

        return ws
Ejemplo n.º 4
0
def find_closest(hr):
    '''Find the reference sequence closest to a haplotype.

    The haplotype is aligned with ``needle`` (gap open 10.0, gap extend 1.0)
    against every sequence in the reference file. For each alignment the
    distance is ``mismatch / (stop - start + 1)``. The best hit is accepted
    when its distance is at most ``abs_thresh`` (0.1) and it is separated
    from the runner-up by at least ``diff_thresh`` (0.0125).

    The rationale recorded for the separation threshold is that even when
    aligning error-free reads to the original haplotypes, the differences
    between the best two identities range from 0.023 to 0.091 (~9%).
    NOTE(review): that note quoted a threshold of 0.025, whereas the code
    uses 0.0125 - confirm which value is intended.

    Args:
        hr: pair ``(hap, ref_file)``; the haplotype sequence string and the
            path of the reference file.

    Returns:
        ``(id, distance, mismatches, gaps)`` of the best hit when it is
        accepted, otherwise ``(None, gaps)`` where ``gaps`` belongs to the
        best hit.

    Raises:
        IndexError: if no alignment was produced.
    '''
    from pythonlib import Alignment
    import tempfile
    import subprocess
    import heapq
    import operator

    diff_thresh = 0.0125
    abs_thresh = 0.1

    hap, ref_file = hr
    # NOTE(review): the command is run through the shell with interpolated
    # values; do not pass untrusted input.
    cmline_fmt = 'needle -asequence asis:\'%s\' -bsequence %s \
              -gapopen 10.0 -gapextend 1.0 -auto -adesshow3 -out %s -aformat3 markx10'

    # The context manager guarantees the temporary file is closed (and thus
    # deleted) even if needle or the parser raises.
    with tempfile.NamedTemporaryFile() as out:
        outname = out.name
        cmline = cmline_fmt % (hap, ref_file, outname)
        subprocess.call(cmline, shell=True)
        # NOTE(review): 6.0 / 3.0 are passed to the parser although needle
        # was run with 10.0 / 1.0 - confirm whether this matters.
        d = Alignment.alignfile2dict([outname], 'n', 6.0, 3.0,
                                     Verbose=False)['asis']

    this = {}
    mm = {}
    gaps = {}
    for aln in d.values():
        aln.summary()
        this[aln.id_b] = float(aln.mismatch) / (aln.stop - aln.start + 1)
        mm[aln.id_b] = aln.mismatch
        gaps[aln.id_b] = aln.int_gaps

    best2 = heapq.nsmallest(2, this.items(), key=operator.itemgetter(1))
    if not best2:
        raise IndexError('no alignment found against %s' % ref_file)

    best_id, best_dist = best2[0]
    if len(best2) > 1:
        rel_delta = best2[1][1] - best_dist
    else:
        # A single candidate has no runner-up, so it is trivially separated.
        rel_delta = float('inf')

    if rel_delta >= diff_thresh and best_dist <= abs_thresh:
        return best_id, best_dist, mm[best_id], gaps[best_id]
    else:
        return None, gaps[best_id]
Ejemplo n.º 5
0
    def alignedvariants(self, threshold=0.9):
        '''Merge well-supported sequences with the reference into haplotypes.

        For every record in ``self.seq_obj`` the values ``posterior`` and
        ``ave_reads`` are parsed from its description. Records with
        ``posterior < threshold`` or ``ave_reads < 1`` are skipped. Each
        remaining sequence (leading/trailing '-' stripped) is aligned to
        ``self.ref`` with EMBOSS needle (gap open 10.0, gap extend 0.5) and
        the two aligned rows are merged column by column:

        * where row b has a gap, the character of row a is used;
        * where row a has a gap, the column is dropped;
        * otherwise the character of row b is used.

        Identical merged sequences have their ``ave_reads`` summed. One
        ``SeqRecord`` per distinct sequence (id = SHA-224 of the sequence) is
        appended to ``self.dna_seqs``.

        Args:
            threshold: minimum posterior for a record to be used.

        Returns:
            ``self.dna_seqs`` after the new records have been appended.

        Raises:
            ValueError: if a description does not contain the
                ``posterior=... ave_reads=...`` fields.
            SystemExit: if needle cannot be executed or is killed by a
                signal.
        '''
        import subprocess
        import re
        import hashlib
        import os
        import sys
        from Bio.Emboss.Applications import NeedleCommandline
        from pythonlib import Alignment

        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        desc_re = re.compile(r'posterior=(.*)\s*ave_reads=(.*)')

        files = []
        var_dict = {}
        for i, s in enumerate(self.seq_obj):
            m_obj = desc_re.search(s.description)
            if m_obj is None:
                raise ValueError('cannot parse posterior/ave_reads from %r'
                                 % s.description)
            post, ave_reads = float(m_obj.group(1)), float(m_obj.group(2))
            if post < threshold or ave_reads < 1.:
                continue
            if post > 1.0:
                print('WARNING: posterior=', post, file=sys.stderr)
            outfile = 'tmp%d.needle' % i
            files.append(outfile)
            # str(seq) is the portable spelling of the older seq.tostring().
            needle_cline = NeedleCommandline(
                asequence='asis:%s' % self.ref,
                bsequence='asis:%s' % str(s.seq).strip('-'),
                outfile=outfile, gapopen=10.0, gapextend=0.5,
                aformat='markx10')
            needle_cline.auto = True

            try:
                retcode = subprocess.call(str(needle_cline), shell=True)
            except OSError as ee:
                # The exception must be bound to a name to be reported.
                sys.exit('Execution of needle failed: %s' % ee)
            if retcode < 0:
                sys.exit('Child needle was terminated by signal %d' %
                         -retcode)

            try:
                tal = Alignment.alignfile2dict([outfile],
                                               'support_seqs%d' % i,
                                               10.0,
                                               0.5,
                                               Verbose=False)
            finally:
                # Remove the scratch file even when parsing fails.
                if os.path.exists(outfile):
                    os.remove(outfile)

            # First key of the parsed alignments (portable across Python 2/3).
            ka = next(iter(tal))
            this = tal[ka]['asis']

            this_seq = []
            for base_a, base_b in zip(this.seq_a, this.seq_b):
                if base_b == '-':
                    # Gap in row b: fall back to the character of row a.
                    assert base_a != '-', 'gap-gap?'
                    this_seq.append(base_a)
                elif base_a != '-':
                    this_seq.append(base_b)
                # Gap in row a only: the column is skipped.
            ws = ''.join(this_seq)
            var_dict[ws] = var_dict.get(ws, 0) + ave_reads

        for k, v in var_dict.items():
            ts = Seq(k, IUPAC.unambiguous_dna)
            # hashlib needs bytes on Python 3; sequences are plain ASCII.
            tsr = SeqRecord(ts, id=hashlib.sha224(k.encode('ascii')).hexdigest(),
                            name='Reconstructed local hap')
            tsr.description = 'ave_reads=%f' % v
            self.dna_seqs.append(tsr)
        print('%d haplotypes have support >=%f'
              % (len(files), threshold), file=sys.stderr)
        return self.dna_seqs
Ejemplo n.º 6
0
def count_codons(haps):
    '''Tabulate amino-acid variation of protease haplotypes.

    Every element of ``haps`` is passed to ``align_codons`` (defined
    elsewhere), which yields ``(start, residues, freq)`` with a 1-based
    nucleotide ``start``. For each result the function:

    * writes the rounded frequency and the wild-type nucleotide sequence,
      with ``residues`` spliced in at ``start``, to 'all.dat';
    * translates the in-frame part of ``residues`` and embeds it in the
      wild-type protein sequence;
    * prints the resulting protein haplotype and its frequency;
    * adds ``freq`` to the per-position residue counts;
    * aligns the haplotype to the wild type and records each mismatching
      column as ``<wt><column><mutant>``; the comma-joined list is the
      haplotype signature.

    Afterwards it prints the frequency per signature and summary totals,
    pickles the number of haplotypes per signature to 'degeneracy.pck',
    normalises the per-position counts and passes them to
    ``plot_variation``.

    Args:
        haps: sequence of items accepted by ``align_codons``; element ``[1]``
            of each item is summed to report the total number of reads.

    Returns:
        List of ``(signature, frequency)`` pairs sorted by decreasing
        frequency.
    '''
    import os
    import pickle
    from Bio.Seq import translate
    from operator import itemgetter
    from pythonlib import Alignment
    from pythonlib import mystats

    latex = False  # print latex table
    count = [{} for i in range(102)]
    hap_freq = {}
    degeneracy = {}
    mask_mupos = []  #[10, 11, 22, 25, 32, 46, 58, 62, 67, 74, 89]
    mupos = []
    # These sequences are HXB2 proteases
    wt_protease = 'PQVTLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF'
    wt_protease_nt = 'CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTA\
TTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTA\
TAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTT'

    ac_res = map(align_codons, haps)

    protease = wt_protease
    # 'with' guarantees that 'all.dat' is flushed and closed.
    with open('all.dat', 'w') as oh:
        for start, residues, freq in ac_res:
            # Skip failed alignments BEFORE doing arithmetic on start;
            # decrementing None would raise TypeError.
            if start is None or residues is None:
                continue
            # start arrives human (from 1); make it pythonic (from 0)
            start -= 1

            oh.write('%d %s\n' % (round(freq),
                                  wt_protease_nt[:start] + residues +
                                  wt_protease_nt[len(residues) + start:]))

            # Drop the leading bases that belong to an incomplete codon.
            frame = start % 3
            if frame == 0:
                read = residues
            elif frame == 1:
                read = residues[2:]
            else:
                read = residues[1:]
            try:
                aa = translate(read)  # Biopython
            except Exception:
                print('error: read %s' % (read,))
                continue

            # 1-based position of the first translated residue; explicit
            # floor division keeps the result an integer.
            if frame == 0:
                start_a = start // 3 + 1
            else:
                start_a = start // 3 + 2

            stop_a = len(aa) + start_a + 1

            this_hap = str(protease[:start_a - 1] + aa + protease[stop_a - 2:])

            # this is used for resistance prediction, whole haplotype and reads
            print('%s %s' % (this_hap.ljust(100), str(freq).ljust(8)))
            for i, c in enumerate(this_hap):
                count[i + 1][c] = count[i + 1].get(c, 0) + freq
            Alignment.needle_align('asis:%s' % wt_protease,
                                   'asis:%s ' % this_hap,
                                   'tmp', 10.0, 0.5)
            d = Alignment.alignfile2dict(['tmp'], 'n', 10.0, 0.5,
                                         Verbose=False)['asis']['asis']
            os.remove('tmp')

            mutations = []
            for i, c in enumerate(zip(d.seq_a, d.seq_b)):
                pos = i + 1
                if '-' in c:
                    continue
                if c[0] != c[1]:
                    mutations.append(c[0] + str(pos) + c[1])
                    if pos not in mask_mupos:
                        mupos.append(pos)
            signature = ', '.join(mutations)
            hap_freq[signature] = hap_freq.get(signature, 0.0) + freq
            degeneracy[signature] = degeneracy.get(signature, 0) + 1

    print('')
    for k, v in hap_freq.items():
        print('%s   %s' % (str(v).ljust(15), k))
    mupos = sorted(mupos)
    spos = {}
    for i, j in enumerate(mupos):
        spos[j] = i

    hf_sorted = sorted(hap_freq.items(), key=itemgetter(1), reverse=True)
    tot_reads = sum([h[1] for h in haps])
    tot_hap = sum(hap_freq.values())

    print('Tot reads after %s' % (tot_reads,))
    print('Tot %s' % (tot_hap,))
    print('Simpson\'s index on amino acid sequences = %f +/- %f' %
          mystats.Simpson(list(hap_freq.values())))
    # Pickle data is binary; 'wb' is correct on every platform and version.
    with open('degeneracy.pck', 'wb') as pck:
        pickle.dump(degeneracy, pck)

    # Turn the per-position counts into frequencies; float() avoids integer
    # division when the counts happen to be integers.
    for c in count:
        ts = sum(c.values())
        for k in c.keys():
            c[k] = c[k] / float(ts)
    plot_variation(count)
    if not latex:
        return hf_sorted
    print('')
    print('|c' * (1 + len(spos)))
    cells = ['%s%d & ' % (wt_protease[i - 1], i) for i in mupos]
    # Space-separated cells plus a trailing separator, on one line.
    print(' '.join(cells + ['']))

    return hf_sorted