Ejemplos de Alignment en Python, ejemplos de pythonlib.Alignment en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: LocalStructure.py Proyecto: ozagordi/old-svn-sandbox

    def get_offset(self,
                   ref_file='~/References/HIV-HXB2.fasta',
                   gene='protease'):
        '''Align the consensus to one gene of the reference and store the offset.

        The region of `ref_file` delimited by ``gene_coord[gene]`` is aligned
        with ``Alignment.needle_align`` against ``self.cons``.  The ``start``
        attribute of the resulting alignment is saved in ``self.offset`` and
        reported on stderr.

        Parameters:
            ref_file: reference sequence file; the gene coordinates are
                appended to it in the form ``[start:stop]``.
            gene: key into the ``gene_coord`` mapping, which is defined
                outside this method.

        Returns:
            None.  The result is stored in ``self.offset``.
        '''
        from pythonlib import Alignment
        import os

        outfile = 'ppp.tmp'
        start, stop = gene_coord[gene]
        usa_seq = ref_file + '[%d:%d]' % (start, stop)
        try:
            Alignment.needle_align(usa_seq,
                                   'asis:%s' % self.cons,
                                   outfile,
                                   go=10.0,
                                   ge=0.5)
            tal = Alignment.alignfile2dict([outfile], 'get_offset', 10.0, 0.5)
        finally:
            # The scratch file must not survive a failed alignment or parse.
            if os.path.exists(outfile):
                os.remove(outfile)

        # list(...)[0] instead of .keys()[0]: dict views cannot be indexed
        # on Python 3, and an empty result still raises IndexError.
        ka = list(tal)[0]
        this = tal[ka]['asis']
        # summary() is called before reading .start, as in the original code;
        # presumably it fills in the alignment coordinates - TODO confirm.
        this.summary()
        self.offset = this.start
        print('Offset consensus w.r.t',
              ref_file,
              'is',
              self.offset,
              file=sys.stderr)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: LocalStructure.py Proyecto: ozagordi/old-svn-sandbox

    def get_cons(self, plurality=0.1):
        '''Build a consensus with EMBOSS cons and merge it with the reference.

        ``cons`` is run on ``self.sup_file``.  Its first output record is
        upper-cased, every ``N`` is removed, and the result is aligned to
        ``self.ref_file`` with ``Alignment.needle_align``.  The alignment is
        then walked column by column over ``seq_a`` / ``seq_b`` (presumably
        reference / consensus, following the argument order given to needle
        - TODO confirm):

        * gap in ``seq_b``: the ``seq_a`` character is kept;
        * gap in ``seq_a``: the column is dropped;
        * otherwise: the ``seq_b`` character is kept.

        Parameters:
            plurality: value passed to the ``-plurality`` option of ``cons``.

        Returns:
            The merged sequence as a string.
        '''
        import subprocess
        import os
        from pythonlib import Alignment

        cline = 'cons -sequence %s -stdout -auto' % self.sup_file
        cline += ' -plurality %f' % plurality

        # universal_newlines opens the pipes in text mode, which the FASTA
        # parser requires on Python 3 (harmless on Python 2).
        proc = subprocess.Popen(cline, shell=True, bufsize=1024,
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                close_fds=True, universal_newlines=True)
        try:
            # Nothing is ever written to the child: give it EOF right away.
            proc.stdin.close()
            first_record = list(SeqIO.parse(proc.stdout, 'fasta'))[0]
        finally:
            # Release the pipe and reap the child so no zombie is left.
            proc.stdout.close()
            proc.wait()

        # str(seq) is the portable spelling of the old Seq.tostring().
        sc = str(first_record.seq).upper()
        strcons = sc.replace('N', '')

        outfile = 'tmp.tmp'
        try:
            Alignment.needle_align(self.ref_file, 'asis:%s' % strcons,
                                   outfile, go=10.0, ge=0.5)
            tal = Alignment.alignfile2dict([outfile], 'ref_cons_alignment',
                                           10.0, 0.5)
        finally:
            # The scratch file must not survive a failed alignment or parse.
            if os.path.exists(outfile):
                os.remove(outfile)

        # list(...)[0] instead of .keys()[0]: works on Python 2 and 3.
        ka = list(tal)[0]
        this = tal[ka]['asis']

        this_seq = []
        for char_a, char_b in zip(this.seq_a, this.seq_b):
            if char_b == '-':
                assert char_a != '-', 'gap-gap?'
                this_seq.append(char_a)
            elif char_a != '-':
                this_seq.append(char_b)
        return ''.join(this_seq)

Ejemplo n.º 3

0

Mostrar archivo

Archivo: check_outliers.py Proyecto: ozagordi/old-svn-sandbox

def find_best_split(seq):
    '''Find the cut point of a read whose two parts best match the clones.

    Candidate cut points are taken at multiples of a quarter of the read
    length (from 25% up to, but excluding, 76%).  For each candidate the two
    parts are written to ``tmp1.fas`` / ``tmp2.fas`` and aligned against
    ``all_clones.fas`` with ``Alignment.needle_align``.  The best-scoring
    clone of each part is selected, and the candidate with the highest summed
    score wins; on ties the later candidate wins, because the comparison
    is ``>=``.

    Parameters:
        seq: a sequence record exposing ``len()``, ``.id`` and ``.seq``.

    Returns:
        A tuple ``(best_score, best_split, best_gaps, clones, al_start_1,
        al_stop_1, al_start_2, al_stop_2)``.  ``clones`` and the four
        alignment coordinates are ``None`` when no candidate reached a
        summed score of at least 0.

    Raises:
        ValueError: if the sequence is shorter than four positions, so that
            the quarter-length step is zero.
    '''
    import heapq
    import operator

    seq_len = len(seq)
    low_lim = int(0.25 * seq_len)
    up_lim = int(0.76 * seq_len)
    step = int(0.25 * seq_len)
    if step < 1:
        # range() would raise an opaque "arg 3 must not be zero" ValueError.
        raise ValueError('sequence of length %d is too short to be split' %
                         seq_len)

    ref_genome = 'all_clones.fas'
    best_score = 0
    best_split = 0
    best_gaps = 0
    # Pre-bound so the return statement cannot hit an unbound local.
    clones = None
    al_start_1 = al_stop_1 = al_start_2 = al_stop_2 = None

    read_id = seq.id.split('#')[0]
    by_score = operator.itemgetter(1)

    for split in range(low_lim, up_lim, step):
        # Cut at the current candidate.  The original always cut at low_lim,
        # so every iteration evaluated the very same split.
        s1 = seq.seq[:split]
        s2 = seq.seq[split:]
        with open('tmp1.fas', 'w') as h1:
            h1.write('>%s_1\n' % read_id)
            h1.write(str(s1) + '\n')

        # NOTE(review): the second part is also labelled "_1"; kept as is
        # because the alignment keys may depend on it - confirm intent.
        with open('tmp2.fas', 'w') as h2:
            h2.write('>%s_1\n' % read_id)
            h2.write(str(s2) + '\n')

        Alignment.needle_align('tmp1.fas', ref_genome, 'tmp1.needle')
        alset_1 = Alignment.alignfile2set(['tmp1.needle'], 'split_1', 6.0, 3.0)
        os.unlink('tmp1.needle')

        Alignment.needle_align('tmp2.fas', ref_genome, 'tmp2.needle')
        alset_2 = Alignment.alignfile2set(['tmp2.needle'], 'split_2', 6.0, 3.0)
        os.unlink('tmp2.needle')

        # list(...)[0] and .items() work on both Python 2 and Python 3.
        k1 = list(alset_1)[0]
        scores_1 = [(name, al.score) for name, al in alset_1[k1].items()]
        top_1 = heapq.nlargest(1, scores_1, key=by_score)[0]

        k2 = list(alset_2)[0]
        scores_2 = [(name, al.score) for name, al in alset_2[k2].items()]
        top_2 = heapq.nlargest(1, scores_2, key=by_score)[0]

        if top_1[1] + top_2[1] >= best_score:
            best_score = top_1[1] + top_2[1]
            best_split = split
            clones = top_1[0], top_2[0]
            al_1 = alset_1[k1][clones[0]]
            al_2 = alset_2[k2][clones[1]]
            # summary() is called before reading gaps and coordinates, as in
            # the original; presumably it computes them - TODO confirm.
            al_1.summary()
            al_2.summary()
            best_gaps = al_1.int_gaps + al_2.int_gaps
            al_start_1, al_stop_1 = al_1.start, al_1.stop
            al_start_2, al_stop_2 = al_2.start, al_2.stop

    return (best_score, best_split, best_gaps, clones, al_start_1, al_stop_1,
            al_start_2, al_stop_2)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: just_error.py Proyecto: ozagordi/old-svn-sandbox

def find_closest_here(reads_file):
    '''Align reads to ``ref.fasta`` with needle and collect alignment statistics.

    ``needle`` is run (gap open 6.0, gap extend 3.0, markx10 output) with
    ``ref.fasta`` as the first sequence and `reads_file` as the second.  The
    output is parsed with ``Alignment.alignfile2dict`` and the alignments
    stored under its first key are summarised.

    No threshold is applied here: the ``diff_thresh`` / ``abs_thresh``
    values that an earlier version of this docstring discussed are not
    used by this function.

    Parameters:
        reads_file: path of the file with the reads, interpolated into a
            shell command line.

    Returns:
        Four parallel lists with one entry per alignment:
        ``ig`` (``int_gaps``), ``this`` (``ident`` divided by the aligned
        span ``stop - start + 1``), ``mm`` (``mismatch``) and ``ident_2``
        (``ident`` divided by the span minus ``int_gaps``).
    '''
    from pythonlib import Alignment
    import tempfile
    import subprocess

    ref_file = 'ref.fasta'
    # The context manager removes the temporary file even if needle or the
    # parser fails.
    with tempfile.NamedTemporaryFile() as out:
        outname = out.name
        cmline = ('needle -asequence %s -bsequence %s '
                  '-gapopen 6.0 -gapextend 3.0 -auto -adesshow3 '
                  '-out %s -aformat3 markx10') % (ref_file, reads_file,
                                                  outname)
        subprocess.call(cmline, shell=True)
        # Parse once and reuse the result; the original parsed the same
        # file twice only to look up the first key.
        parsed = Alignment.alignfile2dict([outname], 'n', 6.0, 3.0,
                                          Verbose=False)
        # list(...)[0] instead of .keys()[0]: works on Python 2 and 3.
        d = parsed[list(parsed)[0]]

    this = []
    mm = []
    ident_2 = []
    ig = []
    for v in d.values():
        v.summary()
        span = v.stop - v.start + 1
        ig.append(v.int_gaps)
        this.append(float(v.ident) / span)
        mm.append(v.mismatch)
        ident_2.append(float(v.ident) / (span - v.int_gaps))
    return ig, this, mm, ident_2

Ejemplo n.º 5

0

Mostrar archivo

def find_closest(hr):
    '''Assign a haplotype to its closest reference, if the choice is clear.

    The haplotype is aligned with ``needle`` (gap open 10.0, gap extend 1.0)
    against every sequence in the reference file.  For each alignment the
    distance is ``mismatch / (stop - start + 1)``.  The closest reference is
    accepted when its distance is at most ``abs_thresh`` (0.1) and the
    runner-up is at least ``diff_thresh`` (0.0125) farther away.  With a
    single reference there is no runner-up, so only ``abs_thresh`` applies.

    Parameters:
        hr: a pair ``(hap, ref_file)``: the haplotype sequence as a string
            and the path of the reference file.

    Returns:
        ``(ref_id, distance, mismatches, internal_gaps)`` when the closest
        reference is accepted, otherwise the two-element tuple
        ``(None, internal_gaps)`` for the closest reference.  Callers must
        handle both shapes.

    Raises:
        IndexError: if the alignment output contains no alignment at all.
    '''
    from pythonlib import Alignment
    import tempfile
    import subprocess
    import heapq
    import operator

    diff_thresh = 0.0125
    abs_thresh = 0.1

    hap, ref_file = hr
    # The context manager removes the temporary file even if needle or the
    # parser fails.
    with tempfile.NamedTemporaryFile() as out:
        outname = out.name
        cmline = ('needle -asequence asis:\'%s\' -bsequence %s '
                  '-gapopen 10.0 -gapextend 1.0 -auto -adesshow3 '
                  '-out %s -aformat3 markx10') % (hap, ref_file, outname)
        subprocess.call(cmline, shell=True)
        # NOTE(review): the parser receives 6.0/3.0 while needle ran with
        # 10.0/1.0; kept as in the original - confirm what they are used for.
        d = Alignment.alignfile2dict([outname], 'n', 6.0, 3.0,
                                     Verbose=False)['asis']

    dist = {}
    mm = {}
    gaps = {}
    for v in d.values():
        v.summary()
        dist[v.id_b] = float(v.mismatch) / (v.stop - v.start + 1)
        mm[v.id_b] = v.mismatch
        gaps[v.id_b] = v.int_gaps

    best2 = heapq.nsmallest(2, dist.items(), key=operator.itemgetter(1))
    best_id, best_dist = best2[0]
    if len(best2) > 1:
        # Absolute (not relative) gap between runner-up and best.
        delta = best2[1][1] - best_dist
    else:
        # Only one candidate: nothing it could be confused with.
        delta = float('inf')

    if delta >= diff_thresh and best_dist <= abs_thresh:
        return best_id, best_dist, mm[best_id], gaps[best_id]
    return None, gaps[best_id]

Ejemplo n.º 6

0

Mostrar archivo

Archivo: LocalStructure.py Proyecto: ozagordi/old-svn-sandbox

    def alignedvariants(self, threshold=0.9):
        '''Collect well-supported variants, merged with the reference.

        For every record in ``self.seq_obj`` the ``posterior`` and
        ``ave_reads`` values are parsed from its description.  Records with
        ``posterior < threshold`` or ``ave_reads < 1`` are skipped.  Each
        remaining sequence (with leading/trailing ``-`` stripped) is aligned
        to ``self.ref`` with EMBOSS needle, and the alignment is walked
        column by column over ``seq_a`` / ``seq_b`` (presumably reference /
        variant, following the argument order given to needle - TODO
        confirm):

        * gap in ``seq_b``: the ``seq_a`` character is kept;
        * gap in ``seq_a``: the column is dropped;
        * otherwise: the ``seq_b`` character is kept.

        Identical merged sequences have their ``ave_reads`` summed.  One
        ``SeqRecord`` per distinct sequence (id = SHA-224 of the sequence)
        is appended to ``self.dna_seqs``.

        Parameters:
            threshold: minimum posterior for a record to be considered.

        Returns:
            ``self.dna_seqs`` after the new records have been appended.
        '''
        import subprocess
        import re
        import hashlib
        from Bio.Emboss.Applications import NeedleCommandline
        from pythonlib import Alignment

        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        desc_re = re.compile(r'posterior=(.*)\s*ave_reads=(.*)')

        files = []
        var_dict = {}
        for i, s in enumerate(self.seq_obj):
            m_obj = desc_re.search(s.description)
            post, ave_reads = float(m_obj.group(1)), float(m_obj.group(2))
            if post < threshold or ave_reads < 1.:
                continue
            if post > 1.0:
                print('WARNING: posterior=', post, file=sys.stderr)
            outfile = 'tmp%d.needle' % i
            files.append(outfile)
            # str(seq) is the portable spelling of the old Seq.tostring().
            needle_cline = NeedleCommandline(
                asequence='asis:%s' % self.ref,
                bsequence='asis:%s' % str(s.seq).strip('-'),
                outfile=outfile, gapopen=10.0, gapextend=0.5,
                aformat='markx10')
            needle_cline.auto = True

            try:
                retcode = subprocess.call(str(needle_cline), shell=True)
            except OSError as ee:
                # The original handler used `ee` without binding it, which
                # turned this error path into a NameError.
                sys.exit('Execution of needle failed: %s' % ee)
            if retcode < 0:
                sys.exit('Child needle was terminated by signal %d' %
                         -retcode)

            tal = Alignment.alignfile2dict([outfile],
                                           'support_seqs%d' % i,
                                           10.0,
                                           0.5,
                                           Verbose=False)
            os.remove(outfile)
            # list(...)[0] instead of .keys()[0]: works on Python 2 and 3.
            ka = list(tal)[0]
            this = tal[ka]['asis']

            this_seq = []
            for char_a, char_b in zip(this.seq_a, this.seq_b):
                if char_b == '-':
                    assert char_a != '-', 'gap-gap?'
                    this_seq.append(char_a)
                elif char_a != '-':
                    this_seq.append(char_b)
            ws = ''.join(this_seq)
            var_dict[ws] = var_dict.get(ws, 0) + ave_reads

        for k, v in var_dict.items():
            ts = Seq(k, IUPAC.unambiguous_dna)
            # hashlib needs bytes on Python 3.
            digest = hashlib.sha224(k.encode('utf-8')).hexdigest()
            tsr = SeqRecord(ts, id=digest, name='Reconstructed local hap')
            tsr.description = 'ave_reads=%f' % v
            self.dna_seqs.append(tsr)
        print('%d haplotypes have support >=%f'\
              % (len(files), threshold), file=sys.stderr)
        return self.dna_seqs

Ejemplo n.º 7

0

Mostrar archivo

def count_codons(haps):
    '''Translate aligned haplotypes and tabulate their amino acid mutations.

    Every element of `haps` is passed to ``align_codons``, which yields a
    triple ``(start, residues, freq)`` with a 1-based nucleotide start.  For
    each triple:

    * the nucleotide haplotype, embedded in the wild-type protease
      sequence, is written to ``all.dat`` together with its rounded
      frequency;
    * the in-frame part of ``residues`` is translated and embedded in the
      wild-type protein, and the result is printed with its frequency;
    * per-position amino acid counts are accumulated (weighted by ``freq``);
    * the protein is aligned to the wild type with needle and its
      substitutions (e.g. ``L10I``) form a signature whose frequency and
      degeneracy (number of haplotypes sharing it) are accumulated.

    Afterwards the degeneracy table is pickled to ``degeneracy.pck``, the
    per-position counts are normalised to frequencies and handed to
    ``plot_variation``.

    Parameters:
        haps: sequence of haplotypes accepted by ``align_codons``; element
            ``[1]`` of each is summed as its read count.

    Returns:
        A list of ``(signature, frequency)`` pairs sorted by decreasing
        frequency.
    '''
    import pickle
    from Bio.Seq import translate
    from operator import itemgetter
    from pythonlib import Alignment
    from pythonlib import mystats

    latex = False  # print latex table
    count = [{} for _ in range(102)]
    hap_freq = {}
    degeneracy = {}
    mask_mupos = []  #[10, 11, 22, 25, 32, 46, 58, 62, 67, 74, 89]
    mupos = []
    # These sequences are HXB2 proteases
    wt_protease = 'PQVTLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF'
    wt_protease_nt = 'CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTA\
TTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTA\
TAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTT'

    ac_res = [align_codons(h) for h in haps]

    protease = wt_protease
    # `with` guarantees all.dat is flushed and closed; the original rebound
    # the handle later without ever closing it.
    with open('all.dat', 'w') as oh:
        for start, residues, freq in ac_res:
            # Test for "no alignment" BEFORE doing arithmetic on start; the
            # original decremented first, which fails on None.
            if start is None and residues is None:
                continue
            start -= 1  # from human (1-based) to pythonic (0-based)

            oh.write('%d %s\n' %
                     (round(freq), wt_protease_nt[:start] + residues +
                      wt_protease_nt[len(residues) + start:]))

            # Drop the leading bases that belong to an incomplete codon.
            frame = start % 3
            if frame == 0:
                read = residues
            elif frame == 1:
                read = residues[2:]
            else:
                read = residues[1:]
            try:
                aa = translate(read)  # Biopython
            except Exception:
                print('error: read %s' % read)
                continue

            # 1-based position of the first fully covered codon; floor
            # division keeps it an integer on Python 3 as well.
            if frame == 0:
                start_a = start // 3 + 1
            else:
                start_a = start // 3 + 2

            stop_a = len(aa) + start_a + 1

            this_hap = str(protease[:start_a - 1] + aa +
                           protease[stop_a - 2:])

            # this is used for resistance prediction, whole haplotype and reads
            print('%s %s' % (this_hap.ljust(100), str(freq).ljust(8)))
            for i, c in enumerate(this_hap):
                count[i + 1][c] = count[i + 1].get(c, 0) + freq
            Alignment.needle_align('asis:%s' % wt_protease,
                                   'asis:%s ' % this_hap, 'tmp', 10.0, 0.5)
            d = Alignment.alignfile2dict(['tmp'], 'n', 10.0, 0.5,
                                         Verbose=False)['asis']['asis']
            os.remove('tmp')

            mutations = []
            for i, c in enumerate(zip(d.seq_a, d.seq_b)):
                pos = i + 1
                if '-' in c:
                    continue
                if c[0] != c[1]:
                    mutations.append(c[0] + str(pos) + c[1])
                    if pos not in mask_mupos:
                        mupos.append(pos)
            signature = ', '.join(mutations)
            hap_freq[signature] = hap_freq.get(signature, 0.0) + freq
            degeneracy[signature] = degeneracy.get(signature, 0) + 1

    print('')
    for k, v in hap_freq.items():
        print('%s   %s' % (str(v).ljust(15), k))
    mupos = sorted(mupos)
    spos = {}
    for i, j in enumerate(mupos):
        spos[j] = i

    hf_sorted = sorted(hap_freq.items(), key=itemgetter(1), reverse=True)
    tot_reads = sum([h[1] for h in haps])
    tot_hap = sum(hap_freq.values())

    print('Tot reads after %s' % tot_reads)
    print('Tot %s' % tot_hap)
    print('Simpson\'s index on amino acid sequences = %f +/- %f' %
          mystats.Simpson(list(hap_freq.values())))
    # Binary mode: required by pickle on Python 3, harmless on Python 2.
    with open('degeneracy.pck', 'wb') as ph:
        pickle.dump(degeneracy, ph)

    for c in count:
        # float() forces true division even when the weights are integers.
        ts = float(sum(c.values()))
        for k in c:
            c[k] /= ts
    plot_variation(count)
    if not latex:
        return hf_sorted
    print('')
    print('|c' * (1 + len(spos)))
    # One row; every cell is followed by a blank, as the original
    # trailing-comma print statements produced.
    print(''.join('%s%d &  ' % (wt_protease[i - 1], i) for i in mupos))

    return hf_sorted

Ejemplo n.º 8

0

Mostrar archivo

Archivo: check_recombination.py Proyecto: ozagordi/old-svn-sandbox

def main():

    from Bio import SeqIO
    import cPickle
    from multiprocessing import cpu_count
    from pythonlib import Alignment
    from pythonlib import pprocess
    import operator
    import heapq
    import time

    try:
        n_proc = cpu_count()
    except NotImplementedError:
        n_proc = 4

    HPP = cPickle.HIGHEST_PROTOCOL
    min_len = 200
    args = sys.argv

    try:
        reads_file, clones_file = args[1].rstrip('/'), args[2]
    except:
        sys.exit('usage: check_recombination.py reads_file clones_file')

    reads_dict = {}
    reads_dict_1 = {}
    reads_dict_2 = {}
    gaps_dict = {}
    gaps_dict_1 = {}
    gaps_dict_2 = {}

    f_fasta = open(reads_file)
    tmp_seqlist = list(SeqIO.parse(f_fasta, 'fasta'))
    f_fasta.close()
    countreads = len(tmp_seqlist)
    print >> sys.stderr, ' %d reads in the original file '.center(
        60, '-') % countreads

    seqlist = [s for s in tmp_seqlist if len(s) > min_len]

    print >> sys.stderr, ' %d reads are longer than %d '.center(
        60, '-') % (len(seqlist), min_len)

    try:
        t = time.time()
        print >> sys.stderr, ' loading file '.center(60, '-')
        wh = open('%s-check-reads_total.pck' %
                  reads_file.replace('.', 'U').replace('/', '-'))
        al_set_total = cPickle.load(wh)
        wh.close()

        print >> sys.stderr, ' loading file '.center(60, '-')
        wh = open('%s-check-reads_1.pck' %
                  reads_file.replace('.', 'U').replace('/', '-'))
        al_set_1 = cPickle.load(wh)
        wh.close()

        print >> sys.stderr, ' loading file '.center(60, '-')
        wh = open('%s-check-reads_2.pck' %
                  reads_file.replace('.', 'U').replace('/', '-'))
        al_set_2 = cPickle.load(wh)
        wh.close()
        print >> sys.stderr, ' pickle objects loaded in %d seconds '.center(
            60, '-') % (time.time() - t)

    except:
        print >> sys.stderr, ' pickle objects not found, aligning '.center(
            60, '-')
        # reads are considered already aligned
        f_fasta_forward_filename = 'tmp_reads.fas'
        f_fasta_forward = open(f_fasta_forward_filename, 'w')
        SeqIO.write(seqlist, f_fasta_forward, 'fasta')
        f_fasta_forward.close()

        # split in 2, first segment
        f_fasta = open(f_fasta_forward_filename)
        tmp = list(SeqIO.parse(f_fasta, 'fasta'))
        f_fasta.close()
        for seq in tmp:
            l = len(seq)
            middle = int(float(l) / 2)
            seq.seq = seq.seq[:middle]
        out_file = 'tmp_reads_1.fas'
        f_fasta_forward = open(out_file, 'w')
        SeqIO.write(tmp, f_fasta_forward, 'fasta')
        f_fasta_forward.close()
        del tmp

        # split in 2, second segment
        f_fasta = open(f_fasta_forward_filename)
        tmp = list(SeqIO.parse(f_fasta, 'fasta'))
        f_fasta.close()
        for seq in tmp:
            l = len(seq)
            middle = int(float(l) / 2)
            seq.seq = seq.seq[middle:]
        out_file = 'tmp_reads_2.fas'
        f_fasta_forward = open(out_file, 'w')
        SeqIO.write(tmp, f_fasta_forward, 'fasta')
        f_fasta_forward.close()
        del tmp

        # clones
        hc = open(clones_file)
        clones_list = list((SeqIO.parse(hc, 'fasta')))

        i = 0
        tmp_files = []
        for c in clones_list:
            print >> sys.stderr, ' clone %s '.center(60, '-') % c.id
            tfn = 'tmp%d.fas' % i
            tmp_files.append(tfn)
            th = open(tfn, 'w')
            th.write('>%s\n' % c.id)
            th.write('%s' % c.seq.tostring())
            th.close()
            i += 1
        # parallelism
        queue = pprocess.Queue(limit=n_proc)

        ral_par = queue.manage(pprocess.MakeParallel(Alignment.needle_align))
        for tf in tmp_files:
            # align total
            ral_par(tf, 'tmp_reads.fas', tf.split('.')[0] + '-total.needle')

        for tf in tmp_files:
            # align total
            ral_par(tf, 'tmp_reads_1.fas', tf.split('.')[0] + '-1.needle')

        for tf in tmp_files:
            # align total
            ral_par(tf, 'tmp_reads_2.fas', tf.split('.')[0] + '-2.needle')

        for res in queue:
            if Verbose:
                print >> sys.stderr, res[0], res[1]

        # alignment with whole reads
        files = [
            f for f in os.listdir('./')
            if f.startswith('tmp') and f.endswith('-total.needle')
        ]
        al_set_total = Alignment.alignfile2set(files, 'total_read', 6.0, 3.0)

        wh = open(
            '%s-check-reads_total.pck' %
            reads_file.replace('.', 'U').replace('/', '-'), 'w')
        cPickle.dump(al_set_total, wh, HPP)
        wh.close()

        # alignment with first half
        files = [
            f for f in os.listdir('./')
            if f.startswith('tmp') and f.endswith('-1.needle')
        ]
        al_set_1 = Alignment.alignfile2set(files, 'total_read', 6.0, 3.0)

        wh = open(
            '%s-check-reads_1.pck' %
            reads_file.replace('.', 'U').replace('/', '-'), 'w')
        cPickle.dump(al_set_1, wh, HPP)
        wh.close()

        # alignment with second half
        files = [
            f for f in os.listdir('./')
            if f.startswith('tmp') and f.endswith('-2.needle')
        ]
        al_set_2 = Alignment.alignfile2set(files, 'total_read', 6.0, 3.0)

        wh = open(
            '%s-check-reads_2.pck' %
            reads_file.replace('.', 'U').replace('/', '-'), 'w')
        cPickle.dump(al_set_2, wh, HPP)
        wh.close()

        # except ends

    count = 0
    for i in al_set_total:
        for j in al_set_total[i]:
            count += 1

    ial_set_total = invert_keys(al_set_total)
    ial_set_1 = invert_keys(al_set_1)
    ial_set_2 = invert_keys(al_set_2)

    del al_set_total
    del al_set_1
    del al_set_2

    r_keys = ial_set_total.keys()
    lost = len(seqlist) - len(r_keys)
    assert lost == 0, 'lost' + str(lost) + 'reads'

    delta = []
    ambiguous = 0
    total_delta = []
    amb_delta = []
    tot_score = []
    sum_score = []
    tot_score_amb = []
    sum_score_amb = []
    outliers = []
    best_out = {}
    thresh_inc = 0.05
    print >> sys.stderr, 'Total reads', len(r_keys)

    skewness = []
    skewness_amb = []

    for k in ial_set_total:
        total = ial_set_total[k]
        s1 = ial_set_1[k]
        s2 = ial_set_2[k]

        l_tot = [(s[0], s[1].score) for s in total.iteritems()]
        best2_total = heapq.nlargest(2, iter(l_tot), operator.itemgetter(1))

        l_1 = [(s[0], s[1].score) for s in s1.iteritems()]
        best2_s1 = heapq.nlargest(2, iter(l_1), operator.itemgetter(1))

        l_2 = [(s[0], s[1].score) for s in s2.iteritems()]
        best2_s2 = heapq.nlargest(2, iter(l_2), operator.itemgetter(1))

        clone_t = best2_total[0][0]
        clone_1 = best2_s1[0][0]
        clone_2 = best2_s2[0][0]

        ial_set_total[k][clone_t].summary()
        ial_set_1[k][clone_1].summary()
        ial_set_2[k][clone_2].summary()

        len_t = ial_set_total[k][clone_t].stop - ial_set_total[k][
            clone_t].start + 1
        len_1 = ial_set_1[k][clone_1].stop - ial_set_1[k][clone_1].start + 1
        len_2 = ial_set_2[k][clone_2].stop - ial_set_2[k][clone_2].start + 1

        bt = best2_total[0][1] / len_t
        b1 = best2_s1[0][1] / len_1
        b2 = best2_s2[0][1] / len_2

        relative_gain = (b1 + b2 - 2 * bt) / (b1 + b2)

        # if 0.4 < relative_gain and relative_gain < 0.8:
        if abs(len_t - len_1 - len_2) > 5000:
            print best2_total
            print best2_s1
            print best2_s2

            print len_t
            print len_1
            print len_2
            print ial_set_total[k][clone_t].seq_a
            print ial_set_total[k][clone_t].seq_b

            print ial_set_1[k][clone_1].seq_a
            print ial_set_1[k][clone_1].seq_b

            print ial_set_2[k][clone_2].seq_a
            print ial_set_2[k][clone_2].seq_b
            sys.exit()

        if best2_total[0][0] != best2_s1[0][0] or best2_total[0][
                0] != best2_s2[0][0]:
            amb_delta.append(relative_gain)
            skewness_amb.append(abs(b1 - b2) / (bt))
            tot_score_amb.append(bt)
            sum_score_amb.append((b1 + b2) / 2)
            ambiguous += 1
            if relative_gain > 0.05 and b1 > 4.5 and b2 > 4.5:
                tk = k  #.split('#')[0]
                outliers.append(tk)
                best_out[tk] = [
                    best2_total[0][0], best2_s1[0][0], best2_s2[0][0]
                ]
        else:
            total_delta.append(relative_gain)
            skewness.append(abs(b1 - b2) / (bt))
            tot_score.append(bt)
            sum_score.append((b1 + b2) / 2)

    #    print >> sys.stderr, discarded, 'reads had two matches'
    print >> sys.stderr, ambiguous, 'potentially ambiguous'
    print >> sys.stderr, len(outliers), 'outliers'
    # write the outliers
    handle = open(reads_file)
    tmp_dict = SeqIO.to_dict(SeqIO.parse(handle, 'fasta'))
    reads_dict = {}
    for k in tmp_dict.keys():
        k1 = k.split('#')[0]
        reads_dict[k1] = tmp_dict[k]
    out_list = [reads_dict[r] for r in outliers]
    out_handle = open('outliers.fas', 'w')
    SeqIO.write(out_list, out_handle, 'fasta')
    out_handle.close()
    #    for k in best_out:
    #        print >> sys.stderr, k, best_out[k]
    plot_amb_hist(total_delta, amb_delta)