Exemple #1
0
def get_dbg_str(indelfo):
    """
    Return a multi-line, colored string that lines up the gapped germline and query sequences in <indelfo>, followed by one summary line per indel.

    Reads indelfo['qr_gap_seq'], indelfo['gl_gap_seq'], indelfo['genes'] and indelfo['indels'].
    Raises an Exception (after printing both sequences) if the two gapped sequences differ in length.
    """
    qr_gap_seq, gl_gap_seq = indelfo['qr_gap_seq'], indelfo['gl_gap_seq']
    if len(qr_gap_seq) != len(gl_gap_seq):
        print(qr_gap_seq)
        print(gl_gap_seq)
        raise Exception('different length qr and gl gap seqs (see previous lines)')

    def shown_char(base):  # gap characters are displayed as a star just because that's what it originally was... at some point should switch to leaving whatever gap char it was
        return '*' if base in utils.gap_chars else base

    qr_pieces, gl_pieces = [], []
    for qr_base, gl_base in zip(qr_gap_seq, gl_gap_seq):
        qr_color = gl_color = None
        if qr_base in utils.gap_chars or gl_base in utils.gap_chars:  # a gap in either sequence highlights the position in both
            qr_color = gl_color = 'light_blue'
        elif qr_base in utils.ambiguous_bases:
            qr_color = 'light_blue'
        elif gl_base in utils.ambiguous_bases:
            gl_color = 'light_blue'
        elif qr_base != gl_base:  # plain mismatch: only the query base is marked
            qr_color = 'red'
        qr_pieces.append(utils.color(qr_color, shown_char(qr_base)))
        gl_pieces.append(utils.color(gl_color, shown_char(gl_base)))
    qr_display = ''.join(qr_pieces)
    gl_display = ''.join(gl_pieces)

    # the left-hand label column is as wide as 'query', or as wide as the colored v gene name if there is one
    label_width = len('query')
    v_gene_label = ''
    if 'v' in indelfo['genes']:
        v_gene_label = utils.color_gene(indelfo['genes']['v'], width=label_width, leftpad=True)
        label_width = utils.len_excluding_colors(v_gene_label)
    label_fmt = '  %' + str(label_width) + 's'
    dj_gene_label = ' '.join(utils.color_gene(indelfo['genes'][region]) for region in 'dj' if region in indelfo['genes'])

    out_lines = [
        (label_fmt + '  %s  %s') % (v_gene_label, gl_display, dj_gene_label),
        (label_fmt + '  %s') % ('query', qr_display),
    ]
    out_lines.extend(
        '%10s: %d base%s at %d (%s)' % (ifo['type'], ifo['len'], utils.plural(ifo['len']), ifo['pos'], ifo['seqstr'])
        for ifo in indelfo['indels'])
    return '\n'.join(out_lines)
Exemple #2
0
    def print_partitions(self,
                         reco_info=None,
                         extrastr='',
                         abbreviate=True,
                         print_header=True,
                         n_to_print=None,
                         calc_missing_values='none',
                         highlight_cluster_indices=None,
                         print_partition_indices=False,
                         ipart_center=None,
                         sort_by_size=True):
        assert calc_missing_values in ['none', 'all', 'best']
        if reco_info is not None and calc_missing_values == 'all':
            self.calculate_missing_values(reco_info)

        if print_header:
            print '    %s%7s %10s   %-7s %s%5s  %4s' % (
                ' ' * utils.len_excluding_colors(extrastr), '', 'logprob',
                'delta', 'index  ' if print_partition_indices else '',
                'clusters', 'n_procs'),
            if reco_info is not None or self.we_have_a_ccf:
                print ' %5s %5s' % ('purity', 'completeness'),
            print ''

        for ip in self.get_surrounding_partitions(n_to_print,
                                                  i_center=ipart_center):
            if reco_info is not None and calc_missing_values == 'best' and ip == self.i_best:
                self.calculate_missing_values(reco_info, only_ip=ip)
            mark = '      '
            if ip == self.i_best:
                mark = 'best  '
            if ip == self.i_best_minus_x:
                mark = mark[:-2] + '* '
            if mark.count(' ') < len(mark):
                mark = utils.color('yellow', mark)
            right_extrastr = '' if self.n_seqs(
            ) < 200 else mark  # if line is going to be really long, put the yellow stuff also on the right side
            self.print_partition(
                ip,
                reco_info,
                extrastr=extrastr + mark,
                abbreviate=abbreviate,
                highlight_cluster_indices=highlight_cluster_indices,
                print_partition_indices=print_partition_indices,
                right_extrastr=right_extrastr,
                sort_by_size=sort_by_size)
Exemple #3
0
    def run(self, args):
        """
        Run the command for each entry in self.tests, sending its stdout and stderr to self.logfname and storing the elapsed time in self.run_times.

        If <args.quick> is set, tests whose names aren't in self.quick_tests are skipped.
        If <args.dry_run> is set, each command line is printed but nothing is run and the log file isn't touched.
        If a command exits with a non-zero status, the tail of the log file is printed and the process exits with status 1.
        """
        if not args.dry_run:
            open(self.logfname, 'w').close()  # start from an empty log file

        for name, info in self.tests.items():
            if args.quick and name not in self.quick_tests:
                continue

            self.prepare_to_run(args, name, info)

            action = info['action']
            cmd_str = info['bin'] + ' ' + action
            cmd_str += ' ' + ' '.join(info['extras'] + self.common_extras)
            if name == 'simulate':
                cmd_str += ' --outfname ' + self.infnames['new']['simu']
                cmd_str += ' --indel-frequency 0.01 --indel-location v'
            elif 'cache-parameters-' not in name:
                cmd_str += ' --outfname ' + self.dirs['new'] + '/' + name + '.csv'

            logstr = '%s   %s' % (utils.color('green', name, width=30, padside='right'), cmd_str)
            if utils.len_excluding_colors(logstr) < args.print_width:
                print(logstr)
            else:  # NOTE the slice counts color-code characters as well, so the visible part may be shorter than <args.print_width>
                print(logstr[:args.print_width] + '[...]')
            if args.dry_run:
                continue

            with open(self.logfname, 'a') as logfile:  # context manager so the handle is closed even if the write fails
                logfile.write(logstr + '\n')

            start = time.time()
            try:
                # NOTE the command is a single shell string (needed for the redirections), so it is only safe as long as everything in self.tests comes from trusted config
                check_call(cmd_str + ' 1>>' + self.logfname + ' 2>>' + self.logfname, shell=True)
            except CalledProcessError:  # the exception itself just says it exited with code != 0, so show the log instead
                print('  log tail:')
                print(utils.pad_lines(check_output(['tail', self.logfname])))
                sys.exit(1)  # raise Exception('exited with error')
            self.run_times[name] = time.time() - start  # seconds
Exemple #4
0
def print_seq_in_reco_event(original_line,
                            iseq,
                            extra_str='',
                            label='',
                            one_line=False,
                            seed_uid=None,
                            check_line_integrity=False):
    """
    Print ascii summary of recombination event and mutation.
    If <one_line>, then skip the germline lines, and only print the final_seq line.

    Four aligned lines are built (insertions, d germline, v + j germline, query sequence), colored, and printed with a gene-name/uid suffix on each.
    The d germline line is dropped if utils.has_d_gene() is false for this locus.

    Arguments:
      original_line: annotation dict. Keys read here: the '<erosion>_del' lengths for each erosion in utils.all_erosions, '<region>_gl_seq', the 'fv', 'vd', 'dj' and 'jf' '_insertion' strings, 'lengths', 'seqs', 'indelfos', 'mut_freqs', and the 'v_gene', 'd_gene' and 'j_gene' names.
      iseq: index of the sequence to print within the per-sequence lists ('seqs', 'indelfos', 'mut_freqs').
      extra_str: string prepended to every printed line.
      label: if non-empty, written over the start of the first line, beginning at position max(0, len(extra_str) - 2).
      one_line: see above.
      seed_uid: passed on to get_uid_str().
      check_line_integrity: if set, all work is done on a deep copy of <original_line>, and an Exception is raised at the end if the copy differs from the original.
    """
    line = original_line
    if check_line_integrity:  # it's very important not to modify <line> -- this lets you verify that you aren't
        line = copy.deepcopy(
            original_line)  # copy that we can modify without changing <line>

    # one string of dots per erosion, with one dot per deleted base (possibly abbreviated below)
    delstrs = {
        d: '.' * line[d + '_del']
        for d in utils.all_erosions
    }  # NOTE len(delstrs[<del>]) is not in general the same as len(line[<del>_del])
    if len(
            delstrs['v_5p']
    ) > 50:  # don't print a million dots if left-side v deletion is really big
        delstrs['v_5p'] = '.%d.' % len(delstrs['v_5p'])

    # if there isn't enough space for dots in the vj line, we add some dashes to everybody so things fit (rare in heavy chain rearrangements, but pretty common in light chain)
    d_plus_inserts_length = len(line['vd_insertion'] + line['d_gl_seq'] +
                                line['dj_insertion'])
    if line['v_3p_del'] + line[
            'j_5p_del'] > d_plus_inserts_length:  # if dots for v and j interior deletions will be longer than <d_plus_inserts_length>
        # replace the interior dots with the abbreviated '.<n>.' form
        delstrs['v_3p'] = '.%d.' % line['v_3p_del']
        delstrs['j_5p'] = '.%d.' % line['j_5p_del']
        gapstr = '-' * (len(delstrs['v_3p'] + delstrs['j_5p']) -
                        d_plus_inserts_length)
        gap_insert_point = len(
            line['fv_insertion'] + delstrs['v_5p'] + line['v_gl_seq']
        )  # it doesn't really matter exactly where we put the blue dashes, as long as it's the same place in all four lines, but this is a good spot
        extra_space_because_of_fixed_nospace = max(
            0, d_plus_inserts_length - len(delstrs['v_3p'] + delstrs['j_5p'])
        )  # if shortening the <delstrs> already over-compensated for the lack of space (i.e., if the number of dashes necessary is zero), then we need to add some dots to the vj line below
    else:
        gapstr = ''
        gap_insert_point = None  # None means no dashes get inserted further down
        extra_space_because_of_fixed_nospace = 0

    # germline sequence for each region, with deletion dots on either side
    eroded_seqs_dots = {
        r: delstrs[r + '_5p'] + line[r + '_gl_seq'] + delstrs[r + '_3p']
        for r in utils.regions
    }

    # build the three germline lines
    insert_line = ' ' * (len(line['fv_insertion']) + line['lengths']['v'] + len(delstrs['v_5p'])) \
                  + line['vd_insertion'] + ' ' * line['lengths']['d'] + line['dj_insertion'] \
                  + ' ' * (line['lengths']['j'] + line['j_3p_del'] + len(line['jf_insertion']))
    germline_d_start = len(line['fv_insertion']) + line['lengths']['v'] + len(
        line['vd_insertion']) - line['d_5p_del']
    germline_d_end = germline_d_start + line['d_5p_del'] + line['lengths'][
        'd'] + line['d_3p_del']
    d_line = ' ' * (germline_d_start + len(delstrs['v_5p'])) \
             + eroded_seqs_dots['d'] \
             + ' ' * (len(line['j_gl_seq']) + len(line['dj_insertion']) - line['d_3p_del'] + line['j_3p_del'] + len(line['jf_insertion']))
    # NOTE(review): the trailing comment on the next statement mentions subtracting v_5p_del, but no such term appears in the expression -- the comment may be stale, confirm
    germline_v_end = len(line['fv_insertion']) + len(line['v_gl_seq']) + line[
        'v_3p_del'] - 1  # position in the query sequence at which we find the last base of the v match. NOTE we subtract off the v_5p_del because we're *not* adding dots for that deletion (it's just too long)
    germline_j_start = germline_d_end + 1 - line['d_3p_del'] + len(
        line['dj_insertion']) - line['j_5p_del']
    vj_line = ' ' * len(line['fv_insertion']) + eroded_seqs_dots['v'] + '.' * extra_space_because_of_fixed_nospace \
              + ' ' * (germline_j_start - germline_v_end - 2) + eroded_seqs_dots['j'] + ' ' * len(line['jf_insertion'])
    # and the query line
    qrseq_line = ' ' * len(
        delstrs['v_5p']) + line['seqs'][iseq] + ' ' * line['j_3p_del']

    # order matters from here on: index 0 is the insert line, 1 the d line, 2 the vj line, and 3 the query line
    outstrs = [insert_line, d_line, vj_line, qrseq_line]
    check_outsr_lengths(
        line, outstrs, fix=True
    )  # I think the only way they can be different is if the d right side erosion is so long that it hangs over the right side of the j

    if gap_insert_point is not None:
        for istr in [
                0, 1, 3
        ]:  # everybody except the vj line, which already has the modified interior delstrs above
            outstrs[
                istr] = outstrs[istr][:gap_insert_point] + gapstr + outstrs[
                    istr][gap_insert_point:]

    check_outsr_lengths(line, outstrs, fix=True)

    # one (initially empty) list per character position in each of the output strings, which is handed to add_colors() below
    colors = [[[] for _ in range(len(ostr))] for ostr in outstrs]
    if indelutils.has_indels(line['indelfos'][iseq]):
        # outstrs, colors = old_indel_shenanigans(line, iseq, outstrs, colors)
        outstrs, colors = indel_shenanigans(line, iseq, outstrs, colors)
    outstrs = add_colors(outstrs, colors, line)

    # text that goes on the right of each line, in the same order as <outstrs> (each one ends with a newline)
    suffixes = [
        'insert%s\n' %
        ('s' if utils.has_d_gene(utils.get_locus(line['v_gene'])) else ''),
        '%s\n' % (utils.color_gene(line['d_gene'])),
        '%s %s\n' %
        (utils.color_gene(line['v_gene']), utils.color_gene(line['j_gene'])),
        '%s   %4.2f mut  %s\n' %
        (get_uid_str(line, iseq, seed_uid), line['mut_freqs'][iseq],
         utils.color('red', utils.is_functional_dbg_str(line, iseq)))
    ]
    outstrs = [
        '%s%s   %s' % (extra_str, ostr, suf)
        for ostr, suf in zip(outstrs, suffixes)
    ]

    if label != '':  # this doesn't really work if the edge of the removed string is the middle of a color code... but oh well, it doesn't really happen any more since I shortened the kbound label from waterer.py
        offset = max(
            0,
            len(extra_str) -
            2)  # skootch <label> this many positions leftward into <extra_str>
        removed_str = outstrs[0][offset:offset +
                                 utils.len_excluding_colors(label)]
        outstrs[0] = outstrs[0][:offset] + label + outstrs[0][
            utils.len_excluding_colors(label) +
            offset:]  # NOTE this *replaces* the bases in <extra_str> with <label>, which is only fine if they're spaces
        if removed_str.strip() != '':  # warn if the label covered up anything other than whitespace
            print '%s%s (covered by label \'%s\')' % (
                ' ' * offset, utils.color('red', removed_str), label)

    if one_line:
        outstrs = outstrs[-1:]  # remove all except the query seq line
    elif not utils.has_d_gene(utils.get_locus(line['v_gene'])):
        outstrs.pop(1)  # remove the d germline line

    # each string already ends with a newline (from <suffixes>), hence the trailing comma
    print ''.join(outstrs),

    if check_line_integrity:
        # <line> is a deep copy in this case, so any difference means something above modified it
        if set(line.keys()) != set(original_line.keys()):
            raise Exception('ack 1')
        for k in line:
            if line[k] != original_line[k]:
                print 'key %s differs:\n  %s\n  %s ' % (k, line[k],
                                                        original_line[k])
                raise Exception('')
Exemple #5
0
def read_gex(outdir, min_dprod=0.001, debug=True):
    """
    Read the barcode, pca, umap, cluster, and per-cluster marker gene files from <outdir>, check that they're consistent with each other, and print a table comparing each cluster to the reference marker genes.

    The file names come from the module-level barcodefname, pcafname, umapfname, clusterfname and markfname().
    <min_dprod>: pairwise results from gexdot() smaller than this are dropped, and the summary result is only shown if it is larger than this.
    <debug>: print a summary after reading each file, and the per-type rows of the final table (the table header and cluster names are printed regardless).
    Consistency checks are done with assert. Nothing is returned in the code visible here.
    """
    def dprod_color(dfo):  # color for a dot product value: red if large, yellow if moderate
        if dfo['dprod'] > 0.1:
            return 'red'
        if dfo['dprod'] > 0.01:
            return 'yellow'
        return None

    def dprod_str(dfo):
        return utils.color(dprod_color(dfo), '%.3f' % dfo['dprod'])

    def cluster_str(dfo):
        return utils.color('blue', '%d' % dfo['c2'])

    def pad_to(cstr, width):  # right-pad with spaces up to <width> visible characters
        return cstr + ' ' * (width - utils.len_excluding_colors(cstr))

    # barcodes
    barcode_vals = []
    with open('%s/%s' % (outdir, barcodefname)) as bfile:
        for raw_line in bfile:
            fields = raw_line.strip().split()
            first_index = int(fields.pop(0).strip('[]'))  # R-style (1-based) index of the first element in this line
            assert first_index == len(barcode_vals) + 1
            barcode_vals.extend(fstr.strip('"') for fstr in fields)
    if debug:
        print('    read %d barcodes' % len(barcode_vals))

    # pca values
    rotation_vals = collections.OrderedDict()  # relationship between pca and gene names (map from gene name to list of pca components)
    with open('%s/%s' % (outdir, pcafname)) as pfile:
        pca_comps = None  # names for each pca component (like PC3)
        for iline, raw_line in enumerate(pfile):
            fields = raw_line.strip().split()
            if iline == 0:  # header line: component names, which must be PC1, PC2, ... in order
                pca_comps = fields
                for ipc, pcname in enumerate(pca_comps):
                    assert pcname[:2] == 'PC'
                    assert int(pcname[2:]) == ipc + 1
                continue
            gene = fields.pop(0)
            assert len(fields) == len(pca_comps)
            rotation_vals[gene] = [float(vstr) for vstr in fields]
    if debug:
        print('      %d pca components for %d genes: %s' % (len(pca_comps), len(rotation_vals), ' '.join(rotation_vals)))

    # umap values
    umap_vals = []  # list of (x, y) umap values for each cell
    with open('%s/%s' % (outdir, umapfname)) as ufile:
        for iline, raw_line in enumerate(ufile):
            fields = raw_line.strip().split()
            if iline == 0:
                assert fields == ['[,%d]' % icol for icol in [1, 2]]
                continue
            row_index = int(fields.pop(0).strip('[]').rstrip(','))
            assert row_index == len(umap_vals) + 1
            umap_vals.append([float(vstr) for vstr in fields])
    if debug:
        print('      %d umap values' % len(umap_vals))
    assert len(umap_vals) == len(barcode_vals)

    # cluster assignments
    cluster_vals = []
    with open('%s/%s' % (outdir, clusterfname)) as cfile:
        for raw_line in cfile:
            fields = raw_line.strip().split()
            if fields[0] == 'Levels:':  # last line lists the clusters (not sure why they're called "levels")
                cluster_ints = [int(cstr) for cstr in fields[1:]]  # names of the clusters (1-based integer index)
                assert cluster_ints == list(range(min(cluster_ints), max(cluster_ints) + 1))
                assert set(cluster_ints) == set(cluster_vals)
            else:
                first_index = int(fields.pop(0).strip('[]'))  # R-style (1-based) index of the first element in this line
                assert first_index == len(cluster_vals) + 1
                cluster_vals += [int(cstr) for cstr in fields]
    if debug:
        print('      %d values in %d clusters: %s' % (len(cluster_vals), len(cluster_ints), ' '.join(str(cint) for cint in cluster_ints)))
    assert len(cluster_vals) == len(barcode_vals)

    # markers for each cluster
    # NOTE reversing the pairs (1-2 vs 2-1) the values are just the negative of each other if they're both there, but you don't get all the same genes
    pairwise_cmarkers = {'%d-%d' % (c1, c2): [] for c1, c2 in itertools.permutations(cluster_ints, 2)}
    summary_cmarkers = {'%d-summary' % cint: [] for cint in cluster_ints}
    for cname in cluster_ints:
        other_clusters = [cint for cint in cluster_ints if cint != cname]
        with open('%s/%s' % (outdir, markfname(cname))) as mfile:
            reader = csv.DictReader(mfile)
            header = list(reader.fieldnames)
            # summary.logFC is the log-fold change from the comparison with the lowest p-value (not necessarily the min/max log fold change)
            assert header[:5] == ['', 'Top', 'p.value', 'FDR', 'summary.logFC']
            # should be a column for each pairwise comparison with another cluster
            assert header[5:] == ['logFC.%d' % cint for cint in other_clusters]
            for row in reader:
                gene = row['']
                logfc_vals = {cint: float(row['logFC.%d' % cint]) for cint in other_clusters}
                summary_cmarkers['%d-summary' % cname].append((gene, float(row['summary.logFC'])))
                for c2 in logfc_vals:
                    pairwise_cmarkers['%d-%d' % (cname, c2)].append((gene, logfc_vals[c2]))
    # convert each list of (gene, logfc) pairs to an ordered dict, largest logfc first
    for cmarkers in (pairwise_cmarkers, summary_cmarkers):
        for ckey in cmarkers:
            cmarkers[ckey] = collections.OrderedDict(sorted(cmarkers[ckey], key=operator.itemgetter(1), reverse=True))

    # reference marker genes
    _, waickfo = read_ref_data()

    print('  interpretation: "this cluster is much more <type>-like than <clusters>, based on relative upregulation of <N genes>"')
    print('        type    any (N genes)   vs. single clusters                                                     gene contributions (sum over clusters)')
    for cname in cluster_ints:
        print('  %s' % utils.color('green', 'cluster %d' % cname))
        other_clusters = [cint for cint in cluster_ints if cint != cname]
        for vtype in waickfo:
            clprods = []
            all_contribs = {}  # contribution of each gene, summed over the clusters that pass <min_dprod>
            for ic2, c2 in enumerate(other_clusters):
                dprod, gene_contribs = gexdot(
                    waickfo[vtype],
                    pairwise_cmarkers['%d-%d' % (cname, c2)],
                    return_gene_contributions=True,
                    lbstr='%8s %s ' % ((vtype + ':') if ic2 == 0 else '', utils.color('blue', str(c2))))  #, debug=True)
                if dprod < min_dprod:
                    continue
                clprods.append({'c2': c2, 'dprod': dprod, 'gene_contribs': gene_contribs})
                for tgene in gene_contribs:
                    all_contribs[tgene] = all_contribs.get(tgene, 0.) + gene_contribs[tgene]
            clprods.sort(key=operator.itemgetter('dprod'), reverse=True)
            anydprod, anygcontribs = gexdot(waickfo[vtype], summary_cmarkers['%d-summary' % cname], return_gene_contributions=True)  # lbstr=XXX
            sumclprod = {'dprod': anydprod, 'gene_contribs': anygcontribs}
            if not debug or not clprods:
                continue

            pairwise_str = '  '.join('%s %s' % (cluster_str(dfo), dprod_str(dfo)) for dfo in clprods)
            anystr = ''
            if sumclprod['dprod'] > min_dprod:
                anystr = '%s (%2d)' % (dprod_str(sumclprod), len(sumclprod['gene_contribs']))
            sorted_contribs = sorted(all_contribs.items(), key=operator.itemgetter(1), reverse=True)
            print('      %s  %-s    %-s  %s' % (
                utils.color('purple', vtype, width=8),
                pad_to(anystr, 12),
                pad_to(pairwise_str, 70),
                '  '.join('%s %.1f' % (gname.lower(), contrib) for gname, contrib in sorted_contribs),
            ))
Exemple #6
0
    def finalize_region(self,
                        region,
                        sorted_gene_counts,
                        annotations=None,
                        debug=False):
        easycounts = {gene: counts for gene, counts in sorted_gene_counts}
        total_counts = sum([counts for counts in easycounts.values()])
        class_counts = self.separate_into_classes(region, sorted_gene_counts,
                                                  easycounts)

        genes_to_keep = set()

        if debug:
            print '   %s groups separated by %d snps  (-: same group as previous kept gene)' % (
                utils.color('blue', region), self.n_max_snps[region])
            print '     %-20s       %5s %s        removed genes (snps counts%s)%s%s' % (
                'genes to keep',
                'counts',
                '' if self.simglfo is None else utils.color('blue', 'sim'),
                '' if self.simglfo is None else utils.color(
                    'blue', ' sim counts'),
                '' if self.simglfo is None else
                ('  ' + utils.color('red', 'x:') + ' not in simulation'),
                '' if (annotations is None or self.reco_info is None) else
                ('               %s sim counts/genes for the queries assigned to this kept gene %s'
                 % (utils.color('blue', '['), utils.color('blue', ']'))),
            ),

            def count_str(cnt):
                if cnt < 10.:
                    return '%.1f' % cnt
                else:
                    return '%.0f' % cnt

            def simcountstr(
                gene, ws
            ):  # counts in simulation for <gene> (note that this is _not_ the same as sim_gene_count_str(), since this takes no account of _which_ queries these counts occur in [plus it's coming from the opposite point of view])
                if self.simglfo is None:
                    rstr = ''
                elif gene in self.simglfo['seqs'][utils.get_region(gene)]:
                    rstr = utils.color(
                        'blue', (' %' + ws + 'd') %
                        self.simcounts[utils.get_region(gene)][gene])
                else:
                    rstr = utils.color('red', (' %' + ws + 's') % 'x')
                return rstr

            def sim_gene_count_str(
                kgene
            ):  # figure out simulation genes and counts for the uids assigned to <kgene>
                if annotations is None or self.reco_info is None:
                    return ''
                uids_this_gene = [
                    uid for uid, line in annotations.items()
                    if line[region + '_gene'] == kgene
                ]
                sim_genes = {
                }  # simulation genes for the uids that we assigned to <kgene> (note that self.simcounts doesn't have this per-uid information)
                for uid in uids_this_gene:
                    sgene = self.reco_info[uid][region + '_gene']
                    if sgene not in sim_genes:
                        sim_genes[sgene] = 0
                    sim_genes[sgene] += 1
                sorted_sim_gene_counts = sorted(sim_genes.items(),
                                                key=operator.itemgetter(1),
                                                reverse=True)
                count_str = ' '.join([
                    utils.color('blue' if sg == kgene else 'red', str(c))
                    for sg, c in sorted_sim_gene_counts
                ])
                sgene_str = ' '.join(
                    [utils.color_gene(sg) for sg, _ in sorted_sim_gene_counts])
                return '%s   %s' % (count_str, sgene_str)

        for iclass in range(len(class_counts)):
            gclass = class_counts[iclass]
            kept_this_class = []
            for ig in range(len(gclass)):
                gfo = gclass[ig]

                if float(
                        gfo['counts']
                ) / total_counts < self.args.min_allele_prevalence_fraction:  # always skip everybody that's super uncommon
                    pass  # don't keep it
                elif ig == 0:  # keep the first one from this class
                    genes_to_keep.add(gfo['gene'])
                    kept_this_class.append(gfo['gene'])
                elif utils.hamming_distance(
                        gclass[0]['seq'], gclass[ig]['seq']
                ) == 0:  # don't keep it if it's indistinguishable from the most common one (the matches are probably mostly really the best one)
                    pass  # don't keep it
                elif len(
                        kept_this_class
                ) < self.args.n_alleles_per_gene:  # always keep the most common <self.args.n_alleles_per_gene> in each class [note: defaults to 1 if looking for new alleles, otherwise 2]
                    genes_to_keep.add(gfo['gene'])
                    kept_this_class.append(gfo['gene'])
                else:
                    pass  # don't keep it

                if debug and gfo['gene'] in genes_to_keep:
                    snpstr = ' ' if ig == 0 else '(%d)' % utils.hamming_distance(
                        gclass[0]['seq'], gfo['seq']
                    )  # only happens if we keep more than one from this class
                    print '\n      %s%-s  %7s%s  %-3s' % (
                        '- ' if ig > 0 else '  ',
                        utils.color_gene(gfo['gene'], width=20),
                        count_str(gfo['counts']), simcountstr(
                            gfo['gene'], '4'), snpstr),
            if debug:
                if len(kept_this_class) == 0:
                    print '\n      %s%-s  %7s%4s  %-3s' % (
                        '  ',
                        utils.color('blue', 'none', width=20,
                                    padside='right'), '-', '', ''),
                removedfo = [
                    gfo for gfo in gclass if gfo['gene'] not in genes_to_keep
                ]
                removed_str = ''
                if len(removedfo) > 0:
                    number_strs = [
                        '(%d %3s%s)' % (gfo['hdist'], count_str(
                            gfo['counts']), simcountstr(gfo['gene'], '1'))
                        for gfo in removedfo
                    ]
                    name_strs = [
                        '%s' % (utils.color_gene(gfo['gene']))
                        for gfo in removedfo
                    ]
                    removed_str = '%s  %s' % (' '.join(number_strs),
                                              ' '.join(name_strs))
                annotation_str = ''
                if (annotations is not None and self.reco_info
                        is not None) and len(kept_this_class) > 0:
                    annotation_str = '%s %s %s' % (utils.color(
                        'blue', '['), sim_gene_count_str(
                            kept_this_class[-1]), utils.color('blue', ']'))
                print '     %s  %s  %s' % (
                    removed_str,
                    (70 - utils.len_excluding_colors(removed_str)) * ' ',
                    annotation_str),
        if debug:
            print ''

        genes_to_remove = set(self.glfo['seqs'][region]) - genes_to_keep

        print '    keeping %d / %d %s gene%s' % (
            len(genes_to_keep), len(self.glfo['seqs'][region]), region,
            utils.plural(len(genes_to_keep)))
        if len(genes_to_keep) == 0:
            print '   would\'ve kept zero genes, instead keeping all of them'
            genes_to_keep = copy.deepcopy(genes_to_remove)
            genes_to_remove.clear()

        if self.simglfo is not None:
            missing_genes = set(self.simglfo['seqs'][region]) - genes_to_keep
            if len(missing_genes) > 0:
                print '    %s %d simulation genes (counts): %s' % (utils.color(
                    'red', 'missing'), len(missing_genes), '  '.join(
                        [('%s %d' %
                          (utils.color_gene(g), self.simcounts[region][g]))
                         for g in sorted(missing_genes)]))
            completely_absent_genes = missing_genes - genes_to_remove
            if len(completely_absent_genes) > 0:
                print '%s %d simulation genes completely absent: %s' % (
                    utils.color('red', 'warning'),
                    len(completely_absent_genes), '  '.join(
                        [('%s %d' %
                          (utils.color_gene(g), self.simcounts[region][g]))
                         for g in sorted(completely_absent_genes)]))

        self.genes_to_keep |= genes_to_keep  # add the ones from _this_ region (rhs) to the ones from all regions (lhs)
        self.genes_to_remove |= genes_to_remove

        self.finalized = True
Exemple #7
0
    g for base in args.bases for g in get_genes(base, args.allele_numbers)
]
# make sure we found something for --bases, then add any explicitly requested genes
if len(genes) == 0:
    available_bases = sorted(set(get_base(g) for g in glfo['seqs'][args.region]))
    raise Exception(
        'couldn\'t find any genes for the specified --bases %s\n  choices:\n    %s'
        % (' '.join(args.bases), ' '.join(available_bases)))
args.other_genes = utils.get_arg_list(args.other_genes)
if args.other_genes is not None:
    genes += args.other_genes

# one (initially empty) output string per gene
seqstrs = [''] * len(genes)
snpstrs = [''] * len(genes)

gene_str_width = max(utils.len_excluding_colors(utils.color_gene(g)) for g in genes)
# there's no conserved codon lookup for d, so <codon_positions> is None in that case
codon_positions = glfo[utils.conserved_codons[args.locus][args.region] +
                       '-positions'] if args.region != 'd' else None
max_seq_len = max(len(glfo['seqs'][args.region][g]) for g in genes)

# the reference gene is the first gene, unless a specific allele of the first gene was requested
ref_gene = genes[0] if args.ref_allele is None else utils.rejoin_gene(
    args.locus, args.region, utils.primary_version(genes[0]),
    utils.sub_version(genes[0]), args.ref_allele)
if ref_gene != genes[0]:  # move the reference gene to the front of the list
    if ref_gene not in genes:  # same exception type that list.remove() would raise, but with a message that says what's wrong
        raise ValueError('reference gene %s (ref allele %s) isn\'t among the genes being compared: %s'
                         % (ref_gene, args.ref_allele, ' '.join(genes)))
    genes.remove(ref_gene)
    genes.insert(0, ref_gene)
ref_seq = glfo['seqs'][args.region][ref_gene]
# <codon_positions> is None for d (see above), in which case subscripting it would crash with a TypeError
ref_pos = codon_positions[ref_gene] if codon_positions is not None else None

for igene in range(0, len(genes)):
    gene = genes[igene]