コード例 #1
0
    def evaluate(self, true_line, inf_line, simglfo=None):
        if len(inf_line['unique_ids']) > 1:
            raise Exception('mutli-seq lines not yet handled')
        iseq = 0

        def addval(col, simval, infval):
            if col[2:] == '_insertion':  # stored as the actual inserted bases
                simval = len(simval)
                infval = len(infval)
            diff = infval - simval
            if diff not in self.values[col]:
                self.values[col][diff] = 0
            self.values[col][diff] += 1

        if indelutils.has_indels(true_line['indelfos'][iseq]) or indelutils.has_indels(inf_line['indelfos'][iseq]):
            simlen = indelutils.net_length(true_line['indelfos'][iseq])
            inflen = indelutils.net_length(inf_line['indelfos'][iseq])
            addval('shm_indel_length', simlen, inflen)
            if simlen != inflen:  # this is probably because the simulated shm indel was within the cdr3, so we attempt to fix it by switching the sim line to non-reversed
                print '    %s true and inferred shm net indel lengths different, so skipping rest of performance evaluation' % ' '.join(inf_line['unique_ids'])
                # note that you can't really evaluate the rest of the performance vars in a particularly meaningful when the indel info is different (like I tried to do below) since you have to decide how to assign the indel'd bases (like, is it correct to assign the indel'd bases to a deletion? or to an insertion? or to the j?)
                return
                # true_line = copy.deepcopy(true_line)
                # utils.remove_all_implicit_info(true_line)
                # true_line['indelfos'][iseq] = indelutils.get_empty_indel()
                # true_line['seqs'][iseq] = true_line['input_seqs'][iseq]
                # utils.add_implicit_info(simglfo, true_line)

        mutfo = {lt : {mt : {} for mt in ['freq', 'total']} for lt in ['sim', 'inf']}
        for rstr in plotconfig.rstrings:
            if rstr == '':  # these are already in the <line>s, so may as well not recalculate
                mutfo['sim']['freq'][rstr], mutfo['sim']['total'][rstr] = true_line['mut_freqs'][iseq], true_line['n_mutations'][iseq]
                mutfo['inf']['freq'][rstr], mutfo['inf']['total'][rstr] = inf_line['mut_freqs'][iseq], inf_line['n_mutations'][iseq]
            else:
                mutfo['sim']['freq'][rstr], mutfo['sim']['total'][rstr] = utils.get_mutation_rate_and_n_muted(true_line, iseq=iseq, restrict_to_region=rstr.rstrip('_'))
                mutfo['inf']['freq'][rstr], mutfo['inf']['total'][rstr] = utils.get_mutation_rate_and_n_muted(inf_line, iseq=iseq, restrict_to_region=rstr.rstrip('_'))

        for col in plotconfig.gene_usage_columns:
            self.set_bool_column(true_line, inf_line, col, mutfo['sim']['freq'][''])  # this also sets the fraction-correct-vs-mute-freq hists

        for column in plotconfig.int_columns:
            addval(column, true_line[column], inf_line[column])

        for rstr in plotconfig.rstrings:
            addval(rstr + 'hamming_to_true_naive', 0, self.hamming_to_true_naive(true_line, inf_line, restrict_to_region=rstr.rstrip('_')))
            addval(rstr + 'muted_bases', mutfo['sim']['total'][rstr], mutfo['inf']['total'][rstr])

        for region in utils.regions:
            if region + '_per_gene_support' in inf_line:
                self.set_per_gene_support(true_line, inf_line, region)

        self.hists['mute_freqs'].fill(mutfo['inf']['freq'][''] - mutfo['sim']['freq'][''])  # when we're evaluating on multi-seq hmm output, we synthesize single-sequence lines for each sequence
コード例 #2
0
ファイル: process_partis.py プロジェクト: matsengrp/cft
def check_seed_for_indels(cluster_line, seed_id, partition_file):
    iseq_seed = cluster_line["unique_ids"].index(seed_id)
    if indelutils.has_indels(cluster_line["indelfos"][iseq_seed]):
        print(indelutils.get_dbg_str(cluster_line["indelfos"][iseq_seed]))
        raise Exception(
            "indel in seed sequence {}. Options are 1. Look at the annotation for this cluster and find the indel in the seed. Rerun process_partis.py with --match-indel-in-uid <uid-of-seq-containing-indel-of-interest> to process only sequences containing that specific indel for further analysis of the indel 2. Run with --ignore-seed-indels. PS check out {}"
            .format(seed_id, partition_file))
コード例 #3
0
    def setline(self, irandom=None):  # don't access <self.line> directly
        if self.line is not None:
            return self.line

        line = {}
        for region in utils.regions:
            line[region + '_gene'] = self.genes[region]
        for boundary in utils.boundaries:
            line[boundary + '_insertion'] = self.insertions[boundary]
        for boundary in utils.effective_boundaries:
            line[boundary + '_insertion'] = ''  # NOTE 'fv' and 'jf' insertions are hereby hardcoded to zero (I'm just writing this here to make it easily searchable -- I don't remember why it's set up that way)
        for erosion in utils.real_erosions:
            line[erosion + '_del'] = self.erosions[erosion]
        for erosion in utils.effective_erosions:
            line[erosion + '_del'] = self.effective_erosions[erosion]
        line['input_seqs'] = self.final_seqs
        line['indelfos'] = self.indelfos
        line['seqs'] = [self.indelfos[iseq]['reversed_seq'] if indelutils.has_indels(self.indelfos[iseq]) else line['input_seqs'][iseq] for iseq in range(len(line['input_seqs']))]
        self.set_ids(line, irandom=irandom)
        treeutils.translate_labels(self.tree, zip(self.leaf_names, line['unique_ids']))  # ordering in <self.leaf_names> is set in recombinator.add_mutants()
        line['affinities'] = [None for _ in line['unique_ids']]
        line['tree'] = self.tree.as_string(schema='newick')

        utils.add_implicit_info(self.glfo, line)

        self.line = line
コード例 #4
0
ファイル: event.py プロジェクト: wangdi2014/partis
    def setline(self, irandom=None):  # don't access <self.line> directly
        if self.line is not None:
            return self.line

        line = {}
        for region in utils.regions:
            line[region + '_gene'] = self.genes[region]
        for boundary in utils.boundaries:
            line[boundary + '_insertion'] = self.insertions[boundary]
        for boundary in utils.effective_boundaries:
            line[boundary + '_insertion'] = ''
        for erosion in utils.real_erosions:
            line[erosion + '_del'] = self.erosions[erosion]
        for erosion in utils.effective_erosions:
            line[erosion + '_del'] = self.effective_erosions[erosion]
        line['input_seqs'] = self.final_seqs
        line['indelfos'] = self.indelfos
        line['seqs'] = [
            line['indelfos'][iseq]['reversed_seq'] if indelutils.has_indels(
                line['indelfos'][iseq]) else line['input_seqs'][iseq]
            for iseq in range(len(line['input_seqs']))
        ]
        self.set_ids(line, irandom)

        utils.add_implicit_info(self.glfo, line)

        self.line = line
コード例 #5
0
ファイル: process_partis.py プロジェクト: matsengrp/cft
def get_cluster_meta_dict(cluster_line, seed_id, args):
    has_indels = any([
        indelutils.has_indels(cluster_line["indelfos"][iseq])
        for iseq in range(len(cluster_line["input_seqs"]))
    ])
    if not args.indel_reversed_seqs and not has_indels:
        warnings.warn(
            "{}: --indel-reversed-seqs was not passed and there are no indels. If running this script from CFT, this is probably because CFT was run with --preserve-indels and there are no indels in this cluster. It will get aligned anyway."
            .format(utils.color("red", "warning")))
    return {
        "sequences": get_cluster_seqs_dict(cluster_line, seed_id, args),
        "cdr3_start": cluster_line["codon_positions"]["v"],
        "has_seed": seed_id in cluster_line["unique_ids"],
        "mean_mut_freq": numpy.mean(cluster_line["mut_freqs"]),
        "seed_id": seed_id,
        "match_indel_in_uid": args.match_indel_in_uid is not None,
        "has_indels": has_indels,
        "indels_reversed": has_indels and args.indel_reversed_seqs,
    }
コード例 #6
0
def print_seq_in_reco_event(original_line,
                            iseq,
                            extra_str='',
                            label='',
                            one_line=False,
                            seed_uid=None,
                            check_line_integrity=False):
    """
    Print ascii summary of recombination event and mutation.
    If <one_line>, then skip the germline lines, and only print the final_seq line.
    """
    line = original_line
    if check_line_integrity:  # it's very important not to modify <line> -- this lets you verify that you aren't
        line = copy.deepcopy(
            original_line)  # copy that we can modify without changing <line>

    delstrs = {
        d: '.' * line[d + '_del']
        for d in utils.all_erosions
    }  # NOTE len(delstrs[<del>]) is not in general the same as len(line[<del>_del])
    if len(
            delstrs['v_5p']
    ) > 50:  # don't print a million dots if left-side v deletion is really big
        delstrs['v_5p'] = '.%d.' % len(delstrs['v_5p'])

    # if there isn't enough space for dots in the vj line, we add some dashes to everybody so things fit (rare in heavy chain rearrangements, but pretty common in light chain)
    d_plus_inserts_length = len(line['vd_insertion'] + line['d_gl_seq'] +
                                line['dj_insertion'])
    if line['v_3p_del'] + line[
            'j_5p_del'] > d_plus_inserts_length:  # if dots for v and j interior deletions will be longer than <d_plus_inserts_length>
        delstrs['v_3p'] = '.%d.' % line['v_3p_del']
        delstrs['j_5p'] = '.%d.' % line['j_5p_del']
        gapstr = '-' * (len(delstrs['v_3p'] + delstrs['j_5p']) -
                        d_plus_inserts_length)
        gap_insert_point = len(
            line['fv_insertion'] + delstrs['v_5p'] + line['v_gl_seq']
        )  # it doesn't really matter exactly where we put the blue dashes, as long as it's the same place in all four lines, but this is a good spot
        extra_space_because_of_fixed_nospace = max(
            0, d_plus_inserts_length - len(delstrs['v_3p'] + delstrs['j_5p'])
        )  # if shortening the <delstrs> already over-compensated for the lack of space (i.e., if the number of dashes necessary is zero), then we need to add some dots to the vj line below
    else:
        gapstr = ''
        gap_insert_point = None
        extra_space_because_of_fixed_nospace = 0

    eroded_seqs_dots = {
        r: delstrs[r + '_5p'] + line[r + '_gl_seq'] + delstrs[r + '_3p']
        for r in utils.regions
    }

    # build the three germline lines
    insert_line = ' ' * (len(line['fv_insertion']) + line['lengths']['v'] + len(delstrs['v_5p'])) \
                  + line['vd_insertion'] + ' ' * line['lengths']['d'] + line['dj_insertion'] \
                  + ' ' * (line['lengths']['j'] + line['j_3p_del'] + len(line['jf_insertion']))
    germline_d_start = len(line['fv_insertion']) + line['lengths']['v'] + len(
        line['vd_insertion']) - line['d_5p_del']
    germline_d_end = germline_d_start + line['d_5p_del'] + line['lengths'][
        'd'] + line['d_3p_del']
    d_line = ' ' * (germline_d_start + len(delstrs['v_5p'])) \
             + eroded_seqs_dots['d'] \
             + ' ' * (len(line['j_gl_seq']) + len(line['dj_insertion']) - line['d_3p_del'] + line['j_3p_del'] + len(line['jf_insertion']))
    germline_v_end = len(line['fv_insertion']) + len(line['v_gl_seq']) + line[
        'v_3p_del'] - 1  # position in the query sequence at which we find the last base of the v match. NOTE we subtract off the v_5p_del because we're *not* adding dots for that deletion (it's just too long)
    germline_j_start = germline_d_end + 1 - line['d_3p_del'] + len(
        line['dj_insertion']) - line['j_5p_del']
    vj_line = ' ' * len(line['fv_insertion']) + eroded_seqs_dots['v'] + '.' * extra_space_because_of_fixed_nospace \
              + ' ' * (germline_j_start - germline_v_end - 2) + eroded_seqs_dots['j'] + ' ' * len(line['jf_insertion'])
    # and the query line
    qrseq_line = ' ' * len(
        delstrs['v_5p']) + line['seqs'][iseq] + ' ' * line['j_3p_del']

    outstrs = [insert_line, d_line, vj_line, qrseq_line]
    check_outsr_lengths(
        line, outstrs, fix=True
    )  # I think the only way they can be different is if the d right side erosion is so long that it hangs over the right side of the j

    if gap_insert_point is not None:
        for istr in [
                0, 1, 3
        ]:  # everybody except the vj line, which already has the modified interior delstrs above
            outstrs[
                istr] = outstrs[istr][:gap_insert_point] + gapstr + outstrs[
                    istr][gap_insert_point:]

    check_outsr_lengths(line, outstrs, fix=True)

    colors = [[[] for _ in range(len(ostr))] for ostr in outstrs]
    if indelutils.has_indels(line['indelfos'][iseq]):
        # outstrs, colors = old_indel_shenanigans(line, iseq, outstrs, colors)
        outstrs, colors = indel_shenanigans(line, iseq, outstrs, colors)
    outstrs = add_colors(outstrs, colors, line)

    suffixes = [
        'insert%s\n' %
        ('s' if utils.has_d_gene(utils.get_locus(line['v_gene'])) else ''),
        '%s\n' % (utils.color_gene(line['d_gene'])),
        '%s %s\n' %
        (utils.color_gene(line['v_gene']), utils.color_gene(line['j_gene'])),
        '%s   %4.2f mut  %s\n' %
        (get_uid_str(line, iseq, seed_uid), line['mut_freqs'][iseq],
         utils.color('red', utils.is_functional_dbg_str(line, iseq)))
    ]
    outstrs = [
        '%s%s   %s' % (extra_str, ostr, suf)
        for ostr, suf in zip(outstrs, suffixes)
    ]

    if label != '':  # this doesn't really work if the edge of the removed string is the middle of a color code... but oh well, it doesn't really happen any more since I shortened the kbound label from waterer.py
        offset = max(
            0,
            len(extra_str) -
            2)  # skootch <label> this many positions leftward into <extra_str>
        removed_str = outstrs[0][offset:offset +
                                 utils.len_excluding_colors(label)]
        outstrs[0] = outstrs[0][:offset] + label + outstrs[0][
            utils.len_excluding_colors(label) +
            offset:]  # NOTE this *replaces* the bases in <extra_str> with <label>, which is only fine if they're spaces
        if removed_str.strip() != '':
            print '%s%s (covered by label \'%s\')' % (
                ' ' * offset, utils.color('red', removed_str), label)

    if one_line:
        outstrs = outstrs[-1:]  # remove all except the query seq line
    elif not utils.has_d_gene(utils.get_locus(line['v_gene'])):
        outstrs.pop(1)  # remove the d germline line

    print ''.join(outstrs),

    if check_line_integrity:
        if set(line.keys()) != set(original_line.keys()):
            raise Exception('ack 1')
        for k in line:
            if line[k] != original_line[k]:
                print 'key %s differs:\n  %s\n  %s ' % (k, line[k],
                                                        original_line[k])
                raise Exception('')
コード例 #7
0
ファイル: bcr-phylo-run.py プロジェクト: linchunjen/partis
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(
        outdir))  # output mutated sequences from bcr-phylo

    assert len(
        naive_line['unique_ids']
    ) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(
        naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)
        nodefo = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),
                    'target_index': int(line['target_index']),
                }
        if len(
                set(nodefo) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        ]
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        ]
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        ]
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        ]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
                                  padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(
        glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(
        final_line, irandom=ievent
    )  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
コード例 #8
0
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(
        '%s/%s.fasta' %
        (outdir, args.extrastr))  # output mutated sequences from bcr-phylo

    assert len(
        naive_line['unique_ids']
    ) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(
        naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)
        kdvals = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                kdvals[line['uid']] = float(line['kd'])
        if len(
                set(kdvals) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(kdvals) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(kdvals)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(kdvals))
        final_line['affinities'] = [
            1. / kdvals[u] for u in final_line['unique_ids']
        ]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
                                  padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(
        glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(
        final_line, irandom=ievent
    )  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]
    from Bio.Seq import Seq
    final_line['nearest_target_indices'] = []
    aa_targets = [Seq(seq).translate() for seq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        aa_hdists = [
            utils.hamming_distance(aa_t, aa_mseq, amino_acid=True)
            for aa_t in aa_targets
        ]
        imin = aa_hdists.index(
            min(aa_hdists)
        )  # NOTE doesn't do anything differently if there's more than one min
        final_line['nearest_target_indices'].append(imin)

    return final_line
コード例 #9
0
ファイル: bcr-phylo-run.py プロジェクト: Xiujia-Yang/partis
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print 'implicit info adding failed for ievent %d in %s' % (ievent, outdir)
            lines = traceback.format_exception(*sys.exc_info())
            print utils.pad_lines(''.join(lines))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd an nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
コード例 #10
0
ファイル: bcr-phylo-run.py プロジェクト: AkselObdrup/partis
    def get_mature_line(sfos,
                        naive_line,
                        glfo,
                        nodefo,
                        dtree,
                        target_sfos,
                        locus=None):
        assert len(
            naive_line['unique_ids']
        ) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
        assert not indelutils.has_indels(
            naive_line['indelfos'][0])  # would have to handle this below
        if args.debug:
            utils.print_reco_event(naive_line)
        reco_info = collections.OrderedDict()
        for sfo in sfos:
            mline = utils.get_non_implicit_copy(naive_line)
            del mline['tree']
            mline['unique_ids'] = [sfo['name']]
            mline['seqs'] = [sfo['seq']]
            mline['input_seqs'] = [
                sfo['seq']
            ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
            mline['duplicates'] = [[]]
            reco_info[sfo['name']] = mline
            try:
                utils.add_implicit_info(glfo, mline)
            except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
                print 'implicit info adding failed for ievent %d in %s' % (
                    ievent, outdir)
                lines = traceback.format_exception(*sys.exc_info())
                print utils.pad_lines(
                    ''.join(lines)
                )  # NOTE this will still crash on the next line if implicit info adding failed
        final_line = utils.synthesize_multi_seq_line_from_reco_info(
            [sfo['name'] for sfo in sfos], reco_info)

        ftree = copy.deepcopy(dtree)
        if locus is not None:

            def ltr(u):
                return u + '-' + locus

            new_nodefo = {}
            for u_old in nodefo:
                new_nodefo[ltr(u_old)] = nodefo[u_old]
            nodefo = new_nodefo
            treeutils.translate_labels(ftree,
                                       [(u, ltr(u))
                                        for u in final_line['unique_ids']])
            final_line['unique_ids'] = [
                ltr(u) for u in final_line['unique_ids']
            ]
            assert len(sfos) == len(final_line['unique_ids'])
            for iseq, sfo in enumerate(sfos):
                naive_id = naive_line['unique_ids'][0]
                assert naive_id.count('-') == 1
                bstr = naive_id.replace('-' + locus, '')
                pids = final_line['paired-uids'][iseq]
                assert len(pids) == 1 and pids[0].find(
                    bstr
                ) == 0 and pids[0].count('-') == 1 and pids[0].split(
                    '-'
                )[1] in utils.loci  # if uid is xxx-igh, paired id shoud be e.g. xxx-igk
                final_line['paired-uids'][iseq] = [
                    p.replace(bstr, sfo['name']) for p in pids
                ]

        if args.debug:
            utils.print_reco_event(final_line)

        # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
        if len(
                set(nodefo) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        ]
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        ]
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        ]
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        ]
        ftree.scale_edges(1. / numpy.mean([len(s)
                                           for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree),
                                  padwidth=12)
        final_line['tree'] = ftree.as_string(schema='newick')

        tmp_event = RecombinationEvent(
            glfo
        )  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(
            final_line, irandom=ievent
        )  # not sure that setting <irandom> here actually does anything
        final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]
        return final_line