Esempio n. 1
    def setline(self, irandom=None):  # don't access <self.line> directly
        """Build the annotation dict for this event on the first call, cache it in <self.line>, and return it.

        irandom: forwarded to self.set_ids() (presumably what fills in line['unique_ids'], which is read right afterwards -- confirm against its definition).

        Returns the cached dict on every call. (The cache-filling call used to fall off the end and return None, while later calls returned the dict.)
        """
        if self.line is not None:
            return self.line

        line = {}
        for region in utils.regions:
            line[region + '_gene'] = self.genes[region]
        for boundary in utils.boundaries:
            line[boundary + '_insertion'] = self.insertions[boundary]
        for boundary in utils.effective_boundaries:
            line[boundary + '_insertion'] = ''  # NOTE 'fv' and 'jf' insertions are hereby hardcoded to zero (I'm just writing this here to make it easily searchable -- I don't remember why it's set up that way)
        for erosion in utils.real_erosions:
            line[erosion + '_del'] = self.erosions[erosion]
        for erosion in utils.effective_erosions:
            line[erosion + '_del'] = self.effective_erosions[erosion]
        line['input_seqs'] = self.final_seqs
        line['indelfos'] = self.indelfos
        # use the indel-reversed sequence wherever a sequence has indels, otherwise the input sequence itself
        line['seqs'] = [self.indelfos[iseq]['reversed_seq'] if indelutils.has_indels(self.indelfos[iseq]) else inseq for iseq, inseq in enumerate(line['input_seqs'])]
        self.set_ids(line, irandom=irandom)
        treeutils.translate_labels(self.tree, zip(self.leaf_names, line['unique_ids']))  # ordering in <self.leaf_names> is set in recombinator.add_mutants()
        line['affinities'] = [None for _ in line['unique_ids']]
        line['tree'] = self.tree.as_string(schema='newick')

        utils.add_implicit_info(self.glfo, line)

        self.line = line
        return self.line
Esempio n. 2
    def setline(self, irandom=None):  # don't access <self.line> directly
        """Build the annotation dict for this event on the first call, cache it in <self.line>, and return it.

        irandom: forwarded to self.set_ids() (presumably what fills in the sequence ids -- confirm against its definition).

        Returns the cached dict on every call. (The cache-filling call used to return None.)
        """
        if self.line is not None:
            return self.line

        line = {}
        for region in utils.regions:
            line[region + '_gene'] = self.genes[region]
        for boundary in utils.boundaries:
            line[boundary + '_insertion'] = self.insertions[boundary]
        for boundary in utils.effective_boundaries:
            line[boundary + '_insertion'] = ''
        for erosion in utils.real_erosions:
            line[erosion + '_del'] = self.erosions[erosion]
        for erosion in utils.effective_erosions:
            line[erosion + '_del'] = self.effective_erosions[erosion]
        line['input_seqs'] = self.final_seqs
        line['indelfos'] = self.indelfos
        # use the indel-reversed sequence wherever a sequence has indels, otherwise the input sequence itself
        # (the closing bracket of this list was missing, which made the whole method a syntax error)
        line['seqs'] = [
            line['indelfos'][iseq]['reversed_seq']
            if indelutils.has_indels(line['indelfos'][iseq]) else inseq
            for iseq, inseq in enumerate(line['input_seqs'])
        ]
        self.set_ids(line, irandom)

        utils.add_implicit_info(self.glfo, line)

        self.line = line
        return self.line
Esempio n. 3
 def print_event(self):
     """Collect this event's info into an annotation dict and hand it to utils.print_reco_event().

     The dict gets the genes, insertions, deletions, input and indel-reversed sequences,
     indel info, ids (the sequence indices as strings), cdr3 length and a copy of the
     codon positions; utils.add_implicit_info() is called on it before printing.
     """
     line = {}  # collect some information into a form that the print fcn understands
     for region in utils.regions:
         line[region + '_gene'] = self.genes[region]
     for boundary in utils.boundaries:
         line[boundary + '_insertion'] = self.insertions[boundary]
     for erosion in utils.real_erosions:
         line[erosion + '_del'] = self.erosions[erosion]
     for erosion in utils.effective_erosions:
         line[erosion + '_del'] = self.effective_erosions[erosion]
     assert 'fv_insertion' not in line  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
     assert 'jf_insertion' not in line
     line['fv_insertion'] = ''
     line['jf_insertion'] = ''
     line['input_seqs'] = self.final_seqs
     # one entry per sequence: the indel-reversed sequence if there is one, otherwise the input sequence
     # (the body of this conditional was missing, leaving a syntax error and an always-empty list)
     line['indel_reversed_seqs'] = []
     for iseq, indelfo in enumerate(self.indelfos):
         if indelfo['reversed_seq'] != '':
             line['indel_reversed_seqs'].append(indelfo['reversed_seq'])
         else:
             line['indel_reversed_seqs'].append(line['input_seqs'][iseq])
     line['seqs'] = line['indel_reversed_seqs']
     line['indelfos'] = self.indelfos
     line['unique_ids'] = [str(i) for i in range(len(self.final_seqs))]
     line['cdr3_length'] = self.cdr3_length
     line['codon_positions'] = copy.deepcopy(self.final_codon_positions)  # copy, so that nothing downstream can modify the event's own positions
     utils.add_implicit_info(self.glfo, line)
     utils.print_reco_event(self.glfo['seqs'], line)
Esempio n. 4
    def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
        # Records the inferred annotation for <query_name>: k values, the names of all matched genes, codon positions and cdr3 length,
        # the 5p/3p deletions and the insertions implied by the best match's germline/query bounds, and the best gene for each region.
        # It then adds implicit info and, if <self.debug> is set, prints the inferred (and, for simulation, true) event.
        # NOTE(review): this block is damaged. The object that every '[query_name]...' and "['queries']" subscript applies to is missing
        # (presumably something like <self.info> -- confirm), and what were probably separate statements have been fused onto single lines,
        # so the code below does not parse. Restore it from the original file before use.
        assert query_name not in['queries'].append(query_name)[query_name] = {}[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line[query_name]['k_v'] = kvals['v'][query_name]['k_d'] = kvals['d'][query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])  # all gene matches for this query[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3[query_name]['cyst_position'] = codon_positions['v'][query_name]['tryp_position'] = codon_positions['j']

        # erosion, insertion, mutation info for best match[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0][query_name]['v_3p_del'] = len(self.glfo['seqs']['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0][query_name]['d_3p_del'] = len(self.glfo['seqs']['d'][best['d']]) - all_germline_bounds[best['d']][1][query_name]['j_5p_del'] = all_germline_bounds[best['j']][0][query_name]['j_3p_del'] = len(self.glfo['seqs']['j'][best['j']]) - all_germline_bounds[best['j']][1][query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]][query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]][query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]][query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ][query_name]['indelfo'] =['indels'].get(query_name, utils.get_empty_indel())

        # store the best gene for each region, and add this query's matches to the running set of all matched genes
        for region in utils.regions:
  [query_name][region + '_gene'] = best[region]
  ['all_matches'][region] |= set(match_names[region])[query_name]['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence

        # these three keys were already set by hand above, so the implicit info adder is told about them
        existing_implicit_keys = tuple(['cdr3_length', 'cyst_position', 'tryp_position'])
        utils.add_implicit_info(self.glfo,[query_name], multi_seq=False, existing_implicit_keys=existing_implicit_keys)

        if self.debug:
            if not self.args.is_data:
                utils.print_reco_event(self.glfo['seqs'], self.reco_info[query_name], extra_str='      ', label='true:')
            utils.print_reco_event(self.glfo['seqs'],[query_name], extra_str='      ', label='inferred:')

        # NOTE(review): the bodies of the next conditionals are missing (only the indel-skip message survives) -- confirm against the original source
        if self.alfinder is not None:
        if self.pcounter is not None:
            if self.true_pcounter is not None:
        if self.perfplotter is not None:
            if query_name in['indels']:
                print '    skipping performance evaluation of %s because of indels' % query_name  # I just have no idea how to handle naive hamming fraction when there's indels

Esempio n. 5
def read_annotations(fname, glfo):
    # Reads the cluster annotation csv that sits next to <fname> (same path, with '.csv' replaced by '-cluster-annotations.csv')
    # and returns a dict mapping getkey(line['unique_ids']) to the annotation line of each cluster.
    # NOTE(review): the loop body is damaged -- the statement guarded by the 'failed' check and the names of the
    # two calls applied to each line are missing, so this does not parse. Restore from the original file before use.
    annotations = {}
    with open(fname.replace('.csv', '-cluster-annotations.csv')) as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:  # there's a line for each cluster
            # NOTE(review): presumably failed clusters are skipped here -- confirm
            if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            )  # converts strings in the csv file to floats/ints/dicts/etc.
                glfo, line
            )  # add stuff to <line> that's useful, isn't written to the csv since it's redundant
            # utils.print_reco_event(line)  # print ascii-art representation of the rearrangement event
            annotations[getkey(line['unique_ids'])] = line
    return annotations
Esempio n. 6
    def try_scratch_erode_insert(self, tmpline, debug=False):
        # Draws random erosion (deletion) lengths and insertion sequences into <tmpline>, then adds the keys that
        # utils.add_implicit_info() needs ('seqs', 'unique_ids', 'input_seqs', 'indelfos') and calls it.
        # Modifies <tmpline> in place; nothing is returned in the visible code.
        # NOTE(review): this block is damaged: several subscripts are empty ('[]'), the utils.has_d_gene( call is unclosed,
        # and the branch that should separate the dummy-d case from the general case appears to be missing.
        # It does not parse as it stands; restore from the original file before use.
        for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
            region = erosion[0]  # first character of the erosion name is the region
            gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
            if region == 'd' and not utils.has_d_gene(  # dummy d genes: always erode the whole thing from the left
                assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[]
                tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0
                # NOTE(review): an 'else:' presumably belongs here, since the lines below overwrite the value just set -- confirm
                max_erosion = max(0, gene_length/2 - 2)  # heuristic
                if region in utils.conserved_codons[]:  # make sure not to erode a conserved codon
                    codon_pos = utils.cdn_pos(self.glfo, region, tmpline[region + '_gene'])
                    if '3p' in erosion:
                        n_bases_to_codon = gene_length - codon_pos - 3  # bases between the end of the codon and the 3' end of the gene
                    elif '5p' in erosion:
                        n_bases_to_codon = codon_pos  # bases between the 5' end of the gene and the start of the codon
                    max_erosion = min(max_erosion, n_bases_to_codon)
                # geometric draw (minus one, so zero is possible), capped at <max_erosion>
                tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)
        for bound in utils.boundaries:
            mean_length = utils.scratch_mean_insertion_lengths[][bound]
            length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1  # avoid dividing by a zero mean
            probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
            tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

        if debug:
            print '    erosions:  %s' % ('   '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions]))
            print '    insertions:  %s' % ('   '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries]))

        # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
        gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
        # trim each germline sequence by the erosion lengths chosen above
        for erosion in utils.real_erosions:
            region = erosion[0]
            e_length = tmpline[erosion + '_del']
            if '5p' in erosion:
                gl_seqs[region] = gl_seqs[region][e_length:]
            elif '3p' in erosion:
                gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
        tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
        tmpline['unique_ids'] = [None]  # this is kind of hackey, but some things in the implicit info adder use it to get the number of sequences
        tmpline['input_seqs'] = copy.deepcopy(tmpline['seqs'])  # NOTE has to be updated _immediately_ so seqs and input_seqs don't get out of sync
        tmpline['indelfos'] = [indelutils.get_empty_indel(), ]
        utils.add_implicit_info(self.glfo, tmpline)
        assert len(tmpline['in_frames']) == 1
Esempio n. 7
 def print_event(self):
     """Assemble a minimal annotation dict for this event and pass it to utils.print_reco_event().

     The cdr3 length and codon positions are not set by hand; utils.add_implicit_info() is called on the dict instead.
     """
     line = {}  # the form that print_reco_event understands
     line.update((region + '_gene', self.genes[region]) for region in utils.regions)
     line.update((bound + '_insertion', self.insertions[bound]) for bound in utils.boundaries)
     line.update((ero + '_del', self.erosions[ero]) for ero in utils.real_erosions)
     line.update((ero + '_del', self.effective_erosions[ero]) for ero in utils.effective_erosions)

     # the two framework insertions must not have been set by the loops above (it'd be ok in principle, but then some things would need updating)
     framework_keys = ('fv_insertion', 'jf_insertion')
     for fkey in framework_keys:
         assert fkey not in line
     for fkey in framework_keys:
         line[fkey] = ''

     n_seqs = len(self.final_seqs)
     line['seqs'] = self.final_seqs
     line['unique_ids'] = list(range(n_seqs))  # the ids are just the sequence indices
     utils.add_implicit_info(self.glfo, line, multi_seq=True)
     utils.print_reco_event(self.glfo['seqs'], line, indelfos=self.indelfo)
Esempio n. 8
        def try_scratch_erode_insert(tmpline):
            """Draw random erosions and insertions into <tmpline> and add implicit info to it.

            For each real erosion a deletion length is chosen: the light chain dummy d gene is always
            eroded completely from the left, everything else gets a geometric draw capped so that
            neither most of the gene nor a conserved codon is eroded. Insertion lengths are geometric
            draws and their content is sampled from <self.insertion_content_probs>. The keys that
            utils.add_implicit_info() needs ('seqs', 'indelfos') are then set by hand.

            Modifies <tmpline> in place and returns nothing.
            """
            for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
                region = erosion[0]  # first character of the erosion name is the region
                gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
                if self.args.chain != 'h' and region == 'd':  # light chains dummy d treatment
                    assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.chain]
                    tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0  # always erode the whole dummy d from the left
                else:  # without this branch, real genes never got an erosion length and the dummy d value was overwritten
                    max_erosion = max(0, gene_length // 2 - 2)  # now that, son, is a heuristic (floor division, so the length stays an int)
                    if region in utils.conserved_codons[self.args.chain]:  # make sure not to erode a conserved codon
                        codon_pos = self.glfo[utils.conserved_codons[self.args.chain][region] + '-positions'][tmpline[region + '_gene']]
                        if '3p' in erosion:
                            n_bases_to_codon = gene_length - codon_pos - 3
                        elif '5p' in erosion:
                            n_bases_to_codon = codon_pos
                        max_erosion = min(max_erosion, n_bases_to_codon)
                    # geometric draw (minus one, so zero is possible), capped at <max_erosion>
                    tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)
            for bound in utils.boundaries:
                mean_length = utils.scratch_mean_insertion_lengths[self.args.chain][bound]
                length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1  # avoid dividing by a zero mean
                probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
                tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

            # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
            gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
            # trim each germline sequence by the erosion lengths chosen above
            for erosion in utils.real_erosions:
                region = erosion[0]
                e_length = tmpline[erosion + '_del']
                if '5p' in erosion:
                    gl_seqs[region] = gl_seqs[region][e_length:]
                elif '3p' in erosion:
                    gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
            tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
            tmpline['indelfos'] = [utils.get_empty_indel(), ]
            utils.add_implicit_info(self.glfo, tmpline)
            assert len(tmpline['in_frames']) == 1
Esempio n. 9
def read_sequence_file(infname,
    # Reads sequences (and, when <is_data> is false, the simulation annotations) from <infname>, choosing the reader from the file suffix:
    # csv.DictReader for suffixes in <delimit_info>, utils.read_fastx() for fasta-type suffixes, utils.read_yaml_output() for '.yaml'.
    # Returns (input_info, reco_info, yaml_glfo): <input_info> is an OrderedDict keyed by uid, <reco_info> is None for data and otherwise
    # an OrderedDict of deep-copied lines keyed by uid, and <yaml_glfo> is None unless the input was a yaml file.
    # Raises Exception for an unhandled extension, a missing sequence column, multi-seq csv lines, a uid that occurs twice,
    # unexpected characters in a sequence, missing simulation info, or if no sequences were read.
    # NOTE(review): this block is damaged: the rest of the signature (is_data, n_max_queries, args, simglfo, quiet, more_input_info, ... are
    # used below but not declared here), the arguments of several calls, and the bodies of several conditionals are missing, so it does
    # not parse. Restore from the original file before use.
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    yaml_glfo = None
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(
        )  # closes on function exit. no, this isn't the best way to do this
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fastx']:
        reader = utils.read_fastx(
            n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
            queries=(args.queries if
                     (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating
    elif suffix == '.yaml':
        yaml_glfo, reader, _ = utils.read_yaml_output(
        )  # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
        # NOTE(review): an 'else:' presumably preceded this raise -- as written it would fire for every yaml file
        raise Exception('unhandled file extension %s' % suffix)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                # NOTE(review): the bodies of the two range checks are missing (presumably skip lines before the start and stop at the end -- confirm)
                if iline < args.istartstop[0]:
                if iline >= args.istartstop[1]:
            # move user-specified name/sequence columns to the standard keys
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        # if the first line has no name column, warn once and from then on use a zero-padded line counter as the id
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        # NOTE(review): the body of this suffix check is missing (presumably the line's strings get converted to lists etc. here, since <unique_ids> is indexed as a list below -- confirm)
        if suffix != '.yaml':
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        # if this uid was already read, append '-2', '-3', ... until it's unique
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid, potential_names, used_names = utils.choose_new_uid(
                    potential_names, used_names)
            # NOTE(review): the bodies of the two filters below are missing (presumably non-requested lines are skipped -- confirm)
            if args.queries is not None and uid not in args.queries:
            if args.reco_ids is not None and line[
                    'reco_id'] not in args.reco_ids:
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        # reject sequences containing anything outside <utils.alphabet>, listing the offending characters
        if any(c not in utils.alphabet for c in inseq):
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        # NOTE(review): the contents and closing brackets of this dict are missing
        input_info[uid] = {
            'unique_ids': [
            'seqs': [

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])
            for line_key in utils.input_metafile_keys.values():
                if line_key in reco_info[
                        uid]:  # this is kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below). Where i'm defining meta info more or less as any input info besides name and sequence (i think the distinction is only really important because we want to support fastas, which can't [shouldn't!] handle anything else))
                    input_info[uid][line_key] = copy.deepcopy(
                    )  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

        n_queries_added += 1
        # NOTE(review): the rest of the print call, and presumably a break, are missing here
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should effect, this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        if len(set(more_input_info) & set(input_info)) > 0:
            print '  %s found %d queries in both --infname and --queries-to-include-fname (note that we don\'t check here that they correspond to the same sequence): %s' % (
                utils.color('red', 'note:'),
                len(set(more_input_info) & set(input_info)),
                ' '.join(set(more_input_info) & set(input_info))
            )  # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
    # NOTE(review): the body of the metafile check and the end of the post_process( call are missing
    if args is not None and args.input_metafname is not None:
    post_process(input_info, reco_info, args, infname, found_seed, is_data,

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo
Esempio n. 10
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    # Reads the mutated sequences that bcr-phylo wrote to <outdir> and builds one multi-sequence annotation from them:
    # each sequence gets a deep copy of <naive_line> with its own id, the copies are merged with
    # utils.synthesize_multi_seq_line_from_reco_info(), and, for the 'selection' stype, affinities (1/kd) and the newick tree are added.
    # Target sequences are also read from the '<extrastr>_targets.fa' file. Returns the resulting line.
    # Reads the module-level <args> (extrastr, debug, stype) and <ete_path>.
    # NOTE(review): this block is damaged: several call arguments, list contents, closing brackets and conditional bodies are missing,
    # so it does not parse. Restore from the original file before use.
    seqfos = utils.read_fastx(
        '%s/%s.fasta' %
        (outdir, args.extrastr))  # output mutated sequences from bcr-phylo

    assert len(
    ) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(
        naive_line['indelfos'][0])  # would have to handle this below
    # NOTE(review): the body of this debug check is missing
    if args.debug:
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        # NOTE(review): the contents of the next two lists are missing (presumably this sequence from <sfo> -- confirm)
        mline['seqs'] = [
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    # NOTE(review): the body of this debug check is missing
    if args.debug:

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        # NOTE(review): the script name after './bin/' is missing; also, the paths are interpolated unquoted into a command run with shell=True, so they must not contain shell metacharacters
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/ --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)
        kdvals = {}  # uid : kd value, read from the csv that the command above is expected to write
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                kdvals[line['uid']] = float(line['kd'])
        if len(
                set(kdvals) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(kdvals) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(kdvals)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(kdvals))
        # affinity is the inverse of kd (raises KeyError for any uid that's missing from the kd file)
        # NOTE(review): the closing bracket of this list is missing
        final_line['affinities'] = [
            1. / kdvals[u] for u in final_line['unique_ids']
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
        final_line['tree'] = tree.as_string(schema='newick')
    # NOTE(review): the method call that takes (final_line, irandom=ievent) is missing its name
    tmp_event = RecombinationEvent(
        glfo)  # I don't want to move the function out of right now
        final_line, irandom=ievent
    )  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]
    from Bio.Seq import Seq
    # for each mutated sequence, find the target whose translation is nearest in amino acid hamming distance
    final_line['nearest_target_indices'] = []
    aa_targets = [Seq(seq).translate() for seq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        # NOTE(review): the closing bracket of this list, the argument of index(), and the append to <nearest_target_indices> are missing
        aa_hdists = [
            utils.hamming_distance(aa_t, aa_mseq, amino_acid=True)
            for aa_t in aa_targets
        imin = aa_hdists.index(
        )  # NOTE doesn't do anything differently if there's more than one min

    return final_line
Esempio n. 11
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    # Reads the mutated sequences that bcr-phylo wrote (fasta path from bcr_phylo_fasta_fname(outdir)) and builds one multi-sequence
    # annotation from them: each sequence gets a deep copy of <naive_line> with its own id, and the copies are merged with
    # utils.synthesize_multi_seq_line_from_reco_info(). For the 'selection' stype it also adds affinities (1/kd), relative affinities
    # (1/relative_kd), lambdas, nearest target indices, and the newick tree with rescaled edge lengths.
    # Target sequences are read from the '<extrastr>_targets.fa' file. Returns the resulting line.
    # Reads the module-level <args> (extrastr, debug, stype) and <ete_path>.
    # NOTE(review): this block is damaged: several call arguments, list contents, closing brackets and conditional bodies are missing,
    # so it does not parse. Restore from the original file before use.
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(
        outdir))  # output mutated sequences from bcr-phylo

    assert len(
    ) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(
        naive_line['indelfos'][0])  # would have to handle this below
    # NOTE(review): the body of this debug check is missing
    if args.debug:
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        # NOTE(review): the contents of the next two lists are missing (presumably this sequence from <sfo> -- confirm)
        mline['seqs'] = [
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    # NOTE(review): the body of this debug check is missing
    if args.debug:

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        # NOTE(review): the script name after './bin/' is missing
        cmd = './bin/ --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)
        nodefo = {}  # uid : per-node values, read from the csv that the script above is expected to write
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                # NOTE(review): the closing brace of this dict is missing
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),
                    'target_index': int(line['target_index']),
        if len(
                set(nodefo) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        # per-sequence values, in the order of <unique_ids> (affinities are inverse kd values)
        # NOTE(review): the closing brackets of the next four lists are missing
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        # divide edge lengths by the mean sequence length
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
        final_line['tree'] = tree.as_string(schema='newick')
    # NOTE(review): the method call that takes (final_line, irandom=ievent) is missing its name
    tmp_event = RecombinationEvent(
        glfo)  # I don't want to move the function out of right now
        final_line, irandom=ievent
    )  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
Esempio n. 12
def get_seqfile_info(infname,
    """ return list of sequence info from files of several types """
    # Reads sequences (and, when <is_data> is false, the simulation annotations) from <infname>: '.csv'/'.tsv' files go through
    # csv.DictReader, and utils.read_fastx() is used on the other visible path.
    # Returns (input_info, reco_info): <input_info> is an OrderedDict keyed by uid, <reco_info> is None for data and otherwise an
    # OrderedDict of deep-copied lines keyed by uid.
    # Raises Exception for a missing sequence column, multi-seq csv lines, a uid that occurs twice, unexpected characters in a
    # sequence, missing simulation info, or if no sequences were read.
    # NOTE(review): this block is damaged: the rest of the signature (is_data, glfo, simglfo, n_max_queries, args are used below but not
    # declared here), several branches, call arguments and conditional bodies are missing, so it does not parse. Restore from the
    # original file before use.

    if not is_data and glfo is None:
        print '  WARNING glfo is None, so not adding implicit info'

    suffix = os.path.splitext(infname)[1]
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
            # NOTE(review): an 'else:' presumably preceded this assert -- as written it would fire for every tsv file
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
        # NOTE(review): the branch header for non-csv/tsv files and the rest of this call are missing
        reader = utils.read_fastx(
            queries=(args.queries if args is not None else None),

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                # NOTE(review): the bodies of the two range checks are missing (presumably skip lines before the start and stop at the end -- confirm)
                if iline < args.istartstop[0]:
                if iline >= args.istartstop[1]:
            # move user-specified name/sequence columns to the standard keys
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        # if the first line has no name column, warn once and from then on use a zero-padded line counter as the id
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        # NOTE(review): <unique_ids> is indexed as a list below, but the visible code only ever sets it to a string; a conversion step appears to be missing before this check -- confirm
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            # NOTE(review): the bodies of the two filters below are missing (presumably non-requested lines are skipped -- confirm)
            if args.queries is not None and uid not in args.queries:
            if args.reco_ids is not None and line[
                    'reco_id'] not in args.reco_ids:
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        # delete every character that is in <utils.alphabet>: anything left over is unexpected (two-argument str.translate is python 2 only)
        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            raise Exception(
                'unexpected character (not among %s) in input sequence with id %s:\n  %s'
                % (utils.nukes + utils.ambiguous_bases, uid, inseq))

        # NOTE(review): the contents and closing brackets of this dict are missing
        input_info[uid] = {
            'unique_ids': [
            'seqs': [

        if n_queries_added == 0 and is_data and 'v_gene' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        # NOTE(review): the body of this check is missing (presumably the loop stops here -- confirm)
        if n_max_queries > 0 and n_queries_added >= n_max_queries:

    post_process(input_info, reco_info, args, infname, found_seed, is_data)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
# formatting necessity
def getkey(uid_list):
    """Return the key for a cluster: its unique ids concatenated with ':' between them."""
    separator = ':'
    key = separator.join(uid_list)
    return key

# creates a dictionary with keys = unique_ids and values = annotations
# (each key is the getkey() string for the cluster's 'unique_ids' entry, and each value is the csv row for that cluster)
annotations = {}
with open(args.infile.replace('.csv', '-cluster-annotations.csv')) as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:  # there's a line for each cluster
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            # NOTE(review): this snippet looks truncated here -- the body of the <if> above and the opening halves of the two calls below are missing, so it does not parse as written
            line)  # converts strings in the csv file to floats/ints/dicts/etc.
            glfo, line
        )  # add stuff to <line> that's useful, isn't written to the csv since it's redundant
        # utils.print_reco_event(line)  # print ascii-art representation of the rearrangement event
        annotations[getkey(line['unique_ids'])] = line

# sort by size
# NOTE(review): the sorted() call below is never closed in this snippet (any remaining arguments and the ')' are missing)
# sorted() over a dict iterates its keys, so <sorted_clusters> holds the getkey() strings, ordered by the length of each cluster's 'unique_ids'
sorted_clusters = sorted(annotations,
                         key=lambda q: len(annotations[q]['unique_ids']),

#### sorted_clusters = [c for c in sorted_clusters if utils.is_functional(annotations[c])] # checks if the cluster contains ANY non-functional sequences

# total size of repertoire (number sequences)
# NOTE(review): each <cluster> here is a key string, not a list of ids, so len(cluster) counts characters -- confirm whether len(annotations[cluster]['unique_ids']) was intended
n_total = sum([len(cluster) for cluster in sorted_clusters])

# add more criteria
Esempio n. 14
# NOTE(review): this copy of get_seqfile_info() has lost lines (the rest of the signature, the bodies of several <if> statements, and the tails of some multi-line expressions), so it does not parse as written. Comments below mark each visible gap.
# Purpose, as far as the visible code shows: read sequences from <infname> into <input_info> (keyed by uid), and, when <is_data> is false, also copy each line into <reco_info>.
def get_seqfile_info(infname,
    """ return list of sequence info from files of several types """

    suffix = utils.getsuffix(infname)
    # .csv/.tsv files are read with csv.DictReader using the matching delimiter
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
            # NOTE(review): presumably an `else:` preceded this assert; as written it would fire for every .tsv
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
        # NOTE(review): presumably an `else:` (non-csv/tsv input) and the first argument line of this call are missing; as written this overwrites the DictReader above
        reader = utils.read_fastx(
            n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
            queries=(args.queries if
                     (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                # NOTE(review): both <if> bodies below are missing (presumably skip lines before the start index and stop at the end index -- confirm)
                if iline < args.istartstop[0]:
                if iline >= args.istartstop[1]:
            # move user-specified name/sequence columns onto the internal keys 'unique_ids' / 'input_seqs'
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        # no name column at all: warn once (iname is None only the first time), then label every line with a zero-padded counter
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        # NOTE(review): the checks below treat line['unique_ids'] and line['input_seqs'] as lists, but the assignments above store plain strings; presumably a conversion step between them was dropped -- confirm
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        # a uid that was already read is renamed with a numeric suffix (-2, -3, ...) rather than rejected
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            # NOTE(review): the bodies of the next two <if> statements are missing (presumably they skip lines that aren't among the requested queries/reco ids -- confirm)
            if args.queries is not None and uid not in args.queries:
            if args.reco_ids is not None and line[
                    'reco_id'] not in args.reco_ids:
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        # uid may have been changed by abbreviate() since the renaming above, so check for collisions again
        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        # python 2 str.translate(None, deletechars): delete every alphabet character, so anything left over is unexpected
        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        # NOTE(review): this dict literal is truncated (the list contents and closing brackets are missing)
        input_info[uid] = {
            'unique_ids': [
            'seqs': [

        # a 'reco_id' column on the first line of a file read as data suggests it is actually simulation
        if n_queries_added == 0 and is_data and 'reco_id' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should effect, this is the only one I care about right now
                # NOTE(review): the print below is truncated, and whatever ended the loop here (presumably a `break`) is not visible
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(

    # NOTE(review): the argument list of this call is truncated
    post_process(input_info, reco_info, args, infname, found_seed, is_data,

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
Esempio n. 15
# NOTE(review): this copy of get_seqfile_info() has lost lines (an `else:` before the fasta/fastq branch, several <if> bodies, and parts of two statements), so it does not parse as written. Comments below mark each visible gap.
# Purpose, as far as the visible code shows: read <fname> (.csv, .tsv, or fasta/fastq via SeqIO) and return (input_info, reco_info), where input_info maps each unique id to {'unique_id', 'seq'} and reco_info is None when <is_data> is true.
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None):
    """ return list of sequence info from files of several types """

    suffix = os.path.splitext(fname)[1]
    # the expected header names depend on the file type: .csv uses unique_id/seq, .tsv uses name/nucleotide
    if suffix == '.csv':
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif suffix == '.tsv':
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
        # NOTE(review): presumably an `else:` is missing here, and another before the `raise` below; as written the fasta/fastq handling sits inside the .tsv branch
        if suffix == '.fasta' or suffix == '.fa':
            ftype = 'fasta'
        elif suffix == '.fastq' or suffix == '.fq':
             ftype = 'fastq'
            raise Exception('couldn\'t handle file extension for %s' % fname)
        name_column = 'unique_id'
        seq_column = 'seq'
        # for fasta/fastq, <reader> is built up as a list of dicts so the loop further down can treat it like csv rows
        reader = []
        n_fasta_queries = 0
        for seq_record in SeqIO.parse(fname, ftype):

            # if command line specified query or reco ids, skip other ones
            # NOTE(review): the left operand of `not in` and the body of this <if> are missing
            if queries is not None and not in queries:
            # if reco_ids is not None and line['reco_id'] not in reco_ids:  # probably no reco ids in a fasta file
            #     continue

            # NOTE(review): the statement that appends a new dict to <reader> is not visible, and the right-hand side of the next assignment is missing
            reader[-1][name_column] =
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            # NOTE(review): body missing (presumably stops reading once n_max_queries records have been read -- confirm)
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries = 0
    for line in reader:
        # csv files that lack the 'unique_id' header fall back to the name/nucleotide headers
        if '.csv' in fname and name_column not in line:  # hackey hackey hackey
            name_column = 'name'
            seq_column = 'nucleotide'
        unique_id = line[name_column]
        if ':' in unique_id:
            raise Exception('found a \':\' in sequence id \'%s\' -- you\'ll have to replace it with something else, as we use \':\'s internally to concatenate sequence ids' % unique_id)

        # if command line specified query or reco ids, skip other ones
        # NOTE(review): the bodies of the next two <if> statements are missing (per the comment above, presumably they skip the line)
        if queries is not None and unique_id not in queries:
        if reco_ids is not None and line['reco_id'] not in reco_ids:

        input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[seq_column]}
        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s -- if this is data add option --is-data' % fname)
            reco_info[unique_id] = dict(line)
            # NOTE(review): this indexes line['indels'] as a dict, but csv.DictReader yields strings; presumably a conversion step is missing or happens elsewhere -- confirm
            # when an indel-reversed sequence is present, it replaces 'seq' in reco_info (input_info keeps the sequence as read)
            if 'indels' in line and line['indels']['reversed_seq'] != '':  # TODO unhackify this
                reco_info[unique_id]['seq'] = line['indels']['reversed_seq']
            if 'indels' not in line:  # TODO unhackify this
                reco_info[unique_id]['indels'] = None
            if glfo is not None:
                utils.remove_implicit_info(reco_info[unique_id], multi_seq=False)
                utils.add_implicit_info(glfo, reco_info[unique_id], multi_seq=False)  # each seq is on its own line in the file
        n_queries += 1
        # NOTE(review): body missing (presumably stops the loop once n_max_queries lines have been kept -- confirm)
        if n_max_queries > 0 and n_queries >= n_max_queries:

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))
    return (input_info, reco_info)
Esempio n. 16
# NOTE(review): this snippet has lost lines (the docstring quotes, at least one `try:`, and the `try:`/`except` around the length assert), so it does not parse as written. No `return` is visible either, so what the function hands back to its caller cannot be determined from here.
# Visible flow: write <lseq> to a temporary fasta file, run `partis partition` on it through the shell, read the first row of the resulting cluster-annotations csv, repair the naive sequence, then delete the temporary files.
def partis_naive_seq(lseq, fnam):
    Given a number of sequences infer the naive sequence using partis.
    # Specify filenames:
    # (a large random integer is used as the shared prefix for the temporary input/output/log files)
    pretty_random_fnam = str(random.randint(1, 10**100))
    inpf = pretty_random_fnam + '_input'
    outf = pretty_random_fnam + '_output'
    # Write input fasta file for partis:
    # (each sequence is named by its index in <lseq>)
    with open(TMPDIR+'/'+inpf+'.fa', 'w') as fho:
        for i, s in enumerate(lseq):
            fho.write('>{}\n{}\n'.format(str(i), s))
    # Run partis:
    # NOTE(review): the command is assembled by string formatting and run through the shell, so partis_path, TMPDIR and the args values must not contain shell metacharacters
    cmd = '{}/bin/partis partition --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf)
    # os.system(cmd)  # Print partis STDOUT to screen
    os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam))

        # NOTE(review): the indentation jump here and the `except` further down suggest an enclosing `try:` that is missing
        # Read the partis output file and extract the naive sequence:
        with open(TMPDIR+'/'+outf+'-cluster-annotations.csv') as fh:
            reader = csv.DictReader(fh)
            data = list(reader)
        # assert(len(data) == 1)  # There should really only be one clonal family, but there often are, so just take the first (largest)
        # Extract germline bounds info and trim the naive DNA sequence:
            utils.process_input_line(data[0])       # Process dataframe row
            fnam_base = fnam.split('_partitions')[0].split('/')
            #glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS)
            glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)
            utils.add_implicit_info(glfo, data[0])  # Adding germline infor
        except Exception as e:
            # the error is printed and then re-raised, so a failure still propagates to the caller
            print e
            raise e

        naiveDNA = data[0]['naive_seq'][:]
        first_lseq = data[0]['input_seqs'][:][0]
        # span from the start of the V region to the end of the J region, taken from 'regional_bounds'
        vj_bounds = (data[0]['regional_bounds']['v'][0], data[0]['regional_bounds']['j'][1])
        naiveDNA = repair_new_naive(naiveDNA[:], naiveDNA[:], vj_bounds)
        first_lseq = repair_new_naive(first_lseq, naiveDNA[:], vj_bounds)
            # NOTE(review): presumably wrapped in a try/except whose keywords are missing; the prints below read like the failure branch of the assert
            assert(len(first_lseq) == len(naiveDNA))
            print 'len(first_lseq) != len(data[0]["naive_seq"])'
            print len(first_lseq)
            print first_lseq
            print len(naiveDNA)
            print naiveDNA
        # If the inferred naive sequence contains a stop codon replace it by the input sequence codon:
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            print 'Found stop codon in inferred naive sequnce, will replace with input sequence codon.'
            print 'Before replacement:', naiveDNA
            naiveDNA_l = list(naiveDNA[:])
            # step through the V-J span in steps of 3 and overwrite any codon that translates to '*' with the same positions from the first input sequence
            for codon in range(vj_bounds[0], vj_bounds[1], 3):
                if '*' == str(Seq(naiveDNA[codon:codon+3], generic_dna).translate()):
                    naiveDNA_l[codon:codon+3] = first_lseq[codon:codon+3]
            naiveDNA = ''.join(naiveDNA_l)
            print 'After replacement:', naiveDNA
        if naiveDNA == first_lseq:
            print 'Complaining to say naiveDNA == first_lseq (nothing bad just to be sure the repair is not just replacing the naive sequence with the input entirely)'

        # Clean up:
        # NOTE(review): runs `rm -r` through the shell on glob patterns built from TMPDIR and the random prefix, including anything matching the prefix under ./_output
        os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR, pretty_random_fnam, pretty_random_fnam))
Esempio n. 17
# NOTE(review): this snippet has lost lines (the docstring quotes, `try:`/`else:`/`except:` keywords, `continue` bodies, and parts of the deduplication loops), so it does not parse as written. Comments below mark each visible gap.
# Visible flow: read every row of the cluster-annotations csv <fnam>, keep the sequences in each row that pass QC, deduplicate them at the amino acid level, and return two parallel lists (sequences_i, info_i) with one entry per retained row.
def extract_seqs(fnam):
    Reads a partis cluster-annotations file and extracts relevant information and sequences.
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        reader = csv.DictReader(fh)
        data = list(reader)

    sequences_i = list()
    info_i = list()

    if args.allele_finding:
        fnam_base = fnam.split('_partitions')[0].split('/')
        glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS)
        # NOTE(review): presumably an `else:` is missing before the next line; as written the default germline set always overwrites the one read above
        glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)
    for row in data:
        # Process the partis data row and add germline information:
            # NOTE(review): the `try:` is missing, and so, presumably, is a utils.process_input_line(row) call (compare the commented-out version further down)
            # Read default germline info
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            # NOTE(review): this tests membership against the exception object itself rather than str(e) -- confirm it matches the intended message check
            if 'failed annotation' not in e:
                # print('First skip')
                # print(e)
                print 'Reading from'
                # NOTE(review): fnam_base is only assigned when args.allele_finding is set, so this print depends on that branch having run
                print '{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1])
                print e
            # NOTE(review): nothing visible here skips the row (per the comment on the except line, presumably a `continue` is missing)

#        # Process the partis data row and add germline information:
#        try:
#            utils.process_input_line(row)
#            utils.add_implicit_info(glfo, row)
#        except:  # Skip rows that cannot be processed
#            continue

        # Extract the full N padded naive sequence,
        # and find the v -and j gene bound on this naive sequence:
        # NOTE(review): cdr3_bounds is computed but not used anywhere in the visible code
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        naiveDNA = row['naive_seq']
        # Skip naive sequences too short or with stop codons:
        # NOTE(review): the body of this <if> is missing (per the comment above, presumably it skips the row)
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        # There has been a name change and this try/except is meant to provide backwards compatability:
        # NOTE(review): the try/except keywords are missing; the two assignments below are the new-name attempt and the old-name fallback
            lseq = row['input_seqs'][:]
            lseq = row['seqs'][:]
        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))
        # Only keep sequences without indels and stop codons and minimum length amino acid length (QC):
        ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i]  <-- No indels
        ### stop_seq[i]  <-- No partis annotated stops (there seems still to be stops after these are removed though)
        ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)  <-- Checks whether the sequence is long enougth or have stop codons
        # keep_idx is a 0/1 flag per sequence, parallel to lseq
        keep_idx = [1 if ((ir_lseq[i] == '' or lseq[i] == ir_lseq[i]) and stop_seq[i] is False and repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)) else 0 for i in range(len(lseq))]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep == 1]
        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
        # And mutation frequencies:
        mut_freqs = [s for s, keep in zip(row['mut_freqs'], keep_idx) if keep == 1]
        assert(len(mut_freqs) == len(lseq))
        # Convert frequency to counts:
        # (count = frequency times the sequence length after stripping leading/trailing 'N's, rounded to the nearest integer)
        Nmuts = [int(round(float(t[0])*len(t[1].strip('N')))) for i, t in enumerate(zip(mut_freqs, lseq))]

        # Deduplicate AAseqs and lseq according to the duplications on amino acid level:
        lAAseq_dict = dict()
        lseq_unique = list()
        for i, aa in enumerate(lAAseq):
            # NOTE(review): lines are missing in this loop; as written an index list is only assigned for an <aa> that is already a key, which can never happen with an initially empty dict
            if aa in lAAseq_dict:
                lAAseq_dict[aa] = [i]
                lseq_unique.append(repair_seq(lseq[i][:], naiveDNA[:], vj_bounds))
        assert(len(lAAseq_dict) == len(lseq_unique))
        # Make the deduplicated sequence list and the mutation rates:
        lAAseq_dedup = list()
        Nmuts_dedup = list()
        for aa, idxs in lAAseq_dict.items():
            Nmut_list = [float(Nmuts[i]) for i in idxs]
            # NOTE(review): the statements that fill lAAseq_dedup and Nmuts_dedup are not visible
        assert(len(lAAseq_dedup) == len(Nmuts_dedup))
        assert(len(lAAseq_dedup) == len(lseq_unique))

        # Exclude small clonal families after all the QC and deduplication:
        # NOTE(review): the body of this <if> is missing (per the comment above, presumably it skips the row)
        if len(lAAseq_dedup) < args.MIN_OBS:

        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts_dedup[:],
                       'AAseqs': lAAseq_dedup[:], 'DNAseqs': lseq_unique[:]})
    return(sequences_i, info_i)
Esempio n. 18
# NOTE(review): this snippet has lost lines (the parameter list, the docstring quotes, `else:`/`continue` keywords, and the heads/tails of many multi-line expressions), so it does not parse as written. Comments below mark each visible gap.
def write_partis_data_from_annotations(
    Function to read partis annotations csv

    @param path_to_annotations: path to annotations files
    @param metadata: csv file of metadata; if None defaults will be used for chain/species
    @param filters: dictionary of lists with keys as column name and items as those values of the column variable to retain;
        filters out families, e.g., {'locus': ['igk']}, etc.
    @param seq_filters: same as filters, but for sequences, e.g., {indel_reversed_seqs': [''], 'in_frames': [False]} will
        only retain sequences that are out of frame and did not have an indel
    @param min_clonal_family_size: minimum clonal family size
    @param min_seq_len: minimum sequence length
    @param max_mut_pct: maximum mutation percentage
    @param min_mut_pct: minimum mutation percentage
    @param clone_str: string for identifying clones (useful if merging annotations from multiple datasets)
    @param region: B-cell receptor region ('v', 'd', 'j', or 'vdj')
    @param germline_family: for performing cross validation ('v', 'd', or 'j')

    @write genes to output_genes and seqs to output_seqs

    # validate the two enumerated arguments up front
    families = ['v', 'd', 'j']
    if germline_family not in families:
        raise ValueError("Invalid germline_family: %s. Must be one of %s" %
                         (germline_family, families))

    regions = ['v', 'd', 'j', 'vdj']
    if region not in regions:
        raise ValueError("Invalid region: %s. Must be one of %s" %
                         (region, regions))

    # partis is expected in a 'partis' directory next to this file; its python/ dir is put on sys.path so the imports below resolve
    PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
    sys.path.insert(1, PARTIS_PATH + '/python')
    from utils import add_implicit_info, process_input_line
    import glutils

    # NOTE(review): the arguments of this call are missing
    partition_info = get_partition_info(

    with open(output_genes, 'w') as genes_file, open(output_seqs,
                                                     'w') as seqs_file:
        gene_writer = csv.DictWriter(genes_file,
                                     ['germline_name', 'germline_sequence'])

        # NOTE(review): the initial contents of seq_header are missing
        seq_header = [

        # every key of the first partition_info entry becomes an extra column in the sequence file
        for key, _ in partition_info[0].iteritems():
            seq_header += [key]

        seq_writer = csv.DictWriter(seqs_file, seq_header)
        for data_idx, data_info in enumerate(partition_info):
            # NOTE(review): the end of this <if> and its body are missing; the visible condition is true when any filtered key of data_info has a value outside its allowed list
            if any([
                    data_info[key] not in values
                    for key, values in filters.iteritems()
            # NOTE(review): the remaining arguments of read_glfo are missing
            glfo = glutils.read_glfo(data_info['germline_file'],
            with open(data_info['annotations_file'], "r") as csvfile:
                reader = csv.DictReader(csvfile)
                for idx, line in enumerate(reader):
                    # NOTE(review): the body of this <if> is missing
                    if line['v_gene'] == '':
                        # failed annotations

                    # add goodies from partis
                    # NOTE(review): process_input_line is imported above but no call to it is visible; presumably one was dropped before add_implicit_info -- confirm
                    add_implicit_info(glfo, line)
                    n_seqs = len(line['input_seqs'])
                    # NOTE(review): the body of this <if> is missing
                    if n_seqs < min_clonal_family_size:
                        # don't take small clonal families---for data quality purposes

                    if region == 'vdj':
                        gl_seq = line['naive_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['seqs']]
                        # NOTE(review): presumably an `else:` is missing here; as written the V-only values below always overwrite the 'vdj' ones
                        gl_seq = line['v_gl_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['v_qr_seqs']]

                    # idx_list collects one collection of passing sequence indices per filter; they are intersected below
                    # NOTE(review): the statements that wrap each of the three comprehension fragments below and add them to idx_list are missing
                    idx_list = []
                    # frequency filter
                            i for i, val in enumerate(line['mut_freqs'])
                            if val < max_mut_pct and val >= min_mut_pct
                    # sequence length filter
                    # (python 2 str.translate(None, 'n') deletes every 'n', so the length compared excludes 'n' characters)
                            i for i, val in enumerate(all_seqs)
                            if len(val.translate(None, 'n')) > min_seq_len
                    for key, values in seq_filters.iteritems():
                                i for i, val in enumerate(line[key])
                                if val in values

                    good_seq_idx = set.intersection(*idx_list)
                    # NOTE(review): the body of this <if> is missing
                    if not good_seq_idx:
                        # no sequences after filtering... skip

                    # clone name = 'clone' + dataset index, row index within the annotations file, and clone_str
                    gl_name = 'clone{}-{}-{}'.format(
                        *[data_idx, idx, clone_str])
                    # NOTE(review): the call that these two dict entries belong to is missing
                        'germline_name': gl_name,
                        'germline_sequence': gl_seq,

                    for good_idx in good_seq_idx:
                        # NOTE(review): base_dict is truncated (its keys and closing brace are missing), and no write through seq_writer is visible
                        base_dict = {
                            '-'.join([gl_name, line['unique_ids'][good_idx]]),

                        # copy the per-dataset metadata onto every sequence row
                        for key, value in data_info.iteritems():
                            base_dict[key] = value

Esempio n. 19
# NOTE(review): this snippet has lost lines (the docstring quotes, `try:` keywords, and the tails of several multi-line expressions including the final `return`), so it does not parse as written. Comments below mark each visible gap.
# Visible flow: write <seq> to a temporary fasta file, run `partis annotate` on it through the shell, validate the first annotation, rebuild full-length naive and input sequences, repair stop codons in the naive sequence, and clean up the temporary files.
def run_partis(seq):
    Infer VDJ genes and the naive sequence using partis.
    # Specify filenames:
    # (a large random integer is used as the shared prefix for the temporary input/output/log files)
    pretty_random_fnam = str(random.randint(1, 10**100))
    inpf = pretty_random_fnam + '_input'
    outf = pretty_random_fnam + '_output'
    # Write input fasta file for partis:
    with open(TMPDIR + '/' + inpf + '.fa', 'w') as fho:
        fho.write('>{}\n{}\n'.format('input_sequence', seq))
    # Run partis:
    # NOTE(review): the command is assembled by string formatting and run through the shell, so partis_path, TMPDIR and the args values must not contain shell metacharacters
    cmd = '{}/bin/partis annotate --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(
        partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf)
    os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam))

        # NOTE(review): the indentation jump here suggests an enclosing `try:` (or similar) that is missing
        # Read the partis output file and extract the naive sequence:
        with open(TMPDIR + '/' + outf + '.csv') as fh:
            reader = csv.DictReader(fh)
            data = list(reader)
        ann = data[0]
        # Extract germline bounds info and trim the naive DNA sequence:
            # NOTE(review): the `try:` matching the `except` below is missing; <glfo> is not assigned anywhere in this function, so presumably it is a module-level global
            utils.process_input_line(ann)  # Process dataframe row
            utils.add_implicit_info(glfo, ann)  # Adding germline infor
        except Exception as e:
            # the error is printed and then re-raised, so a failure still propagates to the caller
            print e
            raise e

        # reject inputs that cannot be handled: annotated stop codons, too much missing at either end, or indels
        if ann['stops'] is True:
            raise Exception(
                'Input sequence contain stop codon. This is no valid.')
        elif ann['v_5p_del'] > 30 or ann['j_3p_del'] > 12:
            raise Exception(
                'Incomplete input sequence error. 5-prime end missing {} nt and 3-prime missing {} nt. Max allowed is 30 and 12, respectively.'
                .format(ann['v_5p_del'], ann['j_3p_del']))
        elif ann['indelfos'][0]['indels']:
            # NOTE(review): the closing parenthesis of this raise is missing
            raise Exception(
                'Input sequence contains indels, this is currently not supported.'

        # Extract full size VDJ sequence for both the inferred naive and the input:
        full_gl_v = glfo['seqs']['v'][ann['v_gene']]  # Germline V
        full_gl_j = glfo['seqs']['j'][ann['j_gene']]  # Germline J

        gl_v_5p_del = full_gl_v[:ann[
            'v_5p_del']]  # 5-prime not included in input
        gl_j_3p_del = full_gl_j[(
            len(full_gl_j) -
            ann['j_3p_del']):]  # 3-prime not included in input
        #assert full_gl_v[ann['v_5p_del']:] == ann['v_gl_seq']
        naiveDNA = gl_v_5p_del + ann[
            'naive_seq'] + gl_j_3p_del  # Add the missing positions
        full_input_seq = 'N' * ann['v_5p_del'] + ann['input_seqs'][
            0] + 'N' * ann['j_3p_del']  # N pad the input sequence
        assert (len(naiveDNA) == len(full_input_seq))

        # Remove the untranslated end:
        # (trim each sequence to a multiple of 3 so it translates as whole codons)
        if len(naiveDNA) % 3 != 0:
            naiveDNA = naiveDNA[0:-(len(naiveDNA) % 3)]
        if len(full_input_seq) % 3 != 0:
            full_input_seq = full_input_seq[0:-(len(full_input_seq) % 3)]
        if len(naiveDNA) != len(full_input_seq):
            raise Exception(
                'Sequences not equally long after trimming.\nInput: {}\nNaive: {}\n.'
                .format(full_input_seq, naiveDNA))

        # Replace Ns in input sequence with naive DNA bases:
        full_input_seq = repair_seq(full_input_seq, naiveDNA[:])

        # If the inferred naive sequence contains a stop codon replace it by the input sequence codon:
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            print 'Found stop codon in inferred naive sequnce, will replace with input sequence codon.'
            print 'Before replacement:', naiveDNA
            naiveDNA_l = list(naiveDNA[:])
            for codon in range(0, len(naiveDNA), 3):
                # NOTE(review): the next three lines are truncated (the end of the <if> condition and the end of the slice are missing)
                if '*' == str(
                        Seq(naiveDNA[codon:codon + 3],
                    naiveDNA_l[codon:codon + 3] = full_input_seq[codon:codon +
            naiveDNA = ''.join(naiveDNA_l)
            print 'After replacement:', naiveDNA
        # a stop codon that survives the replacement above is fatal
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            raise Exception('Naive sequence could not be repaired.')
        if naiveDNA == full_input_seq:
            print 'Warning: input sequence is identical to the inferred naive sequence.'
        # Clean up:
        # NOTE(review): the format arguments of this call are truncated; it runs `rm -r` through the shell on glob patterns
        os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR,
    # NOTE(review): the return expression is truncated after the d gene
    return (naiveDNA, full_input_seq, (ann['v_gene'], ann['d_gene'],
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)
        cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3)
        print ''
        print '  should match the above:'
        print '    %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]]
        print '    %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]]
        print ''

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
Esempio n. 21
# NOTE(review): this snippet has lost lines (the bodies of two `if args.debug:` statements, a `try:`, and the closing of a dict literal), so it does not parse as written. Comments below mark each visible gap.
# Visible flow: read the mutated sequences written by bcr-phylo, build one single-sequence annotation per sequence by copying <naive_line>, merge them into one multi-sequence line, attach kd/affinity/tree info when args.stype is 'selection', attach the target sequences, and return the merged line.
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    # NOTE(review): the body of this <if> is missing
    if args.debug:
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        # start from a deep copy of the naive annotation and overwrite the per-sequence fields
        mline = copy.deepcopy(naive_line)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
            # NOTE(review): the `try:` matching the bare `except:` below is missing
            utils.add_implicit_info(glfo, mline)
        except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print 'implicit info adding failed for ievent %d in %s' % (ievent, outdir)
            lines = traceback.format_exception(*sys.exc_info())
            print utils.pad_lines(''.join(lines))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    # NOTE(review): the body of this <if> is missing
    if args.debug:

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd an nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            # NOTE(review): no script name follows './bin/' in this command string -- confirm against the original source
            cmd = './bin/ --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)
        # nodefo maps each uid in the kd csv file to its kd, relative kd, lambda (None when the column is absent) and target index
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                # NOTE(review): the closing brace of this dict literal is missing
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        # uids missing from the kd file are only reported here; the nodefo[u] lookups just below will then raise KeyError for them
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        # affinities are stored as the reciprocal of kd
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        # edge lengths are scaled by 1 / (mean sequence length); presumably this converts mutation counts to per-site values -- confirm
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    # a throwaway RecombinationEvent is created only so that its set_reco_id() method can be called on <final_line>
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
Esempio n. 22
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None, name_column=None, seq_column=None, seed_unique_id=None, abbreviate_names=False):
    """ Return sequence info read from a csv/tsv or fasta/fastq file.

    Arguments:
        fname: input file; the extension picks the parser (.csv/.tsv via csv.DictReader, .fa/.fasta/.fq/.fastq via SeqIO.parse)
        is_data: if False, every row must also carry simulation info (a 'v_gene' column), which is returned in <reco_info>
        glfo: germline info passed to utils.add_implicit_info() for simulation rows (skipped, with a warning, if None)
        n_max_queries: stop after this many sequences (values <= 0 mean no limit)
        queries: if set, only keep sequences whose id is in this collection
        reco_ids: if set, only keep rows whose 'reco_id' is in this collection
        name_column, seq_column: headers to look for in the file (default to 'unique_id' and 'seq')
        seed_unique_id: if set, this id has to be among the sequences that get read
        abbreviate_names: if True, replace each id with whatever abbreviate() returns for it

    Returns:
        (input_info, reco_info): <input_info> is an OrderedDict mapping each id to {'unique_id' : ..., 'seq' : ...};
        <reco_info> is None if <is_data>, otherwise an OrderedDict mapping each id to a deep copy of its row.

    Raises:
        Exception: unhandled file extension, missing mandatory headers, forbidden character in an id, duplicate id,
        missing simulation info, no sequences read, or <seed_unique_id> not found.
    """

    # WARNING defaults for <name_column> and <seq_column> also set in partis (since we call this from places other than partis, but we also want people to be able set them from the partis command line)
    internal_name_column = 'unique_id'  # key we use in the internal dictionaries
    internal_seq_column = 'seq'
    if name_column is None:  # header we expect in the file
        name_column = internal_name_column
    if seq_column is None:
        seq_column = internal_seq_column

    if not is_data and glfo is None:
        print('  WARNING glfo is None, so not adding implicit info')

    suffix = os.path.splitext(fname)[1]
    if len(re.findall(r'\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:  # the regex matched, but it's something like '.csvx'
            assert False
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        if suffix == '.fasta' or suffix == '.fa':
            ftype = 'fasta'
        elif suffix == '.fastq' or suffix == '.fq':
            ftype = 'fastq'
        else:
            raise Exception('couldn\'t handle file extension for %s' % fname)
        # build a list of dicts that looks like what csv.DictReader would have given us
        reader = []
        n_fasta_queries = 0
        for seq_record in SeqIO.parse(fname, ftype):

            # if command line specified query or reco ids, skip other ones (can't have/don't allow simulation info in a fast[aq])
            if queries is not None and seq_record.name not in queries:
                continue

            reader.append({name_column : seq_record.name, seq_column : str(seq_record.seq).upper()})
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if abbreviate_names:
        potential_names = list(string.ascii_lowercase)
    for line in reader:
        if name_column not in line or seq_column not in line:
            raise Exception('mandatory headers \'%s\' and \'%s\' not both present in %s    (you can set column names with --name-column and --seq-column)' % (name_column, seq_column, fname))
        if name_column != internal_name_column or seq_column != internal_seq_column:
            translate_columns(line, {name_column : internal_name_column, seq_column: internal_seq_column})
        # NOTE(review): rows are used exactly as the reader yields them (csv values are strings); if add_implicit_info() below needs typed values, a conversion step may be missing here -- confirm against upstream
        unique_id = line[internal_name_column]

        ## Actually deal with colons properly since they come up VERY OFTEN in sequence IDs
        unique_id = unique_id.replace(":", "_")
        if any(fc in unique_id for fc in utils.forbidden_characters):
            raise Exception('found a forbidden character (one of %s) in sequence id \'%s\' -- sorry, you\'ll have to replace it with something else' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), unique_id))

        if abbreviate_names:
            unique_id = abbreviate(used_names, potential_names, unique_id)

        # if command line specified query or reco ids, skip other ones
        if queries is not None and unique_id not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        if unique_id in input_info:
            raise Exception('found id %s twice in file %s' % (unique_id, fname))

        if seed_unique_id is not None and unique_id == seed_unique_id:
            found_seed = True

        input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[internal_seq_column]}

        if n_queries == 0 and is_data and 'v_gene' in line:
            print('  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % fname)

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % fname)
            reco_info[unique_id] = copy.deepcopy(line)
            reco_info[unique_id]['unique_id'] = unique_id  # in case we're abbreviating
            if glfo is not None:
                utils.add_implicit_info(glfo, reco_info[unique_id], multi_seq=False, existing_implicit_keys=('cdr3_length', ))  # single seqs, since each seq is on its own line in the file

        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))
    if seed_unique_id is not None and not found_seed:
        raise Exception('couldn\'t find seed %s in %s' % (seed_unique_id, fname))

    return (input_info, reco_info)
#!/usr/bin/env python
# Minimal example of reading partis output: one annotation csv and one partition csv.
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')  # has to happen before the partis imports below
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print('first parse an annotation csv file:')
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.add_implicit_info(glfo, line)

print('then parse a partition csv file:')
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
Esempio n. 24
# Collect statistics on clonal families from partis annotation csv files.
#
# NOTE(review): lines appear to be missing from this block, so it does not parse as written:
#   - the parameter list is cut off after <path_to_annotations> (the text below also describes
#     metadata, use_np, use_immunized and locus)
#   - the docstring text has lost its quote delimiters
#   - several calls/lists are left unclosed, and several `if` statements have no body
# The comments added here only describe what is visible; the missing lines need to be restored
# from the original before this can be used.
def _get_clonal_family_stats(path_to_annotations,
    get data statistics from partis annotations

    @param path_to_annotations: path to partis annotations
    @param metadata: path to partis metadata 
    @param use_np: use nonproductive seqs?
    @param use_immunized: for Cui data, use immunized mice?
    @param locus: which locus to use

    @return list of dicts with clonal family sizes and naive seqs from processed data

    # NOTE(review): the arguments of this call are missing
    partition_info = get_partition_info(

    # <good_seq> takes an annotation line and is later enumerated to pick the indices of sequences to keep
    if use_np:
        # return only nonproductive sequences
        # here "nonproductive" is defined as having a stop codon or being
        # out of frame or having a mutated conserved cysteine
        good_seq = lambda seqs: seqs['stops'] or not seqs['in_frames'] or seqs[
        # NOTE(review): the end of the lambda above is missing, and the assignment below presumably belonged to an `else:` branch
        # return all sequences
        good_seq = lambda seqs: [True for seq in seqs['seqs']]

    all_germline_dicts = []
    for data_idx, data_info in enumerate(partition_info):
        # NOTE(review): neither of the next two `if` statements has a body (presumably they skipped this entry -- confirm)
        if use_immunized and data_info['group'] != 'immunized':
        if not locus or data_info['locus'] != locus:
        # make the partis python modules importable (expects a 'partis' directory next to this file)
        PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
        sys.path.insert(1, PARTIS_PATH + '/python')
        from utils import add_implicit_info, process_input_line
        import glutils
        # NOTE(review): the remaining arguments of read_glfo() are missing
        glfo = glutils.read_glfo(data_info['germline_file'],
        with open(data_info['annotations_file'], "r") as csvfile:
            reader = csv.DictReader(csvfile)
            for idx, line in enumerate(reader):
                # add goodies from partis
                if len(line['input_seqs']) == 0:
                    # sometimes data will have empty clusters
                # NOTE(review): <process_input_line> is imported above but never called in the visible code
                add_implicit_info(glfo, line)
                # indices of the sequences that <good_seq> flags as usable
                good_seq_idx = [
                    i for i, is_good in enumerate(good_seq(line)) if is_good
                if not good_seq_idx:
                    # no nonproductive sequences... skip
                # NOTE(review): the statement that adds to <all_germline_dicts> is missing; only this fragment of it is left
                        '-'.join([line['v_gene'], str(idx)]),

    return all_germline_dicts
Esempio n. 25
    # Build one multi-sequence annotation (<final_line>) from the sequences in <sfos>, using the
    # naive annotation <naive_line> as a template, then attach per-sequence affinity info from
    # <nodefo>, a rescaled copy of the tree, and the target sequences.
    #
    # NOTE(review): lines appear to be missing from this block, so it does not parse as written
    # (the parameter list is cut off, a `try:` is gone, and several call heads, closing brackets and
    # `if` bodies are missing). The comments added here only describe what is visible; the missing
    # lines need to be restored from the original before this can be used.
    def get_mature_line(sfos,
        # NOTE(review): the expression inside this len() is missing
        assert len(
        ) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
        assert not indelutils.has_indels(
            naive_line['indelfos'][0])  # would have to handle this below
        # NOTE(review): this `if` has no body
        if args.debug:
        # one single-sequence annotation per entry in <sfos>, keyed by sequence name
        reco_info = collections.OrderedDict()
        for sfo in sfos:
            mline = utils.get_non_implicit_copy(naive_line)
            del mline['tree']
            mline['unique_ids'] = [sfo['name']]
            mline['seqs'] = [sfo['seq']]
            # NOTE(review): the list element is missing here
            mline['input_seqs'] = [
            ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
            mline['duplicates'] = [[]]
            reco_info[sfo['name']] = mline
            # NOTE(review): the `try:` that pairs with the `except:` below is missing
                utils.add_implicit_info(glfo, mline)
            except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
                print 'implicit info adding failed for ievent %d in %s' % (
                    ievent, outdir)
                lines = traceback.format_exception(*sys.exc_info())
                # NOTE(review): the arguments of pad_lines() are missing
                print utils.pad_lines(
                )  # NOTE this will still crash on the next line if implicit info adding failed
        final_line = utils.synthesize_multi_seq_line_from_reco_info(
            [sfo['name'] for sfo in sfos], reco_info)

        # work on a copy so the caller's tree isn't modified
        ftree = copy.deepcopy(dtree)
        if locus is not None:

            # append the locus to a uid
            def ltr(u):
                return u + '-' + locus

            # re-key <nodefo> with the locus-suffixed uids
            new_nodefo = {}
            for u_old in nodefo:
                new_nodefo[ltr(u_old)] = nodefo[u_old]
            nodefo = new_nodefo
            # NOTE(review): the head of the call that takes these (old uid, new uid) pairs is missing
                                       [(u, ltr(u))
                                        for u in final_line['unique_ids']])
            final_line['unique_ids'] = [
                ltr(u) for u in final_line['unique_ids']
            assert len(sfos) == len(final_line['unique_ids'])
            # swap the naive uid's base string for each sequence's name in the paired uids
            for iseq, sfo in enumerate(sfos):
                naive_id = naive_line['unique_ids'][0]
                assert naive_id.count('-') == 1
                bstr = naive_id.replace('-' + locus, '')
                pids = final_line['paired-uids'][iseq]
                # NOTE(review): the arguments of find() and split() are missing
                assert len(pids) == 1 and pids[0].find(
                ) == 0 and pids[0].count('-') == 1 and pids[0].split(
                )[1] in utils.loci  # if uid is xxx-igh, paired id shoud be e.g. xxx-igk
                final_line['paired-uids'][iseq] = [
                    p.replace(bstr, sfo['name']) for p in pids

        # NOTE(review): this `if` has no body
        if args.debug:

        # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
        if len(
                set(nodefo) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        # affinity is the inverse of kd
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        # divide branch lengths by the mean sequence length
        ftree.scale_edges(1. / numpy.mean([len(s)
                                           for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree),
        final_line['tree'] = ftree.as_string(schema='newick')

        # NOTE(review): the constructor arguments and the head of the following call are missing
        tmp_event = RecombinationEvent(
        )  # I don't want to move the function out of right now
            final_line, irandom=ievent
        )  # not sure that setting <irandom> here actually does anything
        final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]
        return final_line
Esempio n. 26
# NOTE(review): lines appear to be missing from this function, so it does not parse as written
# (`try:`/`else:` keywords, the bodies of several skip checks, and the statements that fill the
# *_dedup lists are gone). The comments added below only describe what is visible; the missing
# lines need to be restored from the original before this can be used.
def extract_seqs(fnam, uid2iso):
    '''Reads a partis cluster-annotations file and extracts relevant information and sequences.

    fnam: path to the annotation csv; the part before '_partitions' is used to pick the locus and the germline directory.
    uid2iso: mapping from sequence uid to isotype.

    Returns (sequences_i, info_i): one ['naive_seq', <naive amino acid sequence>] entry and one info dict per row that is kept.
    '''
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        reader = csv.DictReader(fh)
        data = list(reader)

    sequences_i = list()
    info_i = list()
    for row in data:
        fnam_base = fnam.split('_partitions')[0]
        cwd = os.getcwd()
        # Pick the locus from the file name.
        # NOTE(review): the `else:` before the 'igh' assignment is missing
        if 'IgK' in fnam_base:
            locus = 'igk'
        elif 'IgL' in fnam_base:
            locus = 'igl'
            locus = 'igh'
        # Process the partis data row and add germline information:
        # NOTE(review): the `try:` that pairs with the `except` below is missing, and the handler presumably skipped the row after printing -- confirm
            # Read default germline info
            glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(cwd, fnam_base), locus=locus)
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            print('First skip')

        # One uid list per sequence: its duplicates (if any) followed by its own uid
        uids = [dl + [u] if (len(dl) > 0 and dl[0] != '') else [u] for dl, u in zip(row['duplicates'], row['unique_ids'])]

        # Extract the full N padded naive sequence,
        # and find the v -and j gene bound on this naive sequence:
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        # NOTE(review): nothing after the print actually skips the row here
        if row['invalid'] is True or (cdr3_bounds[0]-cdr3_bounds[1])%3 != 0:
            print('Invalid clonal family, skipping.')

        naiveDNA = row['naive_seq']
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:  # Skip naive sequences too short or with stop codons:
            # print('Third skip')
            if len(row['input_seqs'][:]) > 100:
                print('Bad naive even after 100 seqs in clonal family.')
                repair_seq_debug(naiveDNA, naiveDNA, vj_bounds)
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        # There has been a name change and this try/except
        # is meant to provide backwards compatibility:
        # NOTE(review): the try/except keywords around the next two assignments are missing
            lseq = row['input_seqs'][:]
            lseq = row['seqs'][:]
        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))
        # Only keep sequences without indels and stop codons and minimum length amino acid length:
        ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i]  <-- No indels
        ### stop_seq[i]  <-- No partis annotated stops (there seems still to be stops after these are removed though)
        ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)  <-- Checks whether the sequence is long enough or has stop codons
        keep_idx = [1 if ((ir_lseq[i] == '' or lseq[i] == ir_lseq[i]) and stop_seq[i] is False and repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)) else 0 for i in range(len(lseq))]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep == 1]
        # Exclude small clonal families:
        # NOTE(review): this `if` has no body
        if len(lseq) < MIN_OBS:
            # print(len(lseq))
            # print('Fourth skip')
        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
#        mut_freqs = [s for s, keep in zip(row['mut_freqs'], keep_idx) if keep == 1]
#        print(row['n_mutations'].split(':'))
        Nmuts = [int(s) for s, keep in zip(row['n_mutations'].split(':'), keep_idx) if keep == 1]
        abundance = [len(d) for d, keep in zip(uids, keep_idx) if keep == 1]
        uids = [s for s, keep in zip(uids, keep_idx) if keep == 1]
        assert(len(Nmuts) == len(lseq))
        assert(len(abundance) == len(lseq))
        assert(len(uids) == len(lseq))
#        assert(len(mut_freqs) == len(lseq))
        # Convert frequency to counts and throw out info for discarded sequences:
#        Nmuts = [int(round(float(t[0])*len(t[1].strip('N')))) for i, t in enumerate(zip(mut_freqs, lseq))]

        # Deduplicate AAseqs and lseq according to the AA deduplication:
        lAAseq_dict = dict()
        lAAseq_sort = dict()
        lseq_dedup = list()
        # Group (index, repaired DNA sequence, abundance) tuples by amino acid sequence.
        # NOTE(review): the `else:` before the second assignment is missing
        for i, aa in enumerate(lAAseq):
            if aa in lAAseq_sort:
                lAAseq_sort[aa].append((i, repair_seq(lseq[i][:], naiveDNA[:], vj_bounds), abundance[i]))
                lAAseq_sort[aa] = [(i, repair_seq(lseq[i][:], naiveDNA[:], vj_bounds), abundance[i])]

        # <ab_seq> is the DNA sequence with the highest abundance among those sharing an amino acid sequence.
        # NOTE(review): nothing visible adds <ab_seq> to <lseq_dedup>, and <s> is unused
        for i, aa in enumerate(lAAseq_sort):
            lAAseq_dict[aa] = [t[0] for t in lAAseq_sort[aa]]
            s = sorted(lAAseq_sort[aa], )
            ab_seq = sorted(lAAseq_sort[aa], key=lambda x: x[2], reverse=True)[0][1]

        assert(len(lAAseq_dict) == len(lseq_dedup))
        # Make the deduplicated list and take the mutation rates,
        #  as the mutation rate for the deduplicated sequence:
        lAAseq_dedup = list()
        Nmuts_dedup = list()
        abundance_dedup = list()
        # NOTE(review): the statements that fill the three *_dedup lists from these per-group lists are missing
        for aa, idxs in lAAseq_dict.items():
            Nmut_list = [float(Nmuts[i]) for i in idxs]
            abundance_list = [abundance[i] for i in idxs]
        assert(len(lAAseq_dedup) == len(Nmuts_dedup))
        assert(len(lAAseq_dedup) == len(abundance_dedup))
        assert(len(lAAseq_dedup) == len(lseq_dedup))

        # Exclude small clonal families:
        # NOTE(review): this `if` has no body
        if len(lAAseq_dedup) < MIN_OBS:
            # print(len(lseq))
            # print('Fourth skip')
        # Isotype of every uid, in the same nested layout as <uids>
        iso_list = [[uid2iso[u] for u in ul] for ul in uids]
        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts[:], 'abundance': abundance[:],
                       'AAseqs': lAAseq[:], 'DNAseqs': lseq[:], 'UID': uids[:], 'isotype': iso_list[:],
                       'CDR3_start': cdr3_bounds[0], 'CDR3_end': cdr3_bounds[1]})

    return(sequences_i, info_i)
Esempio n. 27
# Post-process one partis cluster annotation: add implicit info, narrow the set of sequence indices
# to keep through the filters selected in <args>, record the before/after counts on <cluster_line>,
# restrict the annotation to the kept sequences, and return the result of merge().
#
# NOTE(review): lines appear to be missing from this function, so it does not parse as written
# (several calls and dict literals are left unclosed, some arguments are gone, and the docstring
# text inside the largest_cluster_across_partitions branch has lost its quotes). The comments
# added here only describe what is visible; the missing lines need to be restored from the original.
def process_cluster(args, cluster_line, seed_id, glfo):
    utils.add_implicit_info(glfo, cluster_line)

    # only check the seed for indels when neither indel-related option is set
    if (seed_id is not None and not args.match_indel_in_uid
            and not args.ignore_seed_indels):
        check_seed_for_indels(cluster_line, seed_id, args.partition_file)
    # assume we want all seqs in cluster
    iseqs_to_keep = set(range(len(cluster_line["input_seqs"])))
    # write out matching indel-containing seqs for visualization if --show-indel-in-trees
    if args.show_indel_in_trees:
        matching_iseqs = set(
            match_indel_in_uid_seq(cluster_line, args.show_indel_in_trees))
        # NOTE(review): the closing brace of this dict is missing
        match_info = {
            "indel_match": [iseq in matching_iseqs for iseq in iseqs_to_keep]
    # various cases where we downsample cluster sequences
    if args.match_indel_in_uid:
        iseqs_to_keep = iseqs_to_keep & set(
            match_indel_in_uid_seq(cluster_line, args.match_indel_in_uid))
    if args.largest_cluster_across_partitions:
        # NOTE(review): the next three lines are docstring/comment text that has lost its delimiters
        Deduplicate sequence records. When using largest_cluster_across_partitions for seeded clusters, we may end up with duplicate sequences in 
        these clusters because of how partis partitions seed clusters. If this option used, beware that this deduplication pays no respect to which 
        duplicate record is preserved of two with the same unique id.
        # keying by unique id means a later duplicate overwrites an earlier one
        # NOTE(review): the end of this expression is missing
        iseqs_to_keep = iseqs_to_keep & set({
            unique_id: iseq
            for iseq, unique_id in enumerate(cluster_line["unique_ids"])
    if args.remove_frameshifts or args.remove_stops or args.remove_mutated_invariants:
        iseqs_to_keep = iseqs_to_keep & set(apply_filters(args, cluster_line))
    # apply merging of multiplicity info here (or flesh out with default values otherwise)
    # NOTE(review): the remaining arguments of this call are missing
    multiplicity_seqmeta = get_multiplicity_seqmeta(cluster_line,

    # apply sequence downsampling here
    cluster_line["unique_seqs_count"] = len(
        iseqs_to_keep)  # total in cluster output from partis
    # NOTE(review): <always_include> is not used in the visible code; the downsampling call below is missing its head and tail
    always_include = set(args.always_include + [args.inferred_naive_name])
    if args.max_sequences:
        iseqs_to_keep = iseqs_to_keep & set(
                cluster_line, multiplicity_seqmeta, args.max_sequences,
    cluster_line["sampled_seqs_count"] = len(iseqs_to_keep)

    # filter cluster line to iseqs_to_keep
    utils.restrict_to_iseqs(cluster_line, iseqs_to_keep, glfo)

    # add the additional info computed in above for the iseqs we care about
    # NOTE(review): the remaining arguments of both add_additional_info() calls are missing
    cluster_line = add_additional_info(cluster_line, multiplicity_seqmeta,
    if args.show_indel_in_trees:
        cluster_line = add_additional_info(cluster_line, match_info,

    # NOTE(review): the argument of sum() is missing
    cluster_line["total_read_count"] = sum(
    )  # total reads accounting for multiplicity (must be calculated after subsetting cluster in restrict_to_iseqs if it should correspond to total reads represented by subset of cluster returned by restrict_to_iseqs)
    # this needs to happen after restrict_to_iseqs re-adds implicit partis linekeys including 'regional_bounds'
    cluster_line, regional_bounds_keys = add_regional_bounds(cluster_line)
    # NOTE(review): most of the merge() arguments and its closing parenthesis are missing
    return merge(
            regional_bounds_keys + [
        get_cluster_meta_dict(cluster_line, seed_id, args),
Esempio n. 28
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, glfo=None, simglfo=None):
    """ Return sequence info read from a csv/tsv or fasta/fastq file.

    Arguments:
        infname: input file; the extension picks the parser (.csv/.tsv via csv.DictReader, .fa/.fasta/.fq/.fastq via SeqIO.parse)
        is_data: if False, every row must also carry simulation info (a 'v_gene' column), which is returned in <reco_info>
        n_max_queries: stop after this many sequences (values <= 0 mean no limit)
        args: optional command line namespace; the attributes read here are queries, reco_ids, abbreviate, istartstop,
            name_column, seq_column, seed_unique_id, seed_seq and random_seed_seq. NOTE <args.seed_unique_id> gets *set* here
            if it was None and either --seed-seq or --random-seed-seq was specified.
        glfo: only used to decide whether to print the "not adding implicit info" warning
        simglfo: germline info passed to utils.add_implicit_info() for simulation rows (skipped if None)

    Returns:
        (input_info, reco_info): <input_info> is an OrderedDict mapping each uid to {'unique_ids' : [uid], 'seqs' : [seq]};
        <reco_info> is None if <is_data>, otherwise an OrderedDict mapping each uid to a deep copy of its row.

    Raises:
        Exception: unhandled file extension, missing sequence column, multi-seq rows, duplicate uid, unexpected character in
        a sequence, missing simulation info, queries/reco ids/seed not found (or inconsistent seed arguments), --istartstop
        past the end of the file, or no sequences read.
    """

    if not is_data and glfo is None:  # NOTE(review): the warning tests <glfo>, but adding implicit info (below) depends on <simglfo> -- confirm which one is intended
        print('  WARNING glfo is None, so not adding implicit info')

    suffix = os.path.splitext(infname)[1]
    if len(re.findall(r'\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:  # the regex matched, but it's something like '.csvx'
            assert False
        seqfile = opener('r')(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        if suffix == '.fasta' or suffix == '.fa':
            ftype = 'fasta'
        elif suffix == '.fastq' or suffix == '.fq':
            ftype = 'fastq'
        else:
            raise Exception('couldn\'t handle file extension for %s' % infname)
        # build a list of dicts that looks like what csv.DictReader would have given us
        reader = []
        n_fasta_queries = 0
        already_printed_forbidden_character_warning = False
        for seq_record in SeqIO.parse(infname, ftype):

            # if command line specified query or reco ids, skip other ones (can't have/don't allow simulation info in a fast[aq])
            if args is not None and args.queries is not None and seq_record.name not in args.queries:
                continue

            uid = seq_record.name
            if any(fc in uid for fc in utils.forbidden_characters):
                if not already_printed_forbidden_character_warning:
                    print('  %s: found a forbidden character (one of %s) in sequence id \'%s\'. This means we\'ll be replacing each of these forbidden characters with a single letter from their name (in this case %s). If this will cause problems you should replace the characters with something else beforehand.' % (utils.color('yellow', 'warning'), ' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid, uid.translate(utils.forbidden_character_translations)))
                    already_printed_forbidden_character_warning = True
                uid = uid.translate(utils.forbidden_character_translations)

            reader.append({'unique_ids' : uid, 'input_seqs' : str(seq_record.seq).upper()})
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:  # only look at lines in the half-open range [istart, istop)
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print('  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning')))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seqs' not in line and 'seq' not in line:
            raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
        # Up to here each row holds plain strings, but everything below treats 'unique_ids' and 'input_seqs' as lists, so the row has to be converted first.
        # NOTE(review): this call was reconstructed (it's absent from the damaged source) -- confirm against upstream
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        inseq = line['input_seqs'][0]

        # NOTE forbidden characters are only handled in the .fa loop above, since otherwise we have no way of knowing how to interpret special characters... nevertheless if someone passes in a csv with special characters as part of a uid this will break
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))

        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:  # python 2 str.translate(): delete every allowed character, so anything that's left is unexpected
            raise Exception('unexpected character (not among %s) in input sequence with id %s:\n  %s' % (utils.nukes + utils.ambiguous_bases, uid, inseq))

        input_info[uid] = {'unique_ids' : [uid, ], 'seqs' : [inseq, ]}

        if n_queries_added == 0 and is_data and 'v_gene' in line:
            print('  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname)

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            break

    if args is not None:
        if args.istartstop is not None:
            n_lines_in_file = iline + 1
            if n_lines_in_file < args.istartstop[1]:
                raise Exception('--istartstop upper bound %d larger than number of lines in file %d' % (args.istartstop[1], n_lines_in_file))
        if len(input_info) == 0:
            if args.queries is not None:
                raise Exception('didn\'t find the specified --queries (%s) in %s' % (str(args.queries), infname))
            if args.reco_ids is not None:
                raise Exception('didn\'t find the specified --reco-ids (%s) in %s' % (str(args.reco_ids), infname))
        if args.queries is not None:
            missing_queries = set(args.queries) - set(input_info)
            extra_queries = set(input_info) - set(args.queries)  # this is just checking for a bug in the code just above here...
            if len(missing_queries) > 0:
                raise Exception('didn\'t find some of the specified --queries: %s' % ' '.join(missing_queries))
            if len(extra_queries) > 0:
                raise Exception('extracted uids %s that weren\'t specified with --queries' % ' '.join(extra_queries))
        if args.seed_unique_id is not None:
            if found_seed:  # the seed is in the file, so we shouldn't also have been given its sequence
                if args.seed_seq is not None:
                    raise Exception('--seed-seq was specified, but --seed-unique-id was also present in input file')
            else:  # the seed isn't in the file, so we need --seed-seq in order to add it
                if args.seed_seq is None:
                    raise Exception('couldn\'t find seed unique id %s in %s' % (args.seed_unique_id, infname))
                add_seed_seq(args, input_info, reco_info, is_data)
        elif args.seed_seq is not None:
            args.seed_unique_id = 'seed-seq'
            add_seed_seq(args, input_info, reco_info, is_data)
        elif args.random_seed_seq:  # already checked (in bin/partis) that other seed args aren't set
            args.seed_unique_id = random.choice(list(input_info))  # list() so this doesn't depend on keys() returning a list
            print('    chose random seed unique id %s' % args.seed_unique_id)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info