Ejemplo n.º 1
0
 def add_shm_indels(self, reco_event):
     """Randomly add SHM indels to the sequences in <reco_event>.final_seqs (modified in place).

     Every sequence first gets a fresh, empty indel info in <reco_event>.indelfos.
     Each sequence is then skipped whenever a uniform [0, 1) draw exceeds
     <self.args.indel_frequency>; otherwise the number of indels is drawn from a
     geometric distribution with p = 1 / <self.args.mean_indels_per_indeld_seq>,
     and they are applied one at a time with indelutils.add_single_indel().

     NOTE that it will eventually make sense to add shared indel mutation according to the
     chosen tree -- i.e., probably, with some probability apply an indel instead of a point mutation
     """
     args = self.args
     n_seqs = len(reco_event.final_seqs)

     if args.debug and args.indel_frequency > 0.:
         print('      indels')

     reco_event.indelfos = [indelutils.get_empty_indel() for _ in range(n_seqs)]

     if args.indel_frequency == 0.:  # no indels at all (so don't draw any random numbers, either)
         return

     for iseq in range(n_seqs):
         if numpy.random.uniform(0, 1) > args.indel_frequency:  # no indels for this sequence
             if args.debug:
                 print('        0')
             continue

         indelfo = reco_event.indelfos[iseq]
         # set the original sequence (i.e. with all the indels reversed)
         indelfo['reversed_seq'] = reco_event.final_seqs[iseq]

         n_indels = numpy.random.geometric(1. / args.mean_indels_per_indeld_seq)
         if args.debug:
             print('        %d' % n_indels)

         for _ in range(n_indels):
             # NOTE (from the original author) this call modifies <indelfo> and the codon positions
             reco_event.final_seqs[iseq] = indelutils.add_single_indel(
                 reco_event.final_seqs[iseq],
                 indelfo,
                 args.mean_indel_length,
                 reco_event.final_codon_positions[iseq],
                 indel_location=args.indel_location,
                 debug=args.debug)
Ejemplo n.º 2
0
 def add_shm_indels(self, reco_event):
     """Randomly add SHM indels to the sequences in <reco_event>.final_seqs (modified in place).

     Every sequence first gets a fresh, empty indel info in <reco_event>.indelfos.
     Each sequence is then skipped whenever a uniform [0, 1) draw exceeds
     <self.args.indel_frequency>; otherwise the number of indels is chosen with
     numpy.random.choice() from <self.args.n_indels_per_indeld_seq>, and
     indelutils.add_indels() returns the new sequence and its indel info, which
     replace the corresponding entries in <reco_event>.

     NOTE that it will eventually make sense to add shared indel mutation according to the
     chosen tree -- i.e., probably, with some probability apply an indel instead of a point mutation
     """
     args = self.args
     n_seqs = len(reco_event.final_seqs)

     if args.debug and args.indel_frequency > 0.:
         print('      indels')

     reco_event.indelfos = [indelutils.get_empty_indel() for _ in range(n_seqs)]

     if args.indel_frequency == 0.:  # no indels at all (so don't draw any random numbers, either)
         return

     for iseq in range(n_seqs):
         if numpy.random.uniform(0, 1) > args.indel_frequency:  # no indels for this sequence
             if args.debug:
                 print('        0')
             continue

         n_indels = numpy.random.choice(args.n_indels_per_indeld_seq)
         # NOTE (from the original author) this call modifies <indelfo> and <codon_positions>
         new_seq, new_indelfo = indelutils.add_indels(
             n_indels,
             reco_event.final_seqs[iseq],
             reco_event.recombined_seq,
             args.mean_indel_length,
             reco_event.final_codon_positions[iseq],
             indel_location=args.indel_location,
             dbg_pad=8,
             debug=args.debug)

         reco_event.final_seqs[iseq] = new_seq
         # attach this event's gene for each region to the indel info
         new_indelfo['genes'] = dict((region, reco_event.genes[region]) for region in utils.regions)
         reco_event.indelfos[iseq] = new_indelfo
Ejemplo n.º 3
0
    def try_scratch_erode_insert(self, tmpline, debug=False):
        """Fill <tmpline> with randomly drawn erosion lengths and insertions, then re-add its implicit info.

        Modifies <tmpline> in place (returns nothing):
          - removes all implicit info with utils.remove_all_implicit_info()
          - sets '<erosion>_del' for each erosion in utils.real_erosions
          - sets '<bound>_insertion' for each boundary in utils.boundaries
          - sets 'seqs', 'unique_ids', 'input_seqs', and 'indelfos' by hand, then calls
            utils.add_implicit_info()

        Erosion and insertion lengths are geometric draws minus one (numpy's geometric
        distribution starts at 1, so subtracting one allows zero length).

        <debug>: if set, print the chosen erosions and insertions.

        Raises ValueError if an erosion name for a region with a conserved codon contains
        neither '3p' nor '5p' (previously this gave an unbound, or silently stale, local variable).
        """
        utils.remove_all_implicit_info(tmpline)
        for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
            region = erosion[0]
            gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
            if region == 'd' and not utils.has_d_gene(self.args.locus):  # dummy d genes: always erode the whole thing from the left
                assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.locus]
                tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0
                continue

            # heuristic; explicit floor division so the result stays an integer (it can end up as a slice bound below) even under true division
            max_erosion = max(0, gene_length // 2 - 2)
            if region in utils.conserved_codons[self.args.locus]:  # make sure not to erode a conserved codon
                codon_pos = utils.cdn_pos(self.glfo, region, tmpline[region + '_gene'])
                if '3p' in erosion:
                    n_bases_to_codon = gene_length - codon_pos - 3
                elif '5p' in erosion:
                    n_bases_to_codon = codon_pos
                else:  # don't fall through with <n_bases_to_codon> unset (or left over from the previous erosion)
                    raise ValueError('expected \'3p\' or \'5p\' in erosion name \'%s\'' % erosion)
                max_erosion = min(max_erosion, n_bases_to_codon)
            tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)

        for bound in utils.boundaries:
            mean_length = utils.scratch_mean_insertion_lengths[self.args.locus][bound]
            length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
            probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]  # same order as <utils.nukes>, which is what we choose from
            tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

        if debug:
            print('    erosions:  %s' % ('   '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions])))
            print('    insertions:  %s' % ('   '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries])))

        # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
        gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
        for erosion in utils.real_erosions:  # trim each germline sequence by the erosion lengths chosen above
            region = erosion[0]
            e_length = tmpline[erosion + '_del']
            if '5p' in erosion:
                gl_seqs[region] = gl_seqs[region][e_length:]
            elif '3p' in erosion:
                gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
        tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
        tmpline['unique_ids'] = [None]  # this is kind of hackey, but some things in the implicit info adder use it to get the number of sequences
        tmpline['input_seqs'] = copy.deepcopy(tmpline['seqs'])  # NOTE has to be updated _immediately_ so seqs and input_seqs don't get out of sync
        tmpline['indelfos'] = [indelutils.get_empty_indel(), ]
        utils.add_implicit_info(self.glfo, tmpline)
        assert len(tmpline['in_frames']) == 1
Ejemplo n.º 4
0
    def add_mutants(self, reco_event, irandom):
        """Simulate the mutated (leaf) sequences for <reco_event> along a randomly chosen tree.

        Appends one sequence per leaf to <reco_event>.final_seqs (and a copy of the post-erosion
        codon positions to <reco_event>.final_codon_positions), adds shm indels, sets the
        event's line with <irandom>, and checks the tree simulation. <irandom> is also passed
        as the seed when preparing each bppseqgen command.

        If <self.args.mutation_multiplier> is exactly zero, the unmutated recombined sequence is
        used as the single final sequence and we return straight away.
        """
        mute_mult = self.args.mutation_multiplier
        if mute_mult is not None and mute_mult == 0.:  # some of the stuff below fails if mut mult is actually 0.
            reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequence in reco_event
            reco_event.indelfos = [indelutils.get_empty_indel() for _ in reco_event.final_seqs]
            return

        # When generating trees, each tree's number of leaves and total depth are chosen from the specified distributions (a.t.m., by default n-leaves is from a geometric/zipf, and depth is from data)
        # This chosen depth corresponds to the sequence-wide mutation frequency.
        # In order to account for varying mutation rates in v, d, and j we simulate these regions separately, by appropriately rescaling the tree for each region.
        # i.e.: here we get the sequence-wide mute freq from the tree, and rescale it by the repertoire-wide ratios from data (which are stored in the tree file).
        # looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
        # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the same for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
        itree = random.randint(0, len(self.treeinfo) - 1)
        treefostr = self.treeinfo[itree]  # per-region mutation info is tacked on after the tree... sigh. kind of hackey but works ok.
        assert treefostr.count(';') == 1
        tree_end = treefostr.find(';') + 1
        chosen_tree = treefostr[:tree_end]  # includes semi-colon
        region_strs = treefostr[tree_end:].split(',')
        mean_total_height = treegenerator.get_mean_height(chosen_tree)

        regional_heights = {}  # per-region height, including <self.args.mutation_multiplier>
        for region_str in region_strs:
            region, ratio_str = region_str.split(':')
            assert region in utils.regions
            ratio = float(ratio_str)
            if mute_mult is not None:  # multiply the branch lengths by some factor
                ratio *= mute_mult
            regional_heights[region] = mean_total_height * ratio

        scaled_trees = {}
        for region in utils.regions:
            scaled_trees[region] = treegenerator.rescale_tree(chosen_tree, regional_heights[region])

        if self.args.debug:
            print('  chose tree with total height %f' % treegenerator.get_mean_height(chosen_tree))
            height_strs = [
                '%s %.3f  (expected %.3f)' % (r, treegenerator.get_mean_height(scaled_trees[r]), regional_heights[r])
                for r in utils.regions
            ]
            print('    regional trees rescaled to heights:  %s' % ('   '.join(height_strs)))
            print(treegenerator.get_ascii_tree(chosen_tree, extra_str='    '))

        n_leaves = treegenerator.get_n_leaves(chosen_tree)

        # one command info per region, in the same order as <utils.regions> (None if there's nothing to run for that region)
        cmdfos = []
        for region in utils.regions:
            simstr = reco_event.eroded_seqs[region]
            if region == 'd':  # the insertions get simulated together with d
                simstr = reco_event.insertions['vd'] + simstr + reco_event.insertions['dj']
            cmdfo = self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaves, reco_event.genes[region], reco_event, seed=irandom)
            cmdfos.append(cmdfo)

        runnable_cmdfos = [cfo for cfo in cmdfos if cfo is not None]  # shenanigan is to handle zero-length regional seqs
        utils.run_cmds(runnable_cmdfos, sleep=False)

        mseqs = {}
        for region, cmdfo in zip(utils.regions, cmdfos):  # NOTE kind of sketchy relying on <cmdfos> being ordered like <utils.regions> (although it just depends on the loop immediately above a.t.m.)
            if cmdfo is None:
                mseqs[region] = [''] * n_leaves  # an empty string for each leaf node
            else:
                mseqs[region] = self.read_bppseqgen_output(cmdfo, n_leaves)

        assert len(reco_event.final_seqs) == 0
        for ileaf in range(n_leaves):
            leaf_seq = mseqs['v'][ileaf] + mseqs['d'][ileaf] + mseqs['j'][ileaf]
            # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
            leaf_seq = reco_event.revert_conserved_codons(leaf_seq, debug=self.args.debug)
            reco_event.final_seqs.append(leaf_seq)  # set final sequence in reco_event
            # separate codon positions for each sequence, because of shm indels
            reco_event.final_codon_positions.append(copy.deepcopy(reco_event.post_erosion_codon_positions))

        self.add_shm_indels(reco_event)

        # set the line here because we use it when checking tree simulation, and want to make sure the uids are always set at the same point in the workflow
        reco_event.setline(irandom)

        self.check_tree_simulation(mean_total_height, regional_heights, scaled_trees, mseqs, reco_event)

        if self.args.debug:
            utils.print_reco_event(reco_event.line, extra_str='    ')