Example 1

    def check(self):
        total = 0.0
        for _, prob in self.transitions.iteritems():
            assert prob >= 0.0
            total += prob
        assert utils.is_normed(total)

        if self.name == 'init':  # no emissions for 'init' state
            return

        if self.emissions is not None:
            total = 0.0
            for _, prob in self.emissions['probs'].iteritems():
                assert prob >= 0.0
                total += prob
            assert utils.is_normed(total)

        if self.pair_emissions is not None:
            total = 0.0
            for letter1 in self.pair_emissions['probs']:
                for _, prob in self.pair_emissions['probs'][letter1].iteritems():
                    assert prob >= 0.0
                    total += prob
            assert utils.is_normed(total)
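
A note on the helper every one of these examples leans on: utils.is_normed checks that a total (or a container of probabilities) sums to 1 within a tolerance. The partis source for it isn't reproduced on this page, so the following is only a minimal sketch of its assumed behavior, including the this_eps override seen in several calls below.

    def is_normed(prob, this_eps=1e-8):
        # assumed behavior: accept a number, or a dict/list of probabilities,
        # and check that the (summed) total is within this_eps of 1.0
        if hasattr(prob, 'keys'):
            total = sum(prob.values())
        elif hasattr(prob, '__iter__'):
            total = sum(prob)
        else:
            total = prob
        return abs(total - 1.0) < this_eps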
Example 2
    def read_mute_freqs(self, parameter_dir):
        # NOTE these are mute freqs, not branch lengths, but it's ok for now
        branch_lengths = {}
        for mtype in ['all',] + utils.regions:
            branch_lengths[mtype] = {n: [] for n in ('lengths', 'probs')}
            mutehist = self.get_mute_hist(mtype, parameter_dir)
            branch_lengths[mtype]['mean'] = mutehist.get_mean()

            mutehist.normalize(include_overflows=False, expect_overflows=True)  # if it was written with overflows included, it'll need to be renormalized
            check_sum = 0.0
            for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
                freq = mutehist.get_bin_centers()[ibin]
                branch_length = self.convert_observed_changes_to_branch_length(float(freq))
                prob = mutehist.bin_contents[ibin]
                branch_lengths[mtype]['lengths'].append(branch_length)
                branch_lengths[mtype]['probs'].append(prob)
                check_sum += branch_lengths[mtype]['probs'][-1]
            if not utils.is_normed(check_sum):
                raise Exception('not normalized %f' % check_sum)

        return branch_lengths
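
The parallel 'lengths' and 'probs' lists returned here form a discrete distribution over branch lengths. The original caller isn't shown in this example, but a draw from that distribution would presumably look something like this (the numpy usage is an assumption, not partis code):

    import numpy

    def choose_branch_length(branch_lengths, mtype='all'):
        # pick one branch length with probability given by the normalized mute-freq histogram
        return numpy.random.choice(branch_lengths[mtype]['lengths'], p=branch_lengths[mtype]['probs'])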
Example 3
    def get_rescaled_trees(self, treestr, branch_length_ratios, debug=False):
        """
        Trees are generated with the mean branch length observed in data over the whole sequence, because we want to use topologically
        the same tree for the whole sequence. But we observe different branch lengths for each region, so we need to rescale the tree for
        v, d, and j
        """
        rescaled_trees = {}
        if debug:
            print '      rescaling tree:'
        for region in utils.regions:
            # rescale the tree
            rescaled_trees[region] = treegenerator.rescale_tree(treestr, branch_length_ratios[region])
            if debug:
                print '         %s by %f (new depth %f): %s -> %s' % (region, branch_length_ratios[region], treegenerator.get_leaf_node_depths(rescaled_trees[region])['t1'], treestr, rescaled_trees[region])

            # and then check it NOTE can remove this eventually
            initial_depths = {}
            for node, depth in treegenerator.get_leaf_node_depths(treestr).items():
                initial_depths[node] = depth
            for node, depth in treegenerator.get_leaf_node_depths(rescaled_trees[region]).items():
                depth_ratio = depth / initial_depths[node]
                assert utils.is_normed(depth_ratio / branch_length_ratios[region], this_eps=1e-6)
        return rescaled_trees

Example 4

    def read_insertion_content(self, insertion):
        self.insertion_content_probs[insertion] = {}
        if self.args.insertion_base_content:
            with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    self.insertion_content_probs[insertion][line[insertion + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])
                for nuke in utils.nukes:
                    if nuke not in self.insertion_content_probs[insertion]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        self.insertion_content_probs[insertion][nuke] = 0
                    self.insertion_content_probs[insertion][nuke] /= float(total)
        else:
            self.insertion_content_probs[insertion] = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

        assert utils.is_normed(self.insertion_content_probs[insertion])
        if self.args.debug:
            print '  insertion content for', insertion, self.insertion_content_probs[insertion]
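
The normalized content probs read here are per-nucleotide weights used later when choosing insertion bases. That downstream draw isn't part of this example, but a minimal sketch of it might read:

    import random

    def choose_insertion_base(insertion_content_probs, insertion):
        # weighted draw over utils.nukes using the normalized content probs
        rand = random.random()
        cumulative = 0.0
        for nuke, prob in insertion_content_probs[insertion].items():
            cumulative += prob
            if rand < cumulative:
                return nuke
        return nuke  # guard against float round-off at the top of the cumulative sum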

Example 5

    def read_vdj_version_freqs(self, fname):
        """ Read the frequencies at which various VDJ combinations appeared in data """
        with opener('r')(fname) as infile:
            in_data = csv.DictReader(infile)
            total = 0.0
            for line in in_data:
                # NOTE do *not* assume the file is sorted
                #
                # if int(line['cdr3_length']) == -1:
                #     continue  # couldn't find conserved codons when we were inferring things
                if self.args.only_genes is not None:  # are we restricting ourselves to a subset of genes?
                    if line['v_gene'] not in self.args.only_genes:
                        continue  # oops, don't change this to a loop, 'cause you won't continue out of the right thing then
                    if line['d_gene'] not in self.args.only_genes: continue
                    if line['j_gene'] not in self.args.only_genes: continue
                total += float(line['count'])
                index = tuple(line[column] for column in utils.index_columns)
                assert index not in self.version_freq_table
                self.version_freq_table[index] = float(line['count'])

        if len(self.version_freq_table) == 0:
            print 'ERROR didn\'t find any matching gene combinations'
            assert False

        # then normalize
        test_total = 0.0
        for index in self.version_freq_table:
            self.version_freq_table[index] /= total
            test_total += self.version_freq_table[index]
        assert utils.is_normed(test_total, this_eps=1e-8)
        assert len(self.version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
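
The 1e8 bound is about cumulative sampling: choosing an entry by accumulating normalized frequencies against a uniform draw degrades once individual entries approach double-precision round-off. choose_vdj_combo() itself isn't among these examples, but that style of sampler looks roughly like this (a sketch, not the partis implementation):

    import random

    def choose_from_freq_table(version_freq_table):
        # walk the table, accumulating probability mass until we pass a uniform
        # draw; with ~1e8+ entries the individual probs get small enough that
        # the accumulation can fail to reach the draw before the table runs out
        rand = random.random()
        cumulative = 0.0
        for index, prob in version_freq_table.items():
            cumulative += prob
            if rand < cumulative:
                return index
        raise Exception('fell off the end of the freq table (numerical underflow?)')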
Example 6
    def read_vdj_version_freqs(self):
        """ Read the frequencies at which various VDJ combinations appeared in data """
        if self.args.rearrange_from_scratch:
            return None

        version_freq_table = {}
        with open(self.reco_parameter_dir + '/' + utils.get_parameter_fname('all', 'r')) as infile:
            in_data = csv.DictReader(infile)
            total = 0.0
            for line in in_data:  # NOTE do *not* assume the file is sorted
                skip = False
                for region in utils.regions:
                    if line[region + '_gene'] not in self.glfo['seqs'][region]:
                        skip = True
                        break
                if skip:
                    continue
                total += float(line['count'])
                index = self.freqtable_index(line)
                assert index not in version_freq_table
                version_freq_table[index] = float(line['count'])

        if len(version_freq_table) == 0:
            raise Exception('didn\'t find any gene combinations in %s' % (self.reco_parameter_dir + '/' + utils.get_parameter_fname('all', 'r')))

        # then normalize
        test_total = 0.0
        for index in version_freq_table:
            version_freq_table[index] /= total
            test_total += version_freq_table[index]
        assert utils.is_normed(test_total, this_eps=1e-8)
        assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
        return version_freq_table
Example 7
    def read_insertion_content(self, insertion):
        icontentprobs = {}  # NOTE this is only the probs for <insertion>, even though name is the same as in the previous function
        if insertion in utils.boundaries:  # i.e. if it's a real insertion
            with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    icontentprobs[line[insertion + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])

                if total == 0. and self.debug:
                    print '\n    WARNING zero insertion content probs read from %s, so setting to uniform distribution' % (self.indir + '/' + insertion + '_insertion_content.csv')
                for nuke in utils.nukes:
                    if total == 0.:
                        icontentprobs[nuke] = 1. / len(utils.nukes)
                    else:
                        if nuke not in icontentprobs:
                            print '    %s not in insertion content probs, adding with zero' % nuke
                            icontentprobs[nuke] = 0
                        icontentprobs[nuke] /= float(total)
        else:  # just return uniform probs for effective (fv and jf) insertions
            icontentprobs = {n : 0.25 for n in utils.nukes}

        assert utils.is_normed(icontentprobs)

        return icontentprobs
Example 8
    def read_mute_freqs(self, parameter_dir):
        # NOTE these are mute freqs, not branch lengths, but it's ok for now
        branch_lengths = {}
        for mtype in ['all',] + utils.regions:
            branch_lengths[mtype] = {n : [] for n in ('lengths', 'probs')}
            mutehist = self.get_mute_hist(mtype, parameter_dir)
            branch_lengths[mtype]['mean'] = mutehist.get_mean()

            mutehist.normalize(include_overflows=False, expect_overflows=True)  # if it was written with overflows included, it'll need to be renormalized
            check_sum = 0.0
            for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
                freq = mutehist.get_bin_centers()[ibin]
                branch_length = self.convert_observed_changes_to_branch_length(float(freq))
                prob = mutehist.bin_contents[ibin]
                branch_lengths[mtype]['lengths'].append(branch_length)
                branch_lengths[mtype]['probs'].append(prob)
                check_sum += branch_lengths[mtype]['probs'][-1]
            if not utils.is_normed(check_sum):
                raise Exception('not normalized %f' % check_sum)

        if self.args.debug:
            print '  mean branch lengths'
            for mtype in ['all',] + utils.regions:
                print '     %4s %7.3f (ratio %7.3f)' % (mtype, branch_lengths[mtype]['mean'], branch_lengths[mtype]['mean'] / branch_lengths['all']['mean'])

        return branch_lengths
Example 9
    def read_erosion_info(self, this_gene, approved_genes=None):
        # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
        if approved_genes is None:
            approved_genes = [this_gene, ]
        eprobs = {}
        genes_used = set()
        for erosion in utils.all_erosions:
            if erosion[0] != self.region:
                continue
            eprobs[erosion] = {}
            if this_gene == glutils.dummy_d_genes[self.args.locus]:
                eprobs[erosion][0] = 1.  # always erode zero bases
                continue
            deps = utils.column_dependencies[erosion + '_del']
            with open(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps), 'r') as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue
                    # then skip nonsense erosions that're too long for this gene, but were ok for another
                    if int(line[erosion + '_del']) >= len(self.germline_seq):
                        continue

                    # then add in this erosion's counts
                    n_eroded = int(line[erosion + '_del'])
                    if n_eroded not in eprobs[erosion]:
                        eprobs[erosion][n_eroded] = 0.0
                    eprobs[erosion][n_eroded] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            if len(eprobs[erosion]) == 0:
                raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))

            # do some smoothingy things NOTE that we normalize *after* interpolating
            if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
                n_max = self.n_max_to_interpolate
            else:  # for fake erosions, always interpolate
                n_max = -1
            # print '   interpolate erosions'
            interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
            self.add_pseudocounts(eprobs[erosion])

            # and finally, normalize
            total = 0.0
            for _, val in eprobs[erosion].iteritems():
                total += val

            test_total = 0.0
            for n_eroded in eprobs[erosion]:
                eprobs[erosion][n_eroded] /= total
                test_total += eprobs[erosion][n_eroded]
            assert utils.is_normed(test_total)

        if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
            print '    used erosion info from:', ' '.join(genes_used)

        return eprobs
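
add_pseudocounts() isn't included on this page; given that it runs just before normalization, a plausible minimal version (an assumption, not the partis source) would simply put a floor under empty bins so that no erosion length in the observed range ends up with exactly zero probability:

    def add_pseudocounts(erosion_probs, pseudocount=1.0):
        # hypothetical sketch: give every length from 0 up to the largest
        # observed erosion a nonzero count before the probs get normalized
        for n_eroded in range(0, max(erosion_probs) + 1):
            if n_eroded not in erosion_probs or erosion_probs[n_eroded] == 0.0:
                erosion_probs[n_eroded] = pseudocount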
Example 10
    def read_erosion_info(self, this_gene, approved_genes=None):
        # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
        if approved_genes is None:
            approved_genes = [this_gene, ]
        eprobs = {}
        genes_used = set()
        for erosion in utils.real_erosions + utils.effective_erosions:
            if erosion[0] != self.region:
                continue
            eprobs[erosion] = {}
            if this_gene == glutils.dummy_d_genes[self.args.chain]:
                eprobs[erosion][0] = 1.  # always erode zero bases
                continue
            deps = utils.column_dependencies[erosion + '_del']
            with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue
                    # then skip nonsense erosions that're too long for this gene, but were ok for another
                    if int(line[erosion + '_del']) >= len(self.germline_seq):
                        continue

                    # then add in this erosion's counts
                    n_eroded = int(line[erosion + '_del'])
                    if n_eroded not in eprobs[erosion]:
                        eprobs[erosion][n_eroded] = 0.0
                    eprobs[erosion][n_eroded] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            if len(eprobs[erosion]) == 0:
                raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))

            # do some smoothingy things NOTE that we normalize *after* interpolating
            if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
                n_max = self.n_max_to_interpolate
            else:  # for fake erosions, always interpolate
                n_max = -1
            # print '   interpolate erosions'
            interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
            self.add_pseudocounts(eprobs[erosion])

            # and finally, normalize
            total = 0.0
            for _, val in eprobs[erosion].iteritems():
                total += val

            test_total = 0.0
            for n_eroded in eprobs[erosion]:
                eprobs[erosion][n_eroded] /= total
                test_total += eprobs[erosion][n_eroded]
            assert utils.is_normed(test_total)

        if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
            print '    used erosion info from:', ' '.join(genes_used)

        return eprobs
Example 11
    def read_insertion_info(self, this_gene, approved_genes=None):
        if approved_genes is None:  # if we aren't explicitly passed a list of genes to use, we just use the gene for which we're actually writing the hmm
            approved_genes = [this_gene,]

        genes_used = set()
        for insertion in self.insertions:
            self.insertion_probs[insertion] = {}
            deps = utils.column_dependencies[insertion + '_insertion']
            with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue

                    # then add in this insertion's counts
                    n_inserted = int(line[insertion + '_insertion'])
                    if n_inserted not in self.insertion_probs[insertion]:
                        self.insertion_probs[insertion][n_inserted] = 0.0
                    self.insertion_probs[insertion][n_inserted] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            assert len(self.insertion_probs[insertion]) > 0

            # print '   interpolate insertions'
            interpolate_bins(self.insertion_probs[insertion], self.n_max_to_interpolate, bin_eps=self.eps)  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

            if 0 not in self.insertion_probs[insertion] or len(self.insertion_probs[insertion]) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
                if self.args.debug:
                    print '    WARNING adding pseudocount to 1-bin in insertion probs'
                self.insertion_probs[insertion][0] = 1
                self.insertion_probs[insertion][1] = 1
                if self.args.debug:
                    print '      ', self.insertion_probs[insertion]

            assert 0 in self.insertion_probs[insertion] and len(self.insertion_probs[insertion]) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

            # and finally, normalize
            total = 0.0
            for _, val in self.insertion_probs[insertion].iteritems():
                total += val
            test_total = 0.0
            for n_inserted in self.insertion_probs[insertion]:
                self.insertion_probs[insertion][n_inserted] /= total
                test_total += self.insertion_probs[insertion][n_inserted]
            assert utils.is_normed(test_total)

            if 0 not in self.insertion_probs[insertion] or self.insertion_probs[insertion][0] == 1.0:
                print 'ERROR cannot have all or none of the probability mass in the zero bin:', self.insertion_probs[insertion]
                assert False

            # self.insertion_content_probs = {}
            self.read_insertion_content(insertion)  # also read the base content of the insertions

        if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
            if self.args.debug:
                print '    insertions used:', ' '.join(genes_used)
Example 12
    def read_insertion_content(self, insertion):
        self.insertion_content_probs[insertion] = {}
        if insertion in utils.boundaries:  # i.e. if it's a real insertion (for fv and jf insertions we just return uniform probs below)
            with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    self.insertion_content_probs[insertion][line[insertion + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])
                if total == 0.:
                    print '\n    WARNING zero insertion content probs read from %s, so setting to uniform distribution' % (self.indir + '/' + insertion + '_insertion_content.csv')
                for nuke in utils.nukes:
                    if total == 0.:
                        self.insertion_content_probs[insertion][nuke] = 1. / len(utils.nukes)
                    else:
                        if nuke not in self.insertion_content_probs[insertion]:
                            print '    %s not in insertion content probs, adding with zero' % nuke
                            self.insertion_content_probs[insertion][nuke] = 0
                        self.insertion_content_probs[insertion][nuke] /= float(total)
        else:
            self.insertion_content_probs[insertion] = {n : 0.25 for n in utils.nukes}

        assert utils.is_normed(self.insertion_content_probs[insertion])
        if self.args.debug:
            print '  insertion content for', insertion, self.insertion_content_probs[insertion]
Example 13
 def check_tree_lengths(self, treefname, ages):
     treestrs = []
     with opener('r')(treefname) as treefile:
         for line in treefile:
             treestrs.append(line.split(';')[0] + ';')  # ignore the info I added after the ';'
     if self.args.debug > 1:
         print '  checking branch lengths... '
     assert len(treestrs) == len(ages)
     total_length, total_leaves = 0.0, 0
     for itree in range(len(ages)):
         if self.args.debug > 1:
             print '    asked for', ages[itree],
         for name, depth in get_leaf_node_depths(treestrs[itree]).items():
             if self.args.debug > 1:
                 print '%s:%f' % (name, depth),
             if not utils.is_normed(depth / ages[itree], this_eps=1e-6):
                 raise Exception('asked for branch length %f but got %f\n   %s' % (ages[itree], depth, treestrs[itree]))  # ratio of <age> (requested length) and <length> (length in the tree file) should be 1 within float precision
         total_length += ages[itree]
         total_leaves += len(re.findall('t', treestrs[itree]))
         if self.args.debug > 1:
             print ''
     if self.args.debug:
         print '    mean branch length %.5f' % (total_length / len(ages))
         print '    mean n leaves %.2f' % (float(total_leaves) / len(ages))
Example 14
 def normalize(self, include_overflows=True, expect_empty=False, expect_overflows=False, overflow_eps_to_ignore=1e-15):
     sum_value = self.integral(include_overflows)
     imin, imax = self.get_bounds(include_overflows)
     if sum_value == 0.0:
         if not expect_empty:
             print 'WARNING sum zero in Hist::normalize()'
         return
     if not expect_overflows and not include_overflows and (self.bin_contents[0] / sum_value > overflow_eps_to_ignore or self.bin_contents[self.n_bins + 1] / sum_value > overflow_eps_to_ignore):
         print 'WARNING under/overflows in Hist::normalize()'
     for ib in range(imin, imax):
         self.bin_contents[ib] /= sum_value
         if self.sum_weights_squared is not None:
             self.sum_weights_squared[ib] /= sum_value * sum_value
         if self.errors is not None:
             self.errors[ib] /= sum_value
     check_sum = 0.0
     for ib in range(imin, imax):  # check it
         check_sum += self.bin_contents[ib]
     if not is_normed(check_sum, this_eps=1e-10):
         raise Exception('not normalized: %f' % check_sum)
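
integral() and get_bounds() aren't shown in this snippet; from how they're used here (and the bin layout in the other Hist examples, where bin 0 is underflow and bin n_bins + 1 is overflow), their assumed contracts are roughly:

    def get_bounds(self, include_overflows=False):
        # half-open range of bin indices to sum/normalize over
        if include_overflows:
            return 0, self.n_bins + 2
        return 1, self.n_bins + 1

    def integral(self, include_overflows):
        imin, imax = self.get_bounds(include_overflows)
        return sum(self.bin_contents[ib] for ib in range(imin, imax))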
Example 15
    def read_vdj_version_freqs(self, fname):
        """ Read the frequencies at which various VDJ combinations appeared in data """
        with opener('r')(fname) as infile:
            in_data = csv.DictReader(infile)
            total = 0.0
            for line in in_data:
                # NOTE do *not* assume the file is sorted
                #
                # if int(line['cdr3_length']) == -1:
                #     continue  # couldn't find conserved codons when we were inferring things
                if self.args.only_genes is not None:  # are we restricting ourselves to a subset of genes?
                    if line['v_gene'] not in self.args.only_genes:
                        continue
                    if line['d_gene'] not in self.args.only_genes:
                        continue
                    if line['j_gene'] not in self.args.only_genes:
                        continue
                total += float(line['count'])
                index = tuple(line[column] for column in utils.index_columns)
                assert index not in self.version_freq_table
                self.version_freq_table[index] = float(line['count'])

        if len(self.version_freq_table) == 0:
            print 'ERROR didn\'t find any matching gene combinations'
            assert False

        # then normalize
        test_total = 0.0
        for index in self.version_freq_table:
            self.version_freq_table[index] /= total
            test_total += self.version_freq_table[index]
        assert utils.is_normed(test_total, this_eps=1e-8)
        assert len(self.version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
Example 16
 def normalize(self, overflow_warn=True):  # since when you normalize hists you have to make the arbitrary decision whether you're going to include the under/overflow bins (we don't include them here), in general we prefer to avoid having under/overflow entries
     """ NOTE does not multiply/divide by bin widths """
     sum_value = 0.0
     for ib in range(1, self.n_bins + 1):  # don't include under/overflows
         sum_value += self.bin_contents[ib]
     if sum_value == 0.0:
         print 'WARNING sum zero in Hist::normalize(), returning without doing anything'
         return
     # make sure there's not too much stuff in the under/overflows
     if overflow_warn and (self.bin_contents[0] / sum_value > 1e-10 or self.bin_contents[self.n_bins + 1] / sum_value > 1e-10):
         print 'WARNING under/overflows in Hist::normalize()'
     for ib in range(1, self.n_bins + 1):
         self.bin_contents[ib] /= sum_value
         if self.sum_weights_squared is not None:
             self.sum_weights_squared[ib] /= sum_value * sum_value
         if self.errors is not None:
             self.errors[ib] /= sum_value
     check_sum = 0.0
     for ib in range(1, self.n_bins + 1):  # check it
         check_sum += self.bin_contents[ib]
     assert is_normed(check_sum, this_eps=1e-10)
Example 17
    def read_mute_freqs(self, mute_freq_dir):
        # NOTE these are mute freqs, not branch lengths, but it's ok for now
        for mtype in ['all',] + utils.regions:
            infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
            self.branch_lengths[mtype] = {}
            self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype]['probs'] = [], []
            mutehist = Hist(fname=infname)
            self.branch_lengths[mtype]['mean'] = mutehist.get_mean()

            # if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX()+1) > 0.0:
            #     print 'WARNING nonzero under/overflow bins read from %s' % infname
            mutehist.normalize(include_overflows=False, overflow_eps_to_ignore=1e-2)  # if it was written with overflows included, it'll need to be renormalized
            check_sum = 0.0
            for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
                freq = mutehist.get_bin_centers()[ibin]
                branch_length = self.convert_observed_changes_to_branch_length(float(freq))
                prob = mutehist.bin_contents[ibin]
                self.branch_lengths[mtype]['lengths'].append(branch_length)
                self.branch_lengths[mtype]['probs'].append(prob)
                check_sum += self.branch_lengths[mtype]['probs'][-1]
            if not utils.is_normed(check_sum):
                raise Exception('not normalized %f' % check_sum)

        if self.args.debug:
            print '  mean branch lengths'
            for mtype in ['all',] + utils.regions:
                print '     %4s %7.3f (ratio %7.3f)' % (mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean'])

Example 18

    def read_insertion_content(self):
        self.insertion_content_probs = {}
        for bound in utils.boundaries:
            self.insertion_content_probs[bound] = {}
            if self.args.insertion_base_content:
                with opener('r')(self.args.parameter_dir + '/' + bound + '_insertion_content.csv') as icfile:
                    reader = csv.DictReader(icfile)
                    total = 0
                    for line in reader:
                        self.insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
                        total += int(line['count'])
                    for nuke in utils.nukes:
                        if nuke not in self.insertion_content_probs[bound]:
                            print '    %s not in insertion content probs, adding with zero' % nuke
                            self.insertion_content_probs[bound][nuke] = 0
                        self.insertion_content_probs[bound][nuke] /= float(total)
            else:
                self.insertion_content_probs[bound] = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

            assert utils.is_normed(self.insertion_content_probs[bound])
Example 19
    def read_erosion_info(self, this_gene, approved_genes=None):
        # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
        if approved_genes is None:
            approved_genes = [this_gene]
        genes_used = set()
        for erosion in utils.real_erosions + utils.effective_erosions:
            if erosion[0] != self.region:
                continue
            self.erosion_probs[erosion] = {}
            deps = utils.column_dependencies[erosion + "_del"]
            with opener("r")(
                self.indir + "/" + utils.get_parameter_fname(column=erosion + "_del", deps=deps)
            ) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + "_gene" in line and line[self.region + "_gene"] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue
                    # then skip nonsense erosions that're too long for this gene, but were ok for another
                    if int(line[erosion + "_del"]) >= len(self.germline_seq):
                        continue

                    # then add in this erosion's counts
                    n_eroded = int(line[erosion + "_del"])
                    if n_eroded not in self.erosion_probs[erosion]:
                        self.erosion_probs[erosion][n_eroded] = 0.0
                    self.erosion_probs[erosion][n_eroded] += float(line["count"])

                    if self.region + "_gene" in line:
                        genes_used.add(line[self.region + "_gene"])

            assert len(self.erosion_probs[erosion]) > 0

            # do some smoothingy things NOTE that we normalize *after* interpolating
            if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
                n_max = self.n_max_to_interpolate
            else:  # for fake erosions, always interpolate
                n_max = -1
            # print '   interpolate erosions'
            interpolate_bins(self.erosion_probs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
            self.add_pseudocounts(self.erosion_probs[erosion])

            # and finally, normalize
            total = 0.0
            for _, val in self.erosion_probs[erosion].iteritems():
                total += val

            test_total = 0.0
            for n_eroded in self.erosion_probs[erosion]:
                self.erosion_probs[erosion][n_eroded] /= total
                test_total += self.erosion_probs[erosion][n_eroded]
            assert utils.is_normed(test_total)

        if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
            if self.args.debug:
                print "    erosions used:", " ".join(genes_used)
Example 20
    def read_vdj_version_freqs(self):
        """ Read the frequencies at which various VDJ combinations appeared in data """
        if self.args.rearrange_from_scratch:
            return None

        version_freq_table = {}
        with opener('r')(self.parameter_dir + '/' + utils.get_parameter_fname('all')) as infile:
            in_data = csv.DictReader(infile)
            total = 0.0
            for line in in_data:  # NOTE do *not* assume the file is sorted
                skip = False
                for region in utils.regions:
                    if line[region + '_gene'] not in self.glfo['seqs'][region]:
                        skip = True
                        break
                if skip:
                    continue
                total += float(line['count'])
                index = self.freqtable_index(line)
                assert index not in version_freq_table
                version_freq_table[index] = float(line['count'])

        if len(version_freq_table) == 0:
            raise Exception('didn\'t find any gene combinations in %s' % (self.parameter_dir + '/' + utils.get_parameter_fname('all')))

        # then normalize
        test_total = 0.0
        for index in version_freq_table:
            version_freq_table[index] /= total
            test_total += version_freq_table[index]
        assert utils.is_normed(test_total, this_eps=1e-8)
        assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
        return version_freq_table
Example 21
    def read_insertion_content(self, insertion):
        icontentprobs = {}  # NOTE this is only the probs for <insertion>, even though name is the same as in the previous function
        if insertion in utils.boundaries:  # i.e. if it's a real insertion
            with open(self.indir + '/' + insertion + '_insertion_content.csv', 'r') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    icontentprobs[line[insertion + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])

                if total == 0. and self.debug:
                    print '\n    WARNING zero insertion content probs read from %s, so setting to uniform distribution' % (self.indir + '/' + insertion + '_insertion_content.csv')
                for nuke in utils.nukes:
                    if total == 0.:
                        icontentprobs[nuke] = 1. / len(utils.nukes)
                    else:
                        if nuke not in icontentprobs:
                            print '    %s not in insertion content probs, adding with zero' % nuke
                            icontentprobs[nuke] = 0
                        icontentprobs[nuke] /= float(total)
        else:  # just return uniform probs for effective (fv and jf) insertions
            icontentprobs = {n: 0.25 for n in utils.nukes}

        assert utils.is_normed(icontentprobs)

        return icontentprobs
Example 22
    def read_mute_freqs(self, mute_freq_dir):
        # NOTE these are mute freqs, not branch lengths, but it's ok for now
        for mtype in ['all',] + utils.regions:
            infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
            self.branch_lengths[mtype] = {}
            self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype]['probs'] = [], []
            mutehist = plotting.make_hist_from_bin_entry_file(infname, mtype+'-mute-freqs')
            self.branch_lengths[mtype]['mean'] = mutehist.GetMean()

            if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX()+1) > 0.0:
                print 'WARNING nonzero under/overflow bins read from %s' % infname

            check_sum = 0.0
            for ibin in range(1, mutehist.GetNbinsX()+1):  # ignore under/overflow bins
                freq = mutehist.GetBinCenter(ibin)
                branch_length = float(freq)
                prob = mutehist.GetBinContent(ibin)
                self.branch_lengths[mtype]['lengths'].append(branch_length)
                self.branch_lengths[mtype]['probs'].append(prob)
                check_sum += self.branch_lengths[mtype]['probs'][-1]
            assert utils.is_normed(check_sum)

        if self.args.debug:
            print '  mean branch lengths'
            for mtype in ['all',] + utils.regions:
                print '     %4s %7.3f (ratio %7.3f)' % (mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean'])
Example 23
    def read_insertion_content(self):
        if self.args.rearrange_from_scratch:
            return {b : {n : 1. / len(utils.nukes) for n in utils.nukes} for b in utils.boundaries}

        insertion_content_probs = {}
        for bound in utils.boundaries:
            insertion_content_probs[bound] = {}
            with open(self.parameter_dir + '/' + bound + '_insertion_content.csv', 'r') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])
                for nuke in utils.nukes:
                    if nuke not in insertion_content_probs[bound]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        insertion_content_probs[bound][nuke] = 0
                    insertion_content_probs[bound][nuke] /= float(total)

            assert utils.is_normed(insertion_content_probs[bound])

        return insertion_content_probs
Example 24
    def read_insertion_info(self, this_gene, approved_genes=None):
        if approved_genes is None:  # if we aren't explicitly passed a list of genes to use, we just use the gene for which we're actually writing the hmm
            approved_genes = [this_gene,]

        genes_used = set()
        for insertion in self.insertions:
            self.insertion_probs[insertion] = {}
            deps = utils.column_dependencies[insertion + '_insertion']
            with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue

                    # then add in this insertion's counts
                    n_inserted = int(line[insertion + '_insertion'])
                    if n_inserted not in self.insertion_probs[insertion]:
                        self.insertion_probs[insertion][n_inserted] = 0.0
                    self.insertion_probs[insertion][n_inserted] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            assert len(self.insertion_probs[insertion]) > 0

            # print '   interpolate insertions'
            interpolate_bins(self.insertion_probs[insertion], self.n_max_to_interpolate, bin_eps=self.eps)  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

            if 0 not in self.insertion_probs[insertion] or len(self.insertion_probs[insertion]) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
                if self.args.debug:
                    print '    WARNING adding pseudocount to 1-bin in insertion probs'
                self.insertion_probs[insertion][0] = 1
                self.insertion_probs[insertion][1] = 1
                if self.args.debug:
                    print '      ', self.insertion_probs[insertion]

            assert 0 in self.insertion_probs[insertion] and len(self.insertion_probs[insertion]) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

            # and finally, normalize
            total = 0.0
            for _, val in self.insertion_probs[insertion].iteritems():
                total += val
            test_total = 0.0
            for n_inserted in self.insertion_probs[insertion]:
                self.insertion_probs[insertion][n_inserted] /= total
                test_total += self.insertion_probs[insertion][n_inserted]
            assert utils.is_normed(test_total)

            if 0 not in self.insertion_probs[insertion] or self.insertion_probs[insertion][0] == 1.0:
                print 'ERROR cannot have all or none of the probability mass in the zero bin:', self.insertion_probs[insertion]
                assert False

            # self.insertion_content_probs = {}
            self.read_insertion_content(insertion)  # also read the base content of the insertions

        if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
            if self.args.debug:
                print '    insertions used:', ' '.join(genes_used)
Example 25
    def write_mute_freqs(self, region, gene_or_insert_name, seq, reco_event, reco_seq_fname, is_insertion=False):
        """ Read position-by-position mute freqs from disk for <gene_or_insert_name>, renormalize, then write to a file for bppseqgen. """
        mute_freqs = self.get_mute_freqs(gene_or_insert_name)

        rates = []  # list with a relative mutation rate for each position in <seq>
        total = 0.0
        # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
        left_erosion_length = dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']
        for inuke in range(len(seq)):  # append a freq for each nuke
            position = inuke + left_erosion_length
            freq = 0.0
            if position in mute_freqs:
                freq = mute_freqs[position]
            else:
                freq = mute_freqs['overall_mean']
            rates.append(freq)
            total += freq

        # normalize to the number of sites (i.e. so an average site is given value 1.0)
        assert total != 0.0  # I am not hip enough to divide by zero
        for inuke in range(len(seq)):
            rates[inuke] *= float(len(seq)) / total
        total = 0.0

        # and... double check it, just for shits and giggles
        for inuke in range(len(seq)):
            total += rates[inuke]
        assert utils.is_normed(total / float(len(seq)))
        assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

        # write the input file for bppseqgen, one base per line
        with opener('w')(reco_seq_fname) as reco_seq_file:
            reco_seq_file.write('state\trate\n')
            for inuke in range(len(seq)):
                reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))
Example 26
    def check(self):
        total = 0.0
        for _, prob in self.transitions.iteritems():
            assert prob >= 0.0
            total += prob
        if not utils.is_normed(total):
            raise Exception('transition probs not normed in %s: %s' % (self.name, self.transitions))

        if self.name == 'init':  # no emissions for 'init' state
            return

        if self.emissions is not None:
            total = 0.0
            for _, prob in self.emissions['probs'].iteritems():
                assert prob >= 0.0
                total += prob
            assert utils.is_normed(total)
Example 28
    def add_region_entry_transitions(self, state, insertion):
        """
        Add transitions *into* the v, d, or j regions. Called on either the 'init' state or the 'insert_left' state.
        For v, this is (mostly) the prob that the read doesn't extend all the way to the left side of the v gene.
        For d and j, this is (mostly) the prob to actually erode on the left side.
        The two <mostly>s are there because in both cases, we're starting from *approximate* smith-waterman alignments, so we need to add some fuzz in case the s-w is off.
        """
        assert 'jf' not in insertion  # need these to only be *left*-hand insertions
        assert state.name == 'init' or 'insert' in state.name

        # first add transitions to the insert state
        region_entry_prob = 0.0  # Prob to go to an internal germline state (i.e. not to an insert state)
        if state.name == 'init':
            if insertion == '':
                region_entry_prob = 1.0  # if no insert state on this side (i.e. we're on left side of v), we have no choice but to enter the region (the internal states)
            else:
                region_entry_prob = self.get_zero_length_insertion_prob(insertion)  # prob of entering the region from 'init' is the prob of a zero-length insertion
        elif 'insert' in state.name:
            region_entry_prob = 1.0 - self.get_insert_self_transition_prob(insertion)  # the 'insert_left' state has to either go to itself, or else enter the region
        else:
            assert False

        # If this is an 'init' state, we add a transition to 'insert' with probability the observed probability of a non-zero insertion
        # Whereas if this is an 'insert' state, we add a *self*-transition with probability 1/<mean observed insert length>
        # update: now, we also multiply by the insertion content prob, since we now have four insert states (and can thus no longer use this prob in the emissions)
        if insertion != '' and region_entry_prob < 1.0:
            if insertion not in utils.boundaries:
                nukelist = ['N', ]
            else:
                nukelist = utils.nukes
            for nuke in nukelist:
                content_prob = 1. if nuke == 'N' else self.insertion_content_probs[insertion][nuke]
                state.add_transition('insert_left_' + nuke, (1.0 - region_entry_prob) * content_prob)

        # then add transitions to the region's internal states
        total = 0.0
        if self.region == 'v':  # only add a transition to the zeroth internal state
            state.add_transition('%s_%d' % (self.saniname, 0), region_entry_prob)
            total += region_entry_prob
            self.smallest_entry_index = 0
        else:
            erosion = self.region + '_5p'
            for inuke in range(len(self.germline_seq)):
                erosion_length = inuke
                if erosion_length in self.erosion_probs[erosion]:
                    prob = self.erosion_probs[erosion][erosion_length]
                    total += prob * region_entry_prob
                    if region_entry_prob != 0.0:  # only add the line if there's a chance of entering the region from this state
                        state.add_transition('%s_%d' % (self.saniname, inuke), prob * region_entry_prob)
                        if self.smallest_entry_index == -1 or inuke < self.smallest_entry_index:  # tells us where we need to start adding internal states (the smallest internal state index we add is the first one that has nonzero transition probability here)
                            self.smallest_entry_index = inuke
                    else:
                        assert state.name == 'init' or self.raw_name == glutils.dummy_d_genes[self.args.locus]  # if there's *no* chance of entering the region, this better *not* be the 'insert_left' state (UPDATE: or, it can be the dummy d)

        if region_entry_prob != 0.0 and not utils.is_normed(total / region_entry_prob):
            raise Exception('normalization problem in add_region_entry_transitions():\n  region_entry_prob: %f   total / region_entry_prob: %f' % (region_entry_prob, total / region_entry_prob))
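
The identity this final exception guards follows from the 5p erosion probs being normalized: summing prob * region_entry_prob over all entry points gives back region_entry_prob, so the ratio must be 1. A toy check with made-up numbers:

    erosion_probs = {0: 0.5, 1: 0.3, 2: 0.2}  # normalized toy values
    region_entry_prob = 0.6                   # e.g. the prob of a zero-length insertion
    total = sum(prob * region_entry_prob for prob in erosion_probs.values())
    assert abs(total / region_entry_prob - 1.0) < 1e-10  # the check utils.is_normed performs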
Example 29
    def add_region_entry_transitions(self, state, insertion):
        """
        Add transitions *into* the v, d, or j regions. Called from either the 'init' state or the 'insert_left' state.
        For v, this is (mostly) the prob that the read doesn't extend all the way to the left side of the v gene.
        For d and j, this is (mostly) the prob to actually erode on the left side.
        The two <mostly>s are there because in both cases, we're starting from *approximate* smith-waterman alignments, so we need to add some fuzz in case the s-w is off.
        """
        assert 'jf' not in insertion  # need these to only be *left*-hand insertions
        assert state.name == 'init' or 'insert' in state.name

        # first add transitions to the insert state
        region_entry_prob = 0.0  # Prob to go to an internal germline state (i.e. not to an insert state)
        if state.name == 'init':
            if insertion == '':
                region_entry_prob = 1.0  # if no insert state on this side (i.e. we're on left side of v), we have no choice but to enter the region (the internal states)
            else:
                region_entry_prob = self.get_zero_length_insertion_prob(insertion)  # prob of entering the region from 'init' is the prob of a zero-length insertion
        elif 'insert' in state.name:
            region_entry_prob = 1.0 - self.get_insert_self_transition_prob(insertion)  # the 'insert_left' state has to either go to itself, or else enter the region
        else:
            assert False

        # If this is an 'init' state, we add a transition to 'insert' with probability the observed probability of a non-zero insertion
        # Whereas if this is an 'insert' state, we add a *self*-transition with probability 1/<mean observed insert length>
        # update: now, we also multiply by the insertion content prob, since we now have four insert states (and can thus no longer use this prob in the emissions)
        if insertion != '':
            if insertion not in utils.boundaries:
                nukelist = ['N', ]
            else:
                nukelist = utils.nukes
            for nuke in nukelist:
                content_prob = 1. if nuke == 'N' else self.insertion_content_probs[insertion][nuke]
                state.add_transition('insert_left_' + nuke, (1.0 - region_entry_prob) * content_prob)

        # then add transitions to the region's internal states
        total = 0.0
        if self.region == 'v':  # only add a transition to the zeroth internal state
            state.add_transition('%s_%d' % (self.saniname, 0), region_entry_prob)
            total += region_entry_prob
            self.smallest_entry_index = 0
        else:
            erosion = self.region + '_5p'
            for inuke in range(len(self.germline_seq)):
                erosion_length = inuke
                if erosion_length in self.erosion_probs[erosion]:
                    prob = self.erosion_probs[erosion][erosion_length]
                    total += prob * region_entry_prob
                    if region_entry_prob != 0.0:  # only add the line if there's a chance of entering the region from this state
                        state.add_transition('%s_%d' % (self.saniname, inuke), prob * region_entry_prob)
                        if self.smallest_entry_index == -1 or inuke < self.smallest_entry_index:  # tells us where we need to start adding internal states (the smallest internal state index we add is the first one that has nonzero transition probability here)
                            self.smallest_entry_index = inuke
                    else:
                        assert state.name == 'init'  # if there's *no* chance of entering the region, this better *not* be the 'insert_left' state

        if region_entry_prob != 0.0 and not utils.is_normed(total / region_entry_prob):
            raise Exception('normalization problem in add_region_entry_transitions():\n  region_entry_prob: %f   total / region_entry_prob: %f' % (region_entry_prob, total / region_entry_prob))
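Throughout these examples utils.is_normed() is just a tolerance check against 1.0. A minimal sketch that's consistent with how it's called here (on single floats, on lists of probs, and on dicts of probs) -- the exact dispatch and the default this_eps are assumptions:

import math

def is_normed(probs, this_eps=1e-10):
    """ Return whether <probs> -- a float, or a list/dict of probabilities -- sums to 1 within <this_eps>. """
    if hasattr(probs, 'keys'):  # dict of probabilities, e.g. allele prevalence freqs
        total = sum(probs.values())
    elif hasattr(probs, '__iter__'):  # list of probabilities
        total = sum(probs)
    else:  # single float
        total = probs
    return math.fabs(total - 1.0) < this_eps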
Example 30
def choose_allele_prevalence_freqs(glfo, allele_prevalence_freqs, region, min_allele_prevalence_freq, debug=False):
    n_alleles = len(glfo['seqs'][region])
    prevalence_counts = numpy.random.randint(1, int(1. / min_allele_prevalence_freq), size=n_alleles)  # ensures that each pair of alleles has a prevalence ratio between <min_allele_prevalence_freq> and 1. NOTE it's inclusive
    prevalence_freqs = [float(c) / sum(prevalence_counts) for c in prevalence_counts]
    allele_prevalence_freqs[region] = {g : f for g, f in zip(glfo['seqs'][region].keys(), prevalence_freqs)}
    assert utils.is_normed(allele_prevalence_freqs[region])
    if debug:
        print '      counts %s' % ' '.join([('%5d' % c) for c in prevalence_counts])
        print '       freqs %s' % ' '.join([('%5.3f' % c) for c in prevalence_freqs])
        print '   min ratio %.3f' % (min(prevalence_freqs) / max(prevalence_freqs))
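A quick usage sketch (the glfo dict here is pared down to the one key this function reads; the gene names, sequences, and resulting freqs are invented):

import numpy
glfo = {'seqs' : {'v' : {'IGHV1-2*01' : 'caggtgcag', 'IGHV1-2*02' : 'caggtgcaa'}}}
allele_prevalence_freqs = {}
choose_allele_prevalence_freqs(glfo, allele_prevalence_freqs, 'v', min_allele_prevalence_freq=0.1, debug=True)
print allele_prevalence_freqs['v']  # e.g. {'IGHV1-2*01' : 0.35, 'IGHV1-2*02' : 0.65} -- normalized, with min/max ratio of (roughly) at least 0.1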
Example 31
    def add_region_entry_transitions(self, state, insertion):
        """
        Add transitions *into* the v, d, or j regions. Called from either the 'init' state or the 'insert_left' state.
        For v, this is (mostly) the prob that the read doesn't extend all the way to the left side of the v gene.
        For d and j, this is (mostly) the prob to actually erode on the left side.
        The two <mostly>s are there because in both cases, we're starting from *approximate* smith-waterman alignments, so we need to add some fuzz in case the s-w is off.
        """
        assert 'jf' not in insertion  # need these to only be *left*-hand insertions
        assert state.name == 'init' or 'insert' in state.name

        region_entry_prob = 0.0  # Prob to go directly into the region (i.e. with no insertion)
        # The sum of the region entry probs must be (1 - non_zero_insertion_prob) for d and j
        # (i.e. such that [prob of transitions to insert] + [prob of transitions *not* to insert] is 1.0)

        # first add transitions to the insert state
        if state.name == 'init':
            if insertion == '':
                region_entry_prob = 1.0  # if no insert state on this side (i.e. we're on left side of v), we have no choice but to enter the region (the internal states)
            else:
                region_entry_prob = self.insertion_probs[insertion][0]  # prob of entering the region from 'init' is the prob of a zero-length insertion
        elif 'insert' in state.name:
            region_entry_prob = 1.0 - self.get_insert_self_transition_prob(insertion)  # the 'insert_left' state has to either go to itself, or else enter the region
        else:
            assert False

        # If this is an 'init' state, we add a transition to 'insert' with probability the observed probability of a non-zero insertion
        # Whereas if this is an 'insert' state, we add a *self*-transition with probability 1/<mean observed insert length>
        # update: now, we also multiply by the insertion content prob, since we now have four insert states (and can thus no longer use this prob in the emissions)
        if insertion != '':
            for nuke in utils.nukes:
                state.add_transition('insert_left_' + nuke, (1.0 - region_entry_prob) * self.insertion_content_probs[insertion][nuke])

        # then add transitions to the region's internal states
        erosion = self.region + '_5p'
        total = 0.0
        for inuke in range(len(self.germline_seq)):
            erosion_length = inuke
            if erosion_length in self.erosion_probs[erosion]:
                prob = self.erosion_probs[erosion][erosion_length]
                total += prob * region_entry_prob
                if region_entry_prob != 0.0:  # only add the line if there's a chance of entering the region from this state
                    state.add_transition('%s_%d' % (self.saniname, inuke), prob * region_entry_prob)
                    if self.smallest_entry_index == -1 or inuke < self.smallest_entry_index:
                        self.smallest_entry_index = inuke
                else:
                    assert state.name == 'init'  # if there's *no* chance of entering the region, this better *not* be the 'insert_left' state

        assert region_entry_prob == 0.0 or utils.is_normed(total / region_entry_prob)
Example 32
def choose_allele_prevalence_freqs(glfo, allele_prevalence_freqs, region, min_allele_prevalence_freq, debug=False):
    n_alleles = len(glfo["seqs"][region])
    prevalence_counts = numpy.random.randint(
        1, int(1.0 / min_allele_prevalence_freq), size=n_alleles
    )  # ensures that each pair of alleles has a prevalence ratio between <min_allele_prevalence_freq> and 1. NOTE it's inclusive
    prevalence_freqs = [float(c) / sum(prevalence_counts) for c in prevalence_counts]
    allele_prevalence_freqs[region] = {g: f for g, f in zip(glfo["seqs"][region].keys(), prevalence_freqs)}
    assert utils.is_normed(allele_prevalence_freqs[region])
    if debug:
        print "      counts %s" % " ".join([("%5d" % c) for c in prevalence_counts])
        print "       freqs %s" % " ".join([("%5.3f" % c) for c in prevalence_freqs])
        print "   min ratio %.3f" % (min(prevalence_freqs) / max(prevalence_freqs))
Example 33
    def set_branch_lengths(self, parameter_dir):
        self.branch_lengths = {}
        for mtype in ['all'] + utils.regions:
            hist = self.get_mute_hist(mtype, parameter_dir)
            hist.normalize(include_overflows=False, expect_overflows=True)  # if it was written with overflows included, it'll need to be renormalized
            lengths, probs = [], []
            for ibin in range(1, hist.n_bins + 1):  # ignore under/overflow bins
                freq = hist.get_bin_centers()[ibin]
                lengths.append(self.convert_observed_changes_to_branch_length(float(freq)))
                probs.append(hist.bin_contents[ibin])
            self.branch_lengths[mtype] = {'mean' : hist.get_mean(), 'lengths' : lengths, 'probs' : probs}

            if not utils.is_normed(probs):
                raise Exception('not normalized %f' % sum(probs))
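The convert_observed_changes_to_branch_length() call above maps an observed per-site mutation frequency to a tree branch length. One plausible sketch, assuming the textbook Jukes-Cantor correction (the actual conversion in the source may well be simpler, e.g. an interpolation or even the identity):

import math

def convert_observed_changes_to_branch_length(mute_freq):
    # JC69 correction (an assumption, not taken from the source): the observed change frequency
    # underestimates substitutions per site, since repeated hits at a site show up as at most one change
    return -0.75 * math.log(1. - (4. / 3.) * mute_freq)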
Example 34
    def write_mute_freqs(self, gene, seq, reco_event, reco_seq_fname):  # TODO unsurprisingly, this function profiles out to be kind of a dumb way to do it, in terms of run time
        """ Read position-by-position mute freqs from disk for <gene>, renormalize, then write to a file for bppseqgen. """
        mute_freqs = self.get_mute_freqs(gene)

        rates = []  # list with a relative mutation rate for each position in <seq>
        total = 0.0
        # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
        left_erosion_length = dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[utils.get_region(gene) + '_5p']
        for inuke in range(len(seq)):  # append a freq for each nuke
            position = inuke + left_erosion_length
            freq = 0.0
            if position in mute_freqs:
                freq = mute_freqs[position]
            else:
                freq = mute_freqs['overall_mean']
            rates.append(freq)
            total += freq

        # normalize to the number of sites (i.e. so an average site is given value 1.0)
        assert total != 0.0  # I am not hip enough to divide by zero
        for inuke in range(len(seq)):
            rates[inuke] *= float(len(seq)) / total
        total = 0.0

        # and... double check it, just for shits and giggles
        for inuke in range(len(seq)):
            total += rates[inuke]
        assert utils.is_normed(total / float(len(seq)))
        assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

        # write the input file for bppseqgen, one base per line
        with open(reco_seq_fname, 'w') as reco_seq_file:
            # NOTE really not sure why this doesn't really [seems to require an "extra" column] work with csv.DictWriter, but it doesn't -- bppseqgen barfs (I think maybe it expects a different newline character? don't feel like working it out)
            headstr = 'state'
            if not self.args.mutate_from_scratch:
                headstr += '\trate'
            reco_seq_file.write(headstr + '\n')
            for inuke in range(len(seq)):
                linestr = seq[inuke]
                if not self.args.mutate_from_scratch:
                    linestr += '\t%f' % rates[inuke]
                reco_seq_file.write(linestr + '\n')
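The resulting <reco_seq_fname> is a plain tab-separated file with one base per line; with --mutate-from-scratch unset it would look something like (bases and rates invented):

state	rate
A	1.213401
C	0.874200
G	1.002133
T	0.910266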
Example 35
def simulate(args):
    if utils.output_exists(args, args.simfname):
        return
    cmd_str = args.partis_path + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --outfname ' + args.simfname + ' --n-leaves ' + str(args.n_leaves) + ' --rearrange-from-scratch --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters'
    if args.n_leaf_distribution is None:
        cmd_str += ' --constant-number-of-leaves'
    else:
        cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
    if args.mut_mult is not None:
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
    if args.root_mrca_weibull_parameter is not None:
        cmd_str += ' --root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter)

    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm --subsimproc'

    allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    # figure what genes we're using
    if args.gls_gen:
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None

        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sglfo)
        glutils.generate_germline_set(sglfo, args.n_genes_per_region, args.n_sim_alleles_per_gene, args.min_allele_prevalence_freq, allele_prevalence_fname, new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
        cmd_str += ' --allele-prevalence-fname ' + allele_prevalence_fname
    else:
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))  # NOTE template gene removal is the default for glutils.generate_germline_set

        if args.allele_prevalence_freqs is not None:
            if not utils.is_normed(args.allele_prevalence_freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
            if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(args.allele_prevalence_freqs), len(sglfo['seqs']['v'])))
            gene_list = sorted(sglfo['seqs']['v']) if len(added_snp_names) == 0 else list(set(args.sim_v_genes)) + added_snp_names
            prevalence_freqs = {'v' : {g : f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd' : {}, 'j' : {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, allele_prevalence_fname)
            cmd_str += ' --allele-prevalence-fname ' + allele_prevalence_fname

    glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
    cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'
    # glutils.print_glfo(sglfo)

    # run simulation
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    utils.simplerun(cmd_str, dryrun=args.dry_run)
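With typical (invented) argument values, the assembled command comes out something like:

partis simulate --n-sim-events 100 --outfname simu.csv --n-leaves 5 --rearrange-from-scratch \
    --shm-parameter-dir <partis_dir>/data/recombinator/scratch-parameters --constant-number-of-leaves \
    --initial-germline-dir germlines/simulation --n-procs 8 --seed 1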
Example 36
def read_allele_prevalence_freqs(fname, debug=False):
    # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
    allele_prevalence_freqs = {r: {} for r in utils.regions}
    with open(fname) as pfile:
        reader = csv.DictReader(pfile)
        for line in reader:
            allele_prevalence_freqs[utils.get_region(line["gene"])][line["gene"]] = float(line["freq"])
    for region in utils.regions:
        if len(allele_prevalence_freqs[region]) == 0:
            continue
        if debug:
            for gene, freq in allele_prevalence_freqs[region].items():
                print "%14.8f   %s" % (freq, utils.color_gene(gene))
        assert utils.is_normed(allele_prevalence_freqs[region])
    return allele_prevalence_freqs
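The csv this reads is expected to have 'gene' and 'freq' columns, with freqs summing to 1 within each region, e.g. (genes and freqs invented):

gene,freq
IGHV1-2*01,0.35
IGHV1-2*02,0.65
IGHJ4*02,1.0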
Example 37
    def check(self):
        total = 0.0
        for _, prob in self.transitions.iteritems():
            assert prob >= 0.0
            total += prob
        assert utils.is_normed(total)

        if self.name == 'init':  # no emissions for 'init' state
            return

        if self.emissions is not None:
            total = 0.0
            for _, prob in self.emissions['probs'].iteritems():
                assert prob >= 0.0
                total += prob
            assert utils.is_normed(total)

        if self.pair_emissions is not None:
            total = 0.0
            for letter1 in self.pair_emissions['probs']:
                for _, prob in self.pair_emissions['probs'][letter1].iteritems():
                    assert prob >= 0.0
                    total += prob
            assert utils.is_normed(total)
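The transitions and emissions being checked are evidently plain {name : prob} dicts; a state that passes check() would look something like (names and numbers invented):

state.transitions = {'insert_left_A' : 0.05, 'insert_left_C' : 0.05, 'insert_left_G' : 0.05, 'insert_left_T' : 0.05, 'IGHV1-2_star_01_0' : 0.8}
state.emissions = {'probs' : {'A' : 0.94, 'C' : 0.02, 'G' : 0.02, 'T' : 0.02}}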
Example 38
def rescale_tree(treestr, new_height, debug=False):
    """ rescale the branch lengths in <treestr> (newick-formatted) by <factor> """
    tree = get_btree(treestr)
    mean_height = get_mean_height(treestr)
    for ln in tree.Objects:
        old_length = ln.length
        ln.length *= new_height / mean_height  # rescale every branch length in the tree by the ratio of desired to existing height (everybody's heights should be the same... but they never quite were when I was using Bio.Phylo, so, uh. yeah, uh. not sure what to do, but this is fine. It's checked below, anyway)
        if debug:
            print '  %5s  %7e  -->  %7e' % (ln.numName if ln.branchType == 'leaf' else ln.branchType, old_length, ln.length)
    tree.traverse_tree()
    treestr = tree.toString(numName=True)
    for leaf in get_btree(treestr).leaves:  # make sure string conversion (and rescaling) went ok
        if not utils.is_normed(leaf.height / new_height, this_eps=1e-8):
            raise Exception('tree not rescaled properly:   %.10f   %.10f    %e' % (leaf.height, new_height, (leaf.height - new_height) / new_height))
    return treestr
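Usage sketch (assuming get_btree() wraps a newick parser in which both leaves start at the same height):

treestr = '(t1:0.05,t2:0.05):0.0;'
rescaled = rescale_tree(treestr, 0.1)  # every branch gets scaled by 0.1 / 0.05, so each leaf ends up at height 0.1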
Example 39
def read_allele_prevalence_freqs(fname, debug=False):
    # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
    allele_prevalence_freqs = {r : {} for r in utils.regions}
    with open(fname) as pfile:
        reader = csv.DictReader(pfile)
        for line in reader:
            allele_prevalence_freqs[utils.get_region(line['gene'])][line['gene']] = float(line['freq'])
    for region in utils.regions:
        if len(allele_prevalence_freqs[region]) == 0:
            continue
        if debug:
            for gene, freq in allele_prevalence_freqs[region].items():
                print '%14.8f   %s' % (freq, utils.color_gene(gene))
        assert utils.is_normed(allele_prevalence_freqs[region])
    return allele_prevalence_freqs
Example 40
    def add_region_entry_transitions(self, state, insertion):
        """
        Add transitions *into* the v, d, or j regions. Called from either the 'init' state or the 'insert_left' state.
        For v, this is (mostly) the prob that the read doesn't extend all the way to the left side of the v gene.
        For d and j, this is (mostly) the prob to actually erode on the left side.
        The two <mostly>s are there because in both cases, we're starting from *approximate* smith-waterman alignments, so we need to add some fuzz in case the s-w is off.
        """
        assert 'jf' not in insertion  # need these to only be *left*-hand insertions
        assert state.name == 'init' or 'insert' in state.name

        region_entry_prob = 0.0  # Prob to go directly into the region (i.e. with no insertion)
                                 # The sum of the region entry probs must be (1 - non_zero_insertion_prob) for d and j
                                 # (i.e. such that [prob of transitions to insert] + [prob of transitions *not* to insert] is 1.0)

        # first add transitions to the insert state
        if state.name == 'init':
            if insertion == '':
                region_entry_prob = 1.0  # if no insert state on this side (i.e. we're on left side of v), we have no choice but to enter the region (the internal states)
            else:
                region_entry_prob = self.insertion_probs[insertion][0]  # prob of entering the region from 'init' is the prob of a zero-length insertion
        elif 'insert' in state.name:
            region_entry_prob = 1.0 - self.get_insert_self_transition_prob(insertion)  # the 'insert_left' state has to either go to itself, or else enter the region
        else:
            assert False

        # If this is an 'init' state, we add a transition to 'insert' with probability the observed probability of a non-zero insertion
        # Whereas if this is an 'insert' state, we add a *self*-transition with probability 1/<mean observed insert length>
        # update: now, we also multiply by the insertion content prob, since we now have four insert states (and can thus no longer use this prob in the emissions)
        if insertion != '':
            for nuke in utils.nukes:
                state.add_transition('insert_left_' + nuke, (1.0 - region_entry_prob) * self.insertion_content_probs[insertion][nuke])

        # then add transitions to the region's internal states
        erosion = self.region + '_5p'
        total = 0.0
        for inuke in range(len(self.germline_seq)):
            erosion_length = inuke
            if erosion_length in self.erosion_probs[erosion]:
                prob = self.erosion_probs[erosion][erosion_length]
                total += prob * region_entry_prob
                if region_entry_prob != 0.0:  # only add the line if there's a chance of entering the region from this state
                    state.add_transition('%s_%d' % (self.saniname, inuke), prob * region_entry_prob)
                    if self.smallest_entry_index == -1 or inuke < self.smallest_entry_index:
                        self.smallest_entry_index = inuke
                else:
                    assert state.name == 'init'  # if there's *no* chance of entering the region, this better *not* be the 'insert_left' state

        assert region_entry_prob == 0.0 or utils.is_normed(total / region_entry_prob)
Example 41
    def read_insertion_content(self):
        self.insertion_content_probs = {}
        for bound in utils.boundaries:
            self.insertion_content_probs[bound] = {}
            with opener('r')(self.args.parameter_dir + '/' + bound + '_insertion_content.csv') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    self.insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])
                for nuke in utils.nukes:
                    if nuke not in self.insertion_content_probs[bound]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        self.insertion_content_probs[bound][nuke] = 0
                    self.insertion_content_probs[bound][nuke] /= float(total)

            assert utils.is_normed(self.insertion_content_probs[bound])
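The <bound>_insertion_content.csv files read here would look something like (counts invented):

vd_insertion_content,count
A,1402
C,1181
G,1633
T,997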
Example 42
 def normalize(self):
     sum_value = 0.0
     for ib in range(1, self.n_bins + 1):  # don't include under/overflows in sum_value
         sum_value += self.bin_contents[ib]
     if sum_value == 0.0:
         print 'WARNING sum zero in Hist::normalize, returning without doing anything'
         return
     # make sure there's not too much stuff in the under/overflows
     if self.bin_contents[0]/sum_value > 1e-10 or self.bin_contents[self.n_bins+1]/sum_value > 1e-10:
         print 'WARNING under/overflows'
     for ib in range(1, self.n_bins + 1):
         self.bin_contents[ib] /= sum_value
         if self.sum_weights_squared is not None:
             self.sum_weights_squared[ib] /= sum_value*sum_value
     check_sum = 0.0
     for ib in range(1, self.n_bins + 1):  # check it
         check_sum += self.bin_contents[ib]
     assert is_normed(check_sum, this_eps=1e-10)
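Usage sketch (the Hist constructor is hypothetical; bin_contents uses index 0 for underflow and n_bins+1 for overflow, as in the loops above):

h = Hist(n_bins=3, xmin=0., xmax=3.)
h.bin_contents = [0.0, 2.0, 5.0, 3.0, 0.0]  # [underflow, bin 1, bin 2, bin 3, overflow]
h.normalize()
print h.bin_contents  # [0.0, 0.2, 0.5, 0.3, 0.0]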
Example 43
 def get_rescaled_trees(self, treestr, branch_length_ratios):
     """ 
     Trees are generated with the mean branch length observed in data over the whole sequence, because we want to use topologically
     the same tree for the whole sequence. But we observe different branch lengths for each region, so we need to rescale the tree for 
     v, d, and j
     """
     rescaled_trees = {}
     for region in utils.regions:
         # rescale the tree
         rescaled_trees[region] = treegenerator.rescale_tree(treestr, branch_length_ratios[region])
         # print 'rescaled %s by %f: %s -> %s' % (region, branch_length_ratios[region], treestr, rescaled_trees[region])
         # and then check it NOTE can remove this eventually
         initial_depths = {}
         for node, depth in treegenerator.get_leaf_node_depths(treestr).items():
             initial_depths[node] = depth
         for node, depth in treegenerator.get_leaf_node_depths(rescaled_trees[region]).items():
             depth_ratio = depth / initial_depths[node]
             assert utils.is_normed(depth_ratio / branch_length_ratios[region], this_eps=1e-6)
     return rescaled_trees
Example 45
    def write_mute_freqs(self, region, gene_name, seq, reco_event, reco_seq_fname, is_insertion=False):
        """ Read position-by-position mute freqs from disk for <gene_name>, renormalize, then write to a file for bppseqgen. """
        replacement_genes = None
        if is_insertion:
            replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=-1, all_from_region='v')
        else:
            n_occurences = utils.read_overall_gene_probs(self.args.parameter_dir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
            if n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
                # print '    only saw %s %d times, use info from other genes' % (utils.color_gene(gene_name), n_occurences)
                replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_name, single_gene=False)

        mute_freqs, mute_counts = paramutils.read_mute_info(self.args.parameter_dir, this_gene=gene_name, approved_genes=replacement_genes)
        rates = []  # list with a relative mutation rate for each position in <seq>
        total = 0.0
        # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
        for inuke in range(len(seq)):  # append a freq for each nuke
            position = inuke + dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']
            freq = 0.0
            if position in mute_freqs:
                freq = mute_freqs[position]
            else:
                freq = mute_freqs['overall_mean']
            rates.append(freq)
            total += freq

        # normalize to the number of sites (i.e. so an average site is given value 1.0)
        assert total != 0.0  # I am not hip enough to divide by zero
        for inuke in range(len(seq)):
            rates[inuke] *= float(len(seq)) / total
        total = 0.0

        # and... double check it, just for shits and giggles
        for inuke in range(len(seq)):
            total += rates[inuke]
        assert utils.is_normed(total / float(len(seq)))
        assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

        # write the input file for bppseqgen, one base per line
        with opener('w')(reco_seq_fname) as reco_seq_file:
            reco_seq_file.write('state\trate\n')
            for inuke in range(len(seq)):
                reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))
Example 46
    def write_mute_freqs(self, gene, seq, reco_event, reco_seq_fname):
        """ Read position-by-position mute freqs from disk for <gene>, renormalize, then write to a file for bppseqgen. """
        mute_freqs = self.get_mute_freqs(gene)

        rates = []  # list with a relative mutation rate for each position in <seq>
        total = 0.0
        # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
        left_erosion_length = dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[utils.get_region(gene) + '_5p']
        for inuke in range(len(seq)):  # append a freq for each nuke
            position = inuke + left_erosion_length
            freq = 0.0
            if position in mute_freqs:
                freq = mute_freqs[position]
            else:
                freq = mute_freqs['overall_mean']
            rates.append(freq)
            total += freq

        # normalize to the number of sites (i.e. so an average site is given value 1.0)
        assert total != 0.0  # I am not hip enough to divide by zero
        for inuke in range(len(seq)):
            rates[inuke] *= float(len(seq)) / total
        total = 0.0

        # and... double check it, just for shits and giggles
        for inuke in range(len(seq)):
            total += rates[inuke]
        assert utils.is_normed(total / float(len(seq)))
        assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

        # write the input file for bppseqgen, one base per line
        with opener('w')(reco_seq_fname) as reco_seq_file:
            # NOTE really not sure why this doesn't really [seems to require an "extra" column] work with csv.DictWriter, but it doesn't -- bppseqgen barfs (I think maybe it expects a different newline character? don't feel like working it out)
            headstr = 'state'
            if not self.args.mutate_from_scratch:
                headstr += '\trate'
            reco_seq_file.write(headstr + '\n')
            for inuke in range(len(seq)):
                linestr = seq[inuke]
                if not self.args.mutate_from_scratch:
                    linestr += '\t%f' % rates[inuke]
                reco_seq_file.write(linestr + '\n')
Example 47
    def read_insertion_content(self, insertion):
        self.insertion_content_probs[insertion] = {}
        if self.args.insertion_base_content:
            with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    self.insertion_content_probs[insertion][line[insertion + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])
                for nuke in utils.nukes:
                    if nuke not in self.insertion_content_probs[insertion]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        self.insertion_content_probs[insertion][nuke] = 0
                    self.insertion_content_probs[insertion][nuke] /= float(total)
        else:
            self.insertion_content_probs[insertion] = {'A':0.25, 'C':0.25, 'G':0.25, 'T':0.25}

        assert utils.is_normed(self.insertion_content_probs[insertion])
        if self.args.debug:
            print '  insertion content for', insertion, self.insertion_content_probs[insertion]
Example 48
    def read_insertion_content(self, insertion):
        self.insertion_content_probs[insertion] = {}
        if insertion in utils.boundaries:  # only real boundaries (vd, dj) have insertion content files; fv and jf insertions fall through to the uniform probs below
            with opener("r")(self.indir + "/" + insertion + "_insertion_content.csv") as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    self.insertion_content_probs[insertion][line[insertion + "_insertion_content"]] = int(line["count"])
                    total += int(line["count"])
                for nuke in utils.nukes:
                    if nuke not in self.insertion_content_probs[insertion]:
                        print "    %s not in insertion content probs, adding with zero" % nuke
                        self.insertion_content_probs[insertion][nuke] = 0
                    self.insertion_content_probs[insertion][nuke] /= float(total)
        else:
            self.insertion_content_probs[insertion] = {n: 0.25 for n in utils.nukes}

        assert utils.is_normed(self.insertion_content_probs[insertion])
        if self.args.debug:
            print "  insertion content for", insertion, self.insertion_content_probs[insertion]
Example 49
 def check_tree_lengths(self, treefname, ages):
     treestrs = []
     with opener('r')(treefname) as treefile:
         for line in treefile:
             treestrs.append(line.split(';')[0] + ';')  # ignore the info I added after the ';'
     if self.args.debug > 1:
         print '  checking branch lengths... '
     assert len(treestrs) == len(ages)
     total = 0.0
     for itree in range(len(ages)):
         if self.args.debug > 1:
             print '    asked for', ages[itree],
         for name, depth in get_leaf_node_depths(treestrs[itree]).items():
             if self.args.debug > 1:
                 print '%s:%f' % (name, depth),
             assert utils.is_normed(depth / ages[itree], this_eps=1e-6)  # ratio of <age> (requested length) and <length> (length in the tree file) should be 1 within float precision
         total += ages[itree]
         if self.args.debug > 1:
             print ''
     if self.args.debug:
         print '    branch lengths ok (mean %f)' % (total / len(ages))
Example 50
 def normalize(self, overflow_warn=True):  # since when you normalize hists you have to make the arbitrary decision whether you're going to include the under/overflow bins (we don't include them here), in general we prefer to avoid having under/overflow entries
     """ NOTE does not multiply/divide by bin widths """
     sum_value = 0.0
     for ib in range(1, self.n_bins + 1):  # don't include under/overflows
         sum_value += self.bin_contents[ib]
     if sum_value == 0.0:
         print 'WARNING sum zero in Hist::normalize(), returning without doing anything'
         return
     # make sure there's not too much stuff in the under/overflows
     if overflow_warn and (self.bin_contents[0]/sum_value > 1e-10 or self.bin_contents[self.n_bins+1]/sum_value > 1e-10):
         print 'WARNING under/overflows in Hist::normalize()'
     for ib in range(1, self.n_bins + 1):
         self.bin_contents[ib] /= sum_value
         if self.sum_weights_squared is not None:
             self.sum_weights_squared[ib] /= sum_value*sum_value
         if self.errors is not None:
             self.errors[ib] /= sum_value
     check_sum = 0.0
     for ib in range(1, self.n_bins + 1):  # check it
         check_sum += self.bin_contents[ib]
     assert is_normed(check_sum, this_eps=1e-10)
Example 51
    def read_insertion_content(self):
        if self.args.rearrange_from_scratch:
            return {b : {n : 1./len(utils.nukes) for n in utils.nukes} for b in utils.boundaries}

        insertion_content_probs = {}
        for bound in utils.boundaries:
            insertion_content_probs[bound] = {}
            with opener('r')(self.parameter_dir + '/' + bound + '_insertion_content.csv') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])
                for nuke in utils.nukes:
                    if nuke not in insertion_content_probs[bound]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        insertion_content_probs[bound][nuke] = 0
                    insertion_content_probs[bound][nuke] /= float(total)

            assert utils.is_normed(insertion_content_probs[bound])

        return insertion_content_probs
Example 52
 def normalize(self, include_overflows=True, expect_empty=False, expect_overflows=False, overflow_eps_to_ignore=1e-15):
     sum_value = self.integral(include_overflows)
     imin, imax = self.get_bounds(include_overflows)
     if sum_value == 0.0:
         if not expect_empty:
             print 'WARNING sum zero in Hist::normalize()'
         return
     if not expect_overflows and not include_overflows and (self.bin_contents[0]/sum_value > overflow_eps_to_ignore or self.bin_contents[self.n_bins+1]/sum_value > overflow_eps_to_ignore):
         print 'WARNING under/overflows in Hist::normalize()'
     for ib in range(imin, imax):
         self.bin_contents[ib] /= sum_value
         if self.sum_weights_squared is not None:
             self.sum_weights_squared[ib] /= sum_value*sum_value
         if self.errors is not None:
             self.errors[ib] /= sum_value
     check_sum = 0.0
     for ib in range(imin, imax):  # check it
         check_sum += self.bin_contents[ib]
     if not is_normed(check_sum, this_eps=1e-10):
         raise Exception('not normalized: %f' % check_sum)
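For the loop bounds above to work out, the two Hist helpers this version calls would have to behave roughly like this sketch (inferred from usage, not taken from the source; bins 0 and n_bins+1 are the under/overflow bins):

 def get_bounds(self, include_overflows):
     if include_overflows:
         return 0, self.n_bins + 2
     else:
         return 1, self.n_bins + 1

 def integral(self, include_overflows):
     imin, imax = self.get_bounds(include_overflows)
     return sum(self.bin_contents[ib] for ib in range(imin, imax))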
Example 53
    def read_mute_freqs(self, mute_freq_dir):
        # NOTE these are mute freqs, not branch lengths, but it's ok for now
        for mtype in ['all'] + utils.regions:
            infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
            self.branch_lengths[mtype] = {}
            self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype]['probs'] = [], []
            mutehist = plotting.make_hist_from_bin_entry_file(infname, mtype + '-mute-freqs')
            self.branch_lengths[mtype]['mean'] = mutehist.GetMean()

            if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX() + 1) > 0.0:
                print 'WARNING nonzero under/overflow bins read from %s' % infname

            check_sum = 0.0
            for ibin in range(1, mutehist.GetNbinsX() + 1):  # ignore under/overflow bins
                freq = mutehist.GetBinCenter(ibin)
                branch_length = float(freq)
                prob = mutehist.GetBinContent(ibin)
                self.branch_lengths[mtype]['lengths'].append(branch_length)
                self.branch_lengths[mtype]['probs'].append(prob)
                check_sum += self.branch_lengths[mtype]['probs'][-1]
            assert utils.is_normed(check_sum)

        if self.args.debug:
            print '  mean branch lengths'
            for mtype in ['all'] + utils.regions:
                print '     %4s %7.3f (ratio %7.3f)' % (mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean'])
Example 55
 def check_tree_lengths(self, treefname, ages):
     treestrs = []
     with opener('r')(treefname) as treefile:
         for line in treefile:
             treestrs.append(line.split(';')[0] + ';')  # ignore the info I added after the ';'
     if self.args.debug > 1:
         print '  checking branch lengths... '
     assert len(treestrs) == len(ages)
     total_length, total_leaves = 0.0, 0
     for itree in range(len(ages)):
         if self.args.debug > 1:
             print '    asked for', ages[itree],
         for name, depth in get_leaf_node_depths(treestrs[itree]).items():
             if self.args.debug > 1:
                 print '%s:%.8f' % (name, depth),
             if not utils.is_normed(depth / ages[itree], this_eps=1e-4):
                 raise Exception('asked for branch length %.8f but got %.8f\n   %s' % (ages[itree], depth, treestrs[itree]))  # ratio of <age> (requested length) and <length> (length in the tree file) should be 1 within float precision
         total_length += ages[itree]
         total_leaves += len(re.findall('t', treestrs[itree]))
         if self.args.debug > 1:
             print ''
     if self.args.debug:
         print '    mean branch length %.5f' % (total_length / len(ages))
         print '    mean n leaves %.2f' % (float(total_leaves) / len(ages))
Example 56
            n_new_alleles = len(positions[mtype])
        if len(positions[mtype]) != n_new_alleles:
            raise Exception('mismatched number of new alleles for %s' % ' vs '.join(mtypes))
if n_new_alleles is None:
    n_new_alleles = 0
for mtype in mtypes:
    if positions[mtype] is None:  # if it wasn't specified at all, i.e. we don't want to generate any new alleles
        positions[mtype] = [[] for _ in range(n_new_alleles)]
args.new_allele_info = [{'gene' : args.sim_v_genes[igene] if not args.gls_gen else None,
                         'snp-positions' : positions['snp'][igene],
                         'indel-positions' : positions['indel'][igene]}
                        for igene in range(n_new_alleles)]

if args.allele_prevalence_freqs is not None:
    # easier to check the length after we've generated snpd genes (above)
    if not utils.is_normed(args.allele_prevalence_freqs):
        raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
if args.inf_glfo_dir is None:
    args.inf_glfo_dir = args.outdir + '/germlines/inference'
if args.simfname is None:
    args.simfname = args.outdir + '/simu.csv'

if args.seed is not None:
    random.seed(args.seed)
    numpy.random.seed(args.seed)

if args.n_tests is not None:
    multiple_tests(args)
else:
    run_tests(args)
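For concreteness, with two new alleles -- one with snps at (invented) positions 35 and 82, the other with a single indel at position 100 -- args.new_allele_info ends up like:

args.new_allele_info = [{'gene' : 'IGHV1-2*02', 'snp-positions' : [35, 82], 'indel-positions' : []},
                        {'gene' : 'IGHV4-39*01', 'snp-positions' : [], 'indel-positions' : [100]}]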
Example 57
def process(args):
    if args.action == 'run-viterbi':
        print '  note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'annotate'
    if args.action == 'view-alternative-naive-seqs':
        print '  note: replacing deprecated action name \'view-alternative-naive-seqs\' with current name \'view-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'view-alternative-annotations'

    args.light_chain_fractions = utils.get_arg_list(args.light_chain_fractions, key_val_pairs=True, floatify=True)
    if args.light_chain_fractions is not None and not utils.is_normed(args.light_chain_fractions.values()):
        raise Exception('--light-chain-fractions %s don\'t add to 1: %f' % (args.light_chain_fractions, sum(args.light_chain_fractions.values())))
    if args.action == 'merge-paired-partitions':
        assert args.paired_loci
    if args.paired_loci:
        args.locus = None
        if [args.infname, args.paired_indir].count(None) == 0:
            raise Exception('can\'t specify both --infname and --paired-indir')
        if args.outfname is not None:
            raise Exception('can\'t set --outfname if --paired-loci is set (use --paired-outdir)')
        if args.plotdir == 'paired-outdir':
            args.plotdir = args.paired_outdir
        if args.plotdir is None and args.action == 'plot-partitions':
            args.plotdir = args.paired_outdir
    else:
        assert args.paired_indir is None
    if not args.paired_loci and (args.paired_indir is not None or args.paired_outdir is not None):
        raise Exception('--paired-loci must be set if either --paired-indir or --paired-outdir is set')
    if args.reverse_negative_strands and not args.paired_loci:
        raise Exception('--reverse-negative-strands has no effect unless --paired-loci is set (maybe need to run bin/split-loci.py separately?)')

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' % (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region, intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception('n-max-per-region should be of the form \'x:y:z\', but I got ' + str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(args.write_additional_cluster_annotations) != 2:
        raise Exception('--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s' % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    args.cluster_indices = utils.get_arg_list(args.cluster_indices, intify_with_ranges=True)

    args.allowed_cdr3_lengths = utils.get_arg_list(args.allowed_cdr3_lengths, intify=True)

    args.region_end_exclusions = {r : [args.region_end_exclusion_length if ('%s_%s' % (r, e)) in utils.real_erosions else 0 for e in ['5p', '3p']] for r in utils.regions}
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.typical_genes_per_region_per_subject = utils.get_arg_list(args.typical_genes_per_region_per_subject, intify=True)
    if len(args.typical_genes_per_region_per_subject) != len(utils.regions):
        raise Exception('wrong length for --typical-genes-per-region-per-subject, has to be three')
    tmpfrac, ntmp = args.min_allele_prevalence_fraction, args.typical_genes_per_region_per_subject
    args.min_allele_prevalence_fractions = {r : tmpfrac * ntmp[utils.regions.index('v')] / ntmp[utils.regions.index(r)] for r in utils.regions}
    delattr(args, 'min_allele_prevalence_fraction')  # delete the non-plural version
    delattr(args, 'typical_genes_per_region_per_subject')  # and we don't need this any more either

    args.annotation_clustering_thresholds = utils.get_arg_list(args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds, floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:
            lo, hi = [int(cluster_size) for cluster_size in args.small_clusters_to_ignore.split('-')]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(args.small_clusters_to_ignore, intify=True)
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip()  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' % (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception('can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [args.seed_unique_id] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    if args.print_git_commit or args.action == 'version':
        utils.get_version_info(debug=True)
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.collapse_duplicate_sequences and not args.is_data:
        print '  %s collapsing duplicates on simulation, which is often not a good idea since it makes keeping track of performance harder (e.g. purity/completeness of partitions is harder to calculate)' % utils.color('red', 'warning')

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception('can only pass true clonal families to multi-hmm together on simulation and with --is-simu set')
        if args.n_simultaneous_seqs is not None:
            raise Exception('can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs')
        if args.all_seqs_simultaneous:
            raise Exception('can\'t specify both --all-seqs-simultaneous and --simultaneous-true-clonal-seqs')
        if args.action == 'partition':
            raise Exception('can\'t set --simultaneous-true-clonal-seqs when partitioning')
    if args.n_simultaneous_seqs is not None and args.all_seqs_simultaneous:
        raise Exception('doesn\'t make sense to set both --n-simultaneous-seqs and --all-seqs-simultaneous.')

    if args.no_indels:
        print 'forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    if args.indel_frequency > 0.:
        if args.indel_frequency < 0. or args.indel_frequency > 1.:
            raise Exception('--indel-frequency must be in [0., 1.] (got %f)' % args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(args.n_indels_per_indeld_seq, intify=True)
    if args.indel_location not in [None, 'v', 'cdr3']:
        if int(args.indel_location) in range(500):
            args.indel_location = int(args.indel_location)
            if any(n > 1 for n in args.n_indels_per_indeld_seq):
                print '  note: removing entries from --n-indels-per-indeld-seq (%s), since --indel-location was set to a single position.' % [n for n in args.n_indels_per_indeld_seq if n > 1]
                args.n_indels_per_indeld_seq = [n for n in args.n_indels_per_indeld_seq if n <= 1]
        else:
            raise Exception('--indel-location \'%s\' neither one of None, \'v\' or \'cdr3\', nor an integer less than 500' % args.indel_location)

    if args.locus is not None and 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        args.workdir = get_workdir(args.batch_system)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (utils.color('red', 'warning'), args.workdir)

    if args.outfname is not None and not args.presto_output and not args.airr_output and not args.generate_trees:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' % utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print '  %s --outfname uses deprecated file format %s. This will still mostly work ok, but the new default .yaml format doesn\'t have to do all the string conversions by hand (so is less buggy), and includes annotations, partitions, and germline info in the same file (so you don\'t get crashes or inconsistent results if you don\'t keep track of what germline info goes with what output file).' % (utils.color('yellow', 'note:'), utils.getsuffix(args.outfname))
        if args.action in ['view-annotations', 'view-partitions'] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception('have to use \'view-output\' action to view .yaml output files')

    if args.presto_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --presto-output is set')
        if args.action == 'annotate' and utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv for annotation with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(args.outfname) not in ['.fa', '.fasta']:
            raise Exception('--outfname suffix has to be .fa or .fasta for partitioning with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            assert args.locus is not None
            args.aligned_germline_fname = '%s/%s/imgt-alignments/%s.fa' % (args.default_initial_germline_dir, args.species, args.locus)
        if not os.path.exists(args.aligned_germline_fname):
            raise Exception('--aligned-germline-fname %s doesn\'t exist, but we need it in order to write presto output' % args.aligned_germline_fname)
    if args.airr_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --airr-output is set')
        if utils.getsuffix(args.outfname) == '.tsv':
            print '  note: writing only airr .tsv to %s' % args.outfname
        elif utils.getsuffix(args.outfname) in ['.yaml', '.csv']:
            print '  note: writing both partis %s to %s and airr .tsv to %s' % (utils.getsuffix(args.outfname), args.outfname, utils.replace_suffix(args.outfname, '.tsv'))
        else:
            raise Exception('--outfname suffix has to be either .tsv or .yaml if --airr-output is set (got %s)' % utils.getsuffix(args.outfname))
    if args.airr_input:
        args.seq_column = 'sequence'
        args.name_column = 'sequence_id'

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(
            args.outfname
    ) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix(
            '-cluster-annotations', args.outfname)

    if args.calculate_alternative_annotations and args.outfname is None and args.paired_outdir is None:
        raise Exception(
            'have to specify --outfname in order to calculate alternative annotations'
        )
    if args.subcluster_annotation_size == 'None':  # i want it turned on by default, but also to be able to turn it off on the command line
        args.subcluster_annotation_size = None
    else:
        args.subcluster_annotation_size = int(
            args.subcluster_annotation_size
        )  # can't set it in add_argument(), sigh
    if args.subcluster_annotation_size is not None:
        if args.calculate_alternative_annotations or args.write_additional_cluster_annotations is not None:
            raise Exception(
                'can\'t set either --calculate-alternative-annotations or --write-additional-cluster-annotations if --subcluster-annotation-size is also set (you get duplicate annotations, which confuses and crashes things, plus it doesn\'t really make sense -- alternative annotations should be calculated on the subcluster annotations now)'
            )
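    # The string-sentinel pattern just above (a string default so the literal word
    # None can be passed on the command line to disable the feature) looks roughly
    # like this with plain argparse; the default value here is illustrative, not
    # the real partis default:
    #   parser.add_argument('--subcluster-annotation-size', default='3')  # str, so 'None' is passable
    #   if args.subcluster_annotation_size == 'None':
    #       args.subcluster_annotation_size = None
    #   else:
    #       args.subcluster_annotation_size = int(args.subcluster_annotation_size)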
    if args.action == 'view-alternative-annotations' and args.persistent_cachefname is None:  # handle existing old-style output
        assert args.outfname is not None
        if os.path.exists(utils.getprefix(args.outfname) + '-hmm-cache.csv'):
            args.persistent_cachefname = utils.getprefix(
                args.outfname
            ) + '-hmm-cache.csv'  # written by bcrham, so has to be csv, not yaml

    if args.min_largest_cluster_size is not None and args.n_final_clusters is not None:
        print '  note: both --min-largest-cluster-size and --n-final-clusters are set, which means we\'ll stop clustering when *either* of their criteria are satisfied (not both)'  # maybe it should be both, but whatever

    if not args.paired_loci and (args.action == 'get-selection-metrics'
                                 or args.get_selection_metrics):
        if args.outfname is None and args.selection_metric_fname is None:
            print '    %s calculating selection metrics, but neither --outfname nor --selection-metric-fname were set, which means nothing will be written to disk' % utils.color(
                'yellow', 'warning')
        elif args.selection_metric_fname is None and args.action == 'get-selection-metrics' and not args.add_selection_metrics_to_outfname:
            args.selection_metric_fname = utils.insert_before_suffix(
                '-selection-metrics', args.outfname)

    if args.plot_annotation_performance:
        if args.plotdir is None and args.print_n_worst_annotations is None:
            raise Exception(
                'doesn\'t make sense to set --plot-annotation-performance but not either of --plotdir or --print-n-worst-annotations (we\'ll spend all the cycles counting things up but then they\'ll just disappear from memory without being recorded).'
            )
        if not args.is_simu:
            raise Exception(
                'can\'t plot performance unless --is-simu is set (and this is simulation)'
            )
    if args.print_n_worst_annotations is not None and not args.plot_annotation_performance:
        raise Exception(
            '--plot-annotation-performance must be set if you\'re setting --print-n-worst-annotations'
        )
    if not args.paired_loci and (
            args.action == 'plot-partitions' or args.action == 'annotate'
            and args.plot_partitions) and args.plotdir is None:
        raise Exception('--plotdir must be specified if plotting partitions')
    if args.action == 'annotate' and args.plot_partitions and args.input_partition_fname is None:  # could set this up to use e.g. --simultaneous-true-clonal-seqs as well, but it can't atm
        print '  %s running annotate with --plot-partitions, but --input-partition-fname is not set, which likely means the partitions will be trivial/singleton partitions' % utils.color(
            'yellow', 'warning')

    if args.make_per_gene_per_base_plots and not args.make_per_gene_plots:  # the former doesn't do anything unless the latter is turned on
        args.make_per_gene_plots = True

    if args.action == 'simulate':
        if args.n_trees is None and not args.paired_loci:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.n_procs > args.n_sim_events:
            print '  note: reducing --n-procs to %d (was %d) so it isn\'t bigger than --n-sim-events' % (
                args.n_sim_events, args.n_procs)
            args.n_procs = args.n_sim_events
        if args.n_max_queries != -1:
            print '  note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangement events)'

        if args.outfname is None and args.paired_outdir is None:
            print '  note: no %s specified, so nothing will be written to disk' % (
                '--paired-outdir' if args.paired_loci else '--outfname')
            args.outfname = get_dummy_outfname(
                args.workdir
            )  # hacky, but otherwise I have to rewrite the whole run_simulation() in bin/partis to handle None type outfname

        if args.simulate_from_scratch:
            args.rearrange_from_scratch = True
            args.mutate_from_scratch = True
        if args.rearrange_from_scratch and not args.force_dont_generate_germline_set:  # i would probably just default to always generating germline sets when rearranging from scratch, but bin/test-germline-inference.py (and any other case where you want to dramatically restrict the germline set) really argue for a way to force just using the genes in the germline dir
            args.generate_germline_set = True
        if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
            assert args.mutate_from_scratch
        if args.mutate_from_scratch and not args.no_per_base_mutation:
            print '  note: setting --no-per-base-mutation since --mutate-from-scratch was set'
            args.no_per_base_mutation = True

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception(
                    'can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)'
                )
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception(
                    'can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set'
                )
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception(
                'doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir'
            )
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception(
                'doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir'
            )
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception(
                'have to either set --rearrange-from-scratch or --reco-parameter-dir (or --simulate-from-scratch)'
            )
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception(
                'have to either set --mutate-from-scratch or --shm-parameter-dir (or --simulate-from-scratch)'
            )
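        # worked example of the resolution above (illustrative combinations only):
        #   --parameter-dir P, neither sub-dir set             -> reco dir = P, shm dir = P, parameter_dir = None
        #   --parameter-dir P plus --reco-parameter-dir R      -> exception (can't mix them)
        #   --rearrange-from-scratch, --shm-parameter-dir S    -> reco from scratch, shm dir = S
        #   no scratch flag and no dir for a category          -> exception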

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception(
                'can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)'
            )

        if args.generate_germline_set:
            args.snp_positions = None  # if you want to control the exact positions, you have to use bin/test-germline-inference.py
            args.indel_positions = None
            process_gls_gen_args(args)

        if args.generate_trees:
            assert args.n_procs == 1  # not set up to handle output, and also no need

        if args.treefname is not None:
            raise Exception(
                '--treefname was set for simulation action (probably meant to use --input-simulation-treefname)'
            )

    if args.parameter_dir is not None and not args.paired_loci:  # if we're splitting loci, this isn't the normal parameter dir, it's a parent of that
        args.parameter_dir = args.parameter_dir.rstrip('/')
        if os.path.exists(args.parameter_dir):
            pdirs = [
                d for d in os.listdir(args.parameter_dir)
                if os.path.isdir(os.path.join(args.parameter_dir, d))
            ]
            if len(pdirs) > 0 and len(
                    set(pdirs) & set(utils.parameter_type_choices)) == 0:
                raise Exception(
                    'couldn\'t find any expected parameter types (i.e. subdirs) in --parameter-dir \'%s\'. Allowed types: %s, found: %s. Maybe you added the parameter type to the parameter dir path?'
                    % (args.parameter_dir, ' '.join(
                        utils.parameter_type_choices), ' '.join(
                            os.listdir(args.parameter_dir))))

    if os.path.exists(args.default_initial_germline_dir + '/' +
                      args.species):  # ick, that is hacky
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print '  non-human species \'%s\', turning on allele clustering' % args.species
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception(
                '--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d'
                % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.action not in actions_not_requiring_input and [
            args.infname, args.paired_indir
    ].count(None) == 2:
        if args.paired_loci:
            raise Exception(
                '--infname or --paired-indir is required for action \'%s\' with --paired-loci'
                % args.action)
        else:
            raise Exception('--infname is required for action \'%s\'' %
                            args.action)

    if args.action == 'get-linearham-info':
        if args.linearham_info_fname is None:  # for some reason setting required=True isn't working
            raise Exception('have to specify --linearham-info-fname')
        if args.sw_cachefname is None and args.parameter_dir is None:
            raise Exception(
                'have to specify --sw-cachefname or --parameter-dir, since we need sw info to calculate linearham inputs'
            )
        if args.extra_annotation_columns is None or 'linearham-info' not in args.extra_annotation_columns:
            args.extra_annotation_columns = utils.add_lists(
                args.extra_annotation_columns, ['linearham-info'])

    if args.ete_path == 'None':  # it's nice to be able to unset this from the command line (so we don't make the slow tree plots)
        args.ete_path = None
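
The filename checks above lean on a handful of small path helpers from utils (getsuffix, getprefix, replace_suffix, insert_before_suffix). Their implementations aren't part of this listing, so the sketch below is only a plausible reconstruction inferred from how they're called; treat the exact behavior as an assumption rather than the canonical partis code.

import os


def getsuffix(fname):
    # assumed: return the final extension including the dot, e.g. '.yaml'
    return os.path.splitext(fname)[1]


def getprefix(fname):
    # assumed: return everything before the final extension
    return os.path.splitext(fname)[0]


def replace_suffix(fname, new_suffix):
    # assumed: swap the final extension, e.g. out.yaml --> out.tsv
    return getprefix(fname) + new_suffix


def insert_before_suffix(insert_str, fname):
    # assumed: splice <insert_str> in front of the extension
    return getprefix(fname) + insert_str + getsuffix(fname)


# e.g. insert_before_suffix('-selection-metrics', 'out.yaml') --> 'out-selection-metrics.yaml'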
Esempio n. 58
    def read_insertion_info(self, approved_genes):
        iprobs, icontentprobs = {}, {}
        genes_used = set()
        for insertion in self.insertions:
            iprobs[insertion] = {}
            if approved_genes[0] == glutils.dummy_d_genes[self.args.locus]:  # light-chain loci have no real D segment, so a zero-length dummy D gene stands in for it
                iprobs[insertion][0] = 1.  # always insert zero bases
                icontentprobs[insertion] = {n: 0.25 for n in utils.nukes}  # base content is irrelevant for a zero-length insertion, so use a uniform distribution
                continue
            deps = utils.column_dependencies[insertion + '_insertion']
            with open(
                    self.indir + '/' + utils.get_parameter_fname(
                        column=insertion + '_insertion', deps=deps),
                    'r') as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this insertion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[
                            self.region +
                            '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue
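                    # illustrative row only (not from a real parameter file): for a vd
                    # insertion whose counts depend on the d gene, a parsed line might
                    # look like {'d_gene' : 'IGHD3-10*01', 'vd_insertion' : '4', 'count' : '127'}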

                    # then add in this insertion's counts
                    n_inserted = int(line[insertion + '_insertion'])
                    if n_inserted not in iprobs[insertion]:
                        iprobs[insertion][n_inserted] = 0.0
                    iprobs[insertion][n_inserted] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            if len(iprobs[insertion]) == 0:
                raise Exception(
                    'didn\'t read any %s insertion probs from %s' %
                    (insertion, self.indir + '/' + utils.get_parameter_fname(
                        column=insertion + '_insertion', deps=deps)))

            # print '   interpolate insertions'
            interpolate_bins(
                iprobs[insertion], self.n_max_to_interpolate, bin_eps=self.eps
            )  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

            if 0 not in iprobs[insertion] or len(
                    iprobs[insertion]
            ) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
                if self.debug:
                    print '    WARNING adding pseudocount to 1-bin in insertion probs'
                iprobs[insertion][0] = 1
                iprobs[insertion][1] = 1
                if self.debug:
                    print '      ', iprobs[insertion]

            assert 0 in iprobs[insertion] and len(
                iprobs[insertion]
            ) >= 2  # guaranteed by the pseudocount addition just above, if it was needed

            # and finally, normalize
            total = 0.0
            for _, val in iprobs[insertion].iteritems():
                total += val
            test_total = 0.0
            for n_inserted in iprobs[insertion]:
                iprobs[insertion][n_inserted] /= total
                test_total += iprobs[insertion][n_inserted]
            assert utils.is_normed(test_total)

            if 0 not in iprobs[insertion] or iprobs[insertion][0] == 1.0:
                raise Exception(
                    'cannot have all or none of the probability mass in the zero bin: %s'
                    % iprobs[insertion])

            icontentprobs[insertion] = self.read_insertion_content(
                insertion)  # also read the base content of the insertions

        if len(genes_used
               ) > 1:  # if length is 1, we will have just used the actual gene
            if self.debug:
                print '    insertions used:', ' '.join(genes_used)

        return iprobs, icontentprobs
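
The tail of read_insertion_info() enforces a simple invariant: after interpolation and the optional pseudocount, the insertion-length probabilities must sum to one, and the zero-length bin must hold some, but not all, of the mass. Below is a minimal standalone sketch of that normalize-and-check pattern; is_normed() is written here as a plain epsilon comparison, and both function names are invented for illustration, so don't read this as the actual utils implementation.

def is_normed(total, eps=1e-10):
    # assumed stand-in for utils.is_normed(): true if <total> is within <eps> of 1
    return abs(total - 1.0) < eps


def normalize_insertion_counts(counts, eps=1e-10):
    # <counts>: {n_inserted : count} --> returns {n_inserted : prob} summing to 1
    if 0 not in counts or len(counts) < 2:  # degenerate input: same two-bin pseudocount as read_insertion_info() above
        counts[0] = 1
        counts[1] = 1
    total = float(sum(counts.values()))
    probs = {n: c / total for n, c in counts.items()}
    assert is_normed(sum(probs.values()), eps=eps)
    return probs


# e.g. normalize_insertion_counts({0: 10, 1: 5, 2: 5}) --> {0: 0.5, 1: 0.25, 2: 0.25}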