Example #1
0
 def clean(self):
     """ Call self.mutefreqer.clean(), then delete the parameter file for every column in <self.counts>. """
     self.mutefreqer.clean()
     for colname in self.counts:
         # the 'all' file is looked up by column name alone, every other file by the column plus its dependencies
         if colname == 'all':
             fname_kwargs = {'column': 'all'}
         else:
             fname_kwargs = {'column_and_deps': [colname] + utils.column_dependencies[colname]}
         os.remove(self.base_outdir + '/' + utils.get_parameter_fname(**fname_kwargs))
 def clean(self):
     """ Remove the parameter files: first whatever self.mutefreqer.clean() removes, then one file per column in <self.counts>. """
     self.mutefreqer.clean()
     for column in self.counts:
         if column != 'all':
             # ordinary columns are stored under a name built from the column and the columns it depends on
             deps_index = [column] + utils.column_dependencies[column]
             os.remove(self.base_outdir + '/' + utils.get_parameter_fname(column_and_deps=deps_index))
             continue
         os.remove(self.base_outdir + '/' + utils.get_parameter_fname(column='all'))
    def write(self, base_outdir):
        print '  writing parameters'
        start = time.time()

        utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))
        mute_start = time.time()
        self.mutefreqer.write(
            base_outdir,
            mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv'
        )  # REGION is replace by each region in the three output files)
        print '      mut freq write time: %.3f' % (time.time() - mute_start)
        # print ' %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached)
        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = utils.index_columns
                outfname = base_outdir + '/' + utils.get_parameter_fname(
                    column='all')
            elif '_content' in column:
                index = [
                    column,
                ]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [
                    column,
                ] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(
                    column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '    parameter write time: %.3f' % (time.time() - start)
Example #4
0
    def read_vdj_version_freqs(self):
        """
        Read the frequencies at which various VDJ combinations appeared in data.

        Returns None if <self.args.rearrange_from_scratch> is set. Otherwise returns a dict that maps
        self.freqtable_index(line) to the normalized count, for each line of the 'all' parameter file
        whose <region>_gene is in self.glfo['seqs'][region] for every region.

        Raises an Exception if no line of the file passes that gene check.
        """
        if self.args.rearrange_from_scratch:
            return None

        # NOTE(review): 'r' is passed as the second positional argument of get_parameter_fname() rather than as a mode to open() -- left as it was, but confirm that's intended
        fname = self.reco_parameter_dir + '/' + utils.get_parameter_fname('all', 'r')  # kept in a variable so the error message below can refer to it

        version_freq_table = {}
        total = 0.0
        with open(fname) as infile:
            for line in csv.DictReader(infile):  # NOTE do *not* assume the file is sorted
                # skip any combination that uses a gene that isn't in our germline info
                if any(line[region + '_gene'] not in self.glfo['seqs'][region] for region in utils.regions):
                    continue
                count = float(line['count'])
                total += count
                index = self.freqtable_index(line)
                assert index not in version_freq_table
                version_freq_table[index] = count

        if len(version_freq_table) == 0:
            raise Exception('didn\'t find any gene combinations in %s' % fname)

        # then normalize
        test_total = 0.0
        for index in version_freq_table:
            version_freq_table[index] /= total
            test_total += version_freq_table[index]
        assert utils.is_normed(test_total, this_eps=1e-8)
        assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
        return version_freq_table
    def read_insertion_info(self, this_gene, approved_genes=None):
        if approved_genes == None:  # if we aren't explicitly passed a list of genes to use, we just use the gene for which we're actually writing the hmm
            approved_genes = [this_gene,]

        genes_used = set()
        for insertion in self.insertions:
            self.insertion_probs[insertion] = {}
            deps = utils.column_dependencies[insertion + '_insertion']
            with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue

                    # then add in this insertion's counts
                    n_inserted = 0
                    n_inserted = int(line[insertion + '_insertion'])
                    if n_inserted not in self.insertion_probs[insertion]:
                        self.insertion_probs[insertion][n_inserted] = 0.0
                    self.insertion_probs[insertion][n_inserted] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            assert len(self.insertion_probs[insertion]) > 0

            # print '   interpolate insertions'
            interpolate_bins(self.insertion_probs[insertion], self.n_max_to_interpolate, bin_eps=self.eps)  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

            if 0 not in self.insertion_probs[insertion] or len(self.insertion_probs[insertion]) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
                if self.args.debug:
                    print '    WARNING adding pseudocount to 1-bin in insertion probs'
                self.insertion_probs[insertion][0] = 1
                self.insertion_probs[insertion][1] = 1
                if self.args.debug:
                    print '      ', self.insertion_probs[insertion]

            assert 0 in self.insertion_probs[insertion] and len(self.insertion_probs[insertion]) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

            # and finally, normalize
            total = 0.0
            for _, val in self.insertion_probs[insertion].iteritems():
                total += val
            test_total = 0.0
            for n_inserted in self.insertion_probs[insertion]:
                self.insertion_probs[insertion][n_inserted] /= total
                test_total += self.insertion_probs[insertion][n_inserted]
            assert utils.is_normed(test_total)

            if 0 not in self.insertion_probs[insertion] or self.insertion_probs[insertion][0] == 1.0:
                print 'ERROR cannot have all or none of the probability mass in the zero bin:', self.insertion_probs[insertion]
                assert False

            # self.insertion_content_probs = {}
            self.read_insertion_content(insertion)  # also read the base content of the insertions

        if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
            if self.args.debug:
                print '    insertions used:', ' '.join(genes_used)
Example #6
0
    def read_erosion_info(self, this_gene, approved_genes=None):
        # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
        if approved_genes is None:
            approved_genes = [this_gene, ]
        eprobs = {}
        genes_used = set()
        for erosion in utils.real_erosions + utils.effective_erosions:
            if erosion[0] != self.region:
                continue
            eprobs[erosion] = {}
            if this_gene == glutils.dummy_d_genes[self.args.chain]:
                eprobs[erosion][0] = 1.  # always erode zero bases
                continue
            deps = utils.column_dependencies[erosion + '_del']
            with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue
                    # then skip nonsense erosions that're too long for this gene, but were ok for another
                    if int(line[erosion + '_del']) >= len(self.germline_seq):
                        continue

                    # then add in this erosion's counts
                    n_eroded = int(line[erosion + '_del'])
                    if n_eroded not in eprobs[erosion]:
                        eprobs[erosion][n_eroded] = 0.0
                    eprobs[erosion][n_eroded] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            if len(eprobs[erosion]) == 0:
                raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))

            # do some smoothingy things NOTE that we normalize *after* interpolating
            if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
                n_max = self.n_max_to_interpolate
            else:  # for fake erosions, always interpolate
                n_max = -1
            # print '   interpolate erosions'
            interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
            self.add_pseudocounts(eprobs[erosion])

            # and finally, normalize
            total = 0.0
            for _, val in eprobs[erosion].iteritems():
                total += val

            test_total = 0.0
            for n_eroded in eprobs[erosion]:
                eprobs[erosion][n_eroded] /= total
                test_total += eprobs[erosion][n_eroded]
            assert utils.is_normed(test_total)

        if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
            print '    used erosion info from:', ' '.join(genes_used)

        return eprobs
Example #7
0
    def read_vdj_version_freqs(self):
        """
        Read the frequencies at which various VDJ combinations appeared in data.

        Returns None if <self.args.rearrange_from_scratch> is set. Otherwise returns a dict that maps
        self.freqtable_index(line) to the normalized count, for each line of the 'all' parameter file
        whose <region>_gene is in self.glfo['seqs'][region] for every region.

        Raises an Exception if no line of the file passes that gene check.
        """
        if self.args.rearrange_from_scratch:
            return None

        fname = self.parameter_dir + '/' + utils.get_parameter_fname('all')  # kept in a variable so the error message below can refer to it

        version_freq_table = {}
        total = 0.0
        with opener('r')(fname) as infile:
            for line in csv.DictReader(infile):  # NOTE do *not* assume the file is sorted
                # skip any combination that uses a gene that isn't in our germline info
                if any(line[region + '_gene'] not in self.glfo['seqs'][region] for region in utils.regions):
                    continue
                count = float(line['count'])
                total += count
                index = self.freqtable_index(line)
                assert index not in version_freq_table
                version_freq_table[index] = count

        if len(version_freq_table) == 0:
            raise Exception('didn\'t find any gene combinations in %s' % fname)

        # then normalize
        test_total = 0.0
        for index in version_freq_table:
            version_freq_table[index] /= total
            test_total += version_freq_table[index]
        assert utils.is_normed(test_total, this_eps=1e-8)
        assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
        return version_freq_table
Example #8
0
    def read_insertion_info(self, this_gene, approved_genes=None):
        if approved_genes == None:  # if we aren't explicitly passed a list of genes to use, we just use the gene for which we're actually writing the hmm
            approved_genes = [this_gene,]

        genes_used = set()
        for insertion in self.insertions:
            self.insertion_probs[insertion] = {}
            deps = utils.column_dependencies[insertion + '_insertion']
            with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue

                    # then add in this insertion's counts
                    n_inserted = 0
                    n_inserted = int(line[insertion + '_insertion'])
                    if n_inserted not in self.insertion_probs[insertion]:
                        self.insertion_probs[insertion][n_inserted] = 0.0
                    self.insertion_probs[insertion][n_inserted] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            assert len(self.insertion_probs[insertion]) > 0

            # print '   interpolate insertions'
            interpolate_bins(self.insertion_probs[insertion], self.n_max_to_interpolate, bin_eps=self.eps)  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

            if 0 not in self.insertion_probs[insertion] or len(self.insertion_probs[insertion]) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
                if self.args.debug:
                    print '    WARNING adding pseudocount to 1-bin in insertion probs'
                self.insertion_probs[insertion][0] = 1
                self.insertion_probs[insertion][1] = 1
                if self.args.debug:
                    print '      ', self.insertion_probs[insertion]

            assert 0 in self.insertion_probs[insertion] and len(self.insertion_probs[insertion]) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

            # and finally, normalize
            total = 0.0
            for _, val in self.insertion_probs[insertion].iteritems():
                total += val
            test_total = 0.0
            for n_inserted in self.insertion_probs[insertion]:
                self.insertion_probs[insertion][n_inserted] /= total
                test_total += self.insertion_probs[insertion][n_inserted]
            assert utils.is_normed(test_total)

            if 0 not in self.insertion_probs[insertion] or self.insertion_probs[insertion][0] == 1.0:
                print 'ERROR cannot have all or none of the probability mass in the zero bin:', self.insertion_probs[insertion]
                assert False

            # self.insertion_content_probs = {}
            self.read_insertion_content(insertion)  # also read the base content of the insertions

        if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
            if self.args.debug:
                print '    insertions used:', ' '.join(genes_used)
Example #9
0
    def read_erosion_info(self, this_gene, approved_genes=None):
        # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
        if approved_genes is None:
            approved_genes = [this_gene, ]
        eprobs = {}
        genes_used = set()
        for erosion in utils.all_erosions:
            if erosion[0] != self.region:
                continue
            eprobs[erosion] = {}
            if this_gene == glutils.dummy_d_genes[self.args.locus]:
                eprobs[erosion][0] = 1.  # always erode zero bases
                continue
            deps = utils.column_dependencies[erosion + '_del']
            with open(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps), 'r') as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue
                    # then skip nonsense erosions that're too long for this gene, but were ok for another
                    if int(line[erosion + '_del']) >= len(self.germline_seq):
                        continue

                    # then add in this erosion's counts
                    n_eroded = int(line[erosion + '_del'])
                    if n_eroded not in eprobs[erosion]:
                        eprobs[erosion][n_eroded] = 0.0
                    eprobs[erosion][n_eroded] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            if len(eprobs[erosion]) == 0:
                raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))

            # do some smoothingy things NOTE that we normalize *after* interpolating
            if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
                n_max = self.n_max_to_interpolate
            else:  # for fake erosions, always interpolate
                n_max = -1
            # print '   interpolate erosions'
            interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
            self.add_pseudocounts(eprobs[erosion])

            # and finally, normalize
            total = 0.0
            for _, val in eprobs[erosion].iteritems():
                total += val

            test_total = 0.0
            for n_eroded in eprobs[erosion]:
                eprobs[erosion][n_eroded] /= total
                test_total += eprobs[erosion][n_eroded]
            assert utils.is_normed(test_total)

        if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
            print '    used erosion info from:', ' '.join(genes_used)

        return eprobs
Example #10
0
    def read_erosion_info(self, this_gene, approved_genes=None):
        # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
        if approved_genes == None:
            approved_genes = [this_gene]
        genes_used = set()
        for erosion in utils.real_erosions + utils.effective_erosions:
            if erosion[0] != self.region:
                continue
            self.erosion_probs[erosion] = {}
            deps = utils.column_dependencies[erosion + "_del"]
            with opener("r")(
                self.indir + "/" + utils.get_parameter_fname(column=erosion + "_del", deps=deps)
            ) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if (
                        self.region + "_gene" in line and line[self.region + "_gene"] not in approved_genes
                    ):  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue
                    # then skip nonsense erosions that're too long for this gene, but were ok for another
                    if int(line[erosion + "_del"]) >= len(self.germline_seq):
                        continue

                    # then add in this erosion's counts
                    n_eroded = int(line[erosion + "_del"])
                    if n_eroded not in self.erosion_probs[erosion]:
                        self.erosion_probs[erosion][n_eroded] = 0.0
                    self.erosion_probs[erosion][n_eroded] += float(line["count"])

                    if self.region + "_gene" in line:
                        genes_used.add(line[self.region + "_gene"])

            assert len(self.erosion_probs[erosion]) > 0

            # do some smoothingy things NOTE that we normalize *after* interpolating
            if (
                erosion in utils.real_erosions
            ):  # for real erosions, don't interpolate if we lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
                n_max = self.n_max_to_interpolate
            else:  # for fake erosions, always interpolate
                n_max = -1
            # print '   interpolate erosions'
            interpolate_bins(self.erosion_probs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
            self.add_pseudocounts(self.erosion_probs[erosion])

            # and finally, normalize
            total = 0.0
            for _, val in self.erosion_probs[erosion].iteritems():
                total += val

            test_total = 0.0
            for n_eroded in self.erosion_probs[erosion]:
                self.erosion_probs[erosion][n_eroded] /= total
                test_total += self.erosion_probs[erosion][n_eroded]
            assert utils.is_normed(test_total)

        if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
            if self.args.debug:
                print "    erosions used:", " ".join(genes_used)
Example #11
0
    def write(self, base_outdir):
        print "    writing parameters",
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(
            base_outdir, subdirs=("hmms", "mute-freqs", "germline-sets"), wildlings=("*.csv", "*.yaml", "*.fasta")
        )  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(
            base_outdir + "/mute-freqs", mean_freq_outfname=base_outdir + "/REGION-mean-mute-freqs.csv"
        )  # REGION is replace by each region in the three output files)
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + "_gene"].keys()]
        glutils.write_glfo(base_outdir + "/" + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)

        for column in self.counts:
            index = None
            outfname = None
            if column == "all":
                index = tuple(list(utils.index_columns) + ["cdr3_length"])
                outfname = base_outdir + "/" + utils.get_parameter_fname(column="all")
            elif "_content" in column:
                index = [column]
                outfname = base_outdir + "/" + column + ".csv"
            else:
                index = [column] + utils.column_dependencies[column]
                outfname = base_outdir + "/" + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener("w")(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append("count")
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line["count"] = count
                    out_data.writerow(line)

        print "(%.1f sec)" % (time.time() - start)
Example #12
0
    def __init__(self, args, seed, sublabel=None):
        """
        Set up the work dir and output file, then read the germline set, the parameter files, the gtr parameters, and a freshly generated set of trees.

        args: command line arguments (this reads workdir, outfname, parameter_dir, datadir, gtrfname, and no_clean from it)
        seed: passed on to the tree generator
        sublabel: if set, it's appended to the work dir name and the output file is put inside that work dir

        Raises an Exception if <args.parameter_dir> doesn't exist.
        """
        self.args = args

        if sublabel is None:
            self.workdir = self.args.workdir + '/recombinator'
            self.outfname = self.args.outfname
        else:  # need a separate workdir for each subprocess
            self.workdir = self.args.workdir + '/recombinator-' + sublabel
            self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)

        utils.prep_dir(self.workdir)
        if not os.path.exists(self.args.parameter_dir):
            raise Exception('parameter dir ' + self.args.parameter_dir + ' d.n.e')

        # parameters that control recombination, erosion, and whatnot

        self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
        self.mute_models = {}  # mute_models[region][model][parameter name] = value, filled from the gtr file below
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        # first read info that doesn't depend on which person we're looking at
        self.glfo = utils.read_germline_set(self.args.datadir)

        # then read stuff that's specific to each person
        self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
        self.allowed_genes = self.get_allowed_genes(self.args.parameter_dir)  # only really used if <self.args.uniform_vj_choice_probs> is set, but it also checks the sensibility of <self.args.only_genes>
        self.insertion_content_probs = None
        self.read_insertion_content()

        # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
            for line in csv.DictReader(gtrfile):
                # the parameter column is split on '.': the region is the fourth character of the first field, then come the model and the parameter name
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()
                assert region in ('v', 'd', 'j')
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']

        # generate the trees, read them back in, and (unless told not to) remove the tree file again
        treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        treegen.generate_trees(seed, self.treefname)
        with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        if not self.args.no_clean:
            os.remove(self.treefname)

        # start with a clean output file: remove any old one, or else make sure its directory exists
        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        else:
            outdir = os.path.dirname(os.path.abspath(self.outfname))
            if not os.path.exists(outdir):
                os.makedirs(outdir)
Example #13
0
 def get_parameter_dir_genes(self, parameter_dir):
     """ Return the set of all values in the <region>_gene column of each region's gene parameter file in <parameter_dir>. """
     genes_found = set()
     for region in utils.regions:
         gene_col = region + '_gene'
         fname = utils.get_parameter_fname(column_and_deps=[gene_col] + utils.column_dependencies[gene_col])
         with open(parameter_dir + '/' + fname) as infile:
             genes_found.update(row[gene_col] for row in csv.DictReader(infile))
     return genes_found
Example #14
0
    def write(self, base_outdir, my_datadir=None):
        print '    writing parameters',
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files)
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=True)

        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length', ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column,]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column,] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '(%.1f sec)' % (time.time()-start)
Example #15
0
    def write(self, base_outdir):
        print '    writing parameters',
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files)
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)

        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length', ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column,]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column,] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with open(outfname, 'w') as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '(%.1f sec)' % (time.time()-start)
Example #16
0
    def write(self, base_outdir):
        print '    writing parameters',
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))
        # mute_start = time.time()
        self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files) 
        # print '      mut freq write time: %.3f' % (time.time() - mute_start)
        # print ' %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached)
        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = utils.index_columns
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column,]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column,] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '(%.1f sec)' % (time.time()-start)
Example #17
0
 def get_allowed_genes(self, parameter_dir):
     """
     Return a dict from each region (skipping d) to the set of genes we're allowed to use.

     A gene is allowed if it appears in that region's gene count file in <parameter_dir> and, if
     --only-genes was set, if it is also among self.args.only_genes.
     Raises Exception if --only-genes asks for a gene that isn't in the parameter file.
     """
     allowed_genes = {}
     for region in utils.regions:
         if region == 'd':
             continue

         gene_column = region + '_gene'
         fname = parameter_dir + '/' + utils.get_parameter_fname(column=gene_column, deps=utils.column_dependencies[gene_column])
         with open(fname) as csvfile:
             genes_in_file = set(line[gene_column] for line in csv.DictReader(csvfile))
         allowed_genes[region] = genes_in_file

         if self.args.only_genes is None:
             continue

         # --only-genes was specified, so not only does the gene have to be in the parameter file, it also has to be among --only-genes
         regional_only_genes = set(g for g in self.args.only_genes if utils.get_region(g) == region)
         missing_genes = regional_only_genes - genes_in_file  # genes the command line asked for that aren't in the file
         if len(missing_genes) > 0:
             raise Exception("genes %s specified with --only-genes are not present in %s, so there's no information with which to simulate" % (' '.join(missing_genes), parameter_dir))
         allowed_genes[region] &= regional_only_genes

     return allowed_genes
    def __init__(self, args, seed, sublabel=None, total_length_from_right=-1):
        """
        Set up the work dir and output file, then read the germline and parameter info.

        args: options object; reads workdir, outfname, parameter_dir, datadir, naivety, gtrfname and no_clean from it
        seed: handed to the tree generator (only used if args.naivety is 'M')
        sublabel: if set, this instance gets its own workdir (and an output file inside it), named with the sublabel
        total_length_from_right: stored on the instance; measured from right edge of j, only write to file this much of the sequence

        Raises Exception if args.parameter_dir doesn't exist.
        """
        self.args = args

        if sublabel is None:
            self.workdir = self.args.workdir + '/recombinator'
            self.outfname = self.args.outfname
        else:  # need a separate workdir for each subprocess
            self.workdir = self.args.workdir + '/recombinator-' + sublabel
            self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)

        utils.prep_dir(self.workdir)
        if not os.path.exists(self.args.parameter_dir):
            raise Exception('ERROR ' + self.args.parameter_dir + ' d.n.e')

        # parameters that control recombination, erosion, and whatnot
        self.total_length_from_right = total_length_from_right  # measured from right edge of j, only write to file this much of the sequence (our read lengths are 130 by this def'n a.t.m.)

        self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        # first read info that doesn't depend on which person we're looking at
        self.all_seqs = utils.read_germlines(self.args.datadir)  # all the Vs, all the Ds...
        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0]: row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        # then read stuff that's specific to each person
        self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
        self.read_insertion_content()
        if self.args.naivety == 'M':  # read shm info if non-naive is requested
            # NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
            with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
                reader = csv.DictReader(gtrfile)
                for line in reader:
                    # the 'parameter' field is dot-separated: the fourth character of the first piece is the region, then the model, then the parameter name
                    parameters = line['parameter'].split('.')
                    region = parameters[0][3].lower()
                    assert region == 'v' or region == 'd' or region == 'j'
                    model = parameters[1].lower()
                    parameter_name = parameters[2]
                    assert model in self.mute_models[region]
                    self.mute_models[region][model][parameter_name] = line['value']
            treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
            self.treefname = self.workdir + '/trees.tre'
            treegen.generate_trees(seed, self.treefname)
            with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
                self.treeinfo = treefile.readlines()
            if not self.args.no_clean:
                os.remove(self.treefname)

        # start from a clean output file (and make sure its directory exists)
        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
Example #19
0
    def read_insertion_info(self, approved_genes):
        iprobs, icontentprobs = {}, {}
        genes_used = set()
        for insertion in self.insertions:
            iprobs[insertion] = {}
            if approved_genes[0] == glutils.dummy_d_genes[self.args.locus]:
                iprobs[insertion][0] = 1.  # always insert zero bases
                icontentprobs[insertion] = {n: 0.25 for n in utils.nukes}
                continue
            deps = utils.column_dependencies[insertion + '_insertion']
            with open(
                    self.indir + '/' + utils.get_parameter_fname(
                        column=insertion + '_insertion', deps=deps),
                    'r') as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                    if self.region + '_gene' in line and line[
                            self.region +
                            '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                        continue

                    # then add in this insertion's counts
                    n_inserted = 0
                    n_inserted = int(line[insertion + '_insertion'])
                    if n_inserted not in iprobs[insertion]:
                        iprobs[insertion][n_inserted] = 0.0
                    iprobs[insertion][n_inserted] += float(line['count'])

                    if self.region + '_gene' in line:
                        genes_used.add(line[self.region + '_gene'])

            if len(iprobs[insertion]) == 0:
                raise Exception(
                    'didn\'t read any %s insertion probs from %s' %
                    (insertion, self.indir + '/' + utils.get_parameter_fname(
                        column=insertion + '_insertion', deps=deps)))

            # print '   interpolate insertions'
            interpolate_bins(
                iprobs[insertion], self.n_max_to_interpolate, bin_eps=self.eps
            )  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

            if 0 not in iprobs[insertion] or len(
                    iprobs[insertion]
            ) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
                if self.debug:
                    print '    WARNING adding pseudocount to 1-bin in insertion probs'
                iprobs[insertion][0] = 1
                iprobs[insertion][1] = 1
                if self.debug:
                    print '      ', iprobs[insertion]

            assert 0 in iprobs[insertion] and len(
                iprobs[insertion]
            ) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

            # and finally, normalize
            total = 0.0
            for _, val in iprobs[insertion].iteritems():
                total += val
            test_total = 0.0
            for n_inserted in iprobs[insertion]:
                iprobs[insertion][n_inserted] /= total
                test_total += iprobs[insertion][n_inserted]
            assert utils.is_normed(test_total)

            if 0 not in iprobs[insertion] or iprobs[insertion][0] == 1.0:
                print 'ERROR cannot have all or none of the probability mass in the zero bin:', iprobs[
                    insertion]
                assert False

            icontentprobs[insertion] = self.read_insertion_content(
                insertion)  # also read the base content of the insertions

        if len(genes_used
               ) > 1:  # if length is 1, we will have just used the actual gene
            if self.debug:
                print '    insertions used:', ' '.join(genes_used)

        return iprobs, icontentprobs
Example #20
0
    def __init__(self, args, seed, sublabel=None, total_length_from_right=-1):
        """
        Set up the work dir and output file, then read the germline and parameter info.

        args: options object; reads workdir, outfname, parameter_dir, datadir, naivety, gtrfname and no_clean from it
        seed: handed to the tree generator (only used if args.naivety is 'M')
        sublabel: if set, this instance gets its own workdir (and an output file inside it), named with the sublabel
        total_length_from_right: stored on the instance; measured from right edge of j, only write to file this much of the sequence

        Raises Exception if args.parameter_dir doesn't exist.
        """
        self.args = args

        if sublabel is None:
            self.workdir = self.args.workdir + '/recombinator'
            self.outfname = self.args.outfname
        else:  # need a separate workdir for each subprocess
            self.workdir = self.args.workdir + '/recombinator-' + sublabel
            self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)

        utils.prep_dir(self.workdir)
        if not os.path.exists(self.args.parameter_dir):
            raise Exception('ERROR ' + self.args.parameter_dir + ' d.n.e')

        # parameters that control recombination, erosion, and whatnot
        self.total_length_from_right = total_length_from_right  # measured from right edge of j, only write to file this much of the sequence (our read lengths are 130 by this def'n a.t.m.)

        self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        # first read info that doesn't depend on which person we're looking at
        self.all_seqs = utils.read_germlines(self.args.datadir)  # all the Vs, all the Ds...
        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0]:row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        # then read stuff that's specific to each person
        self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
        self.read_insertion_content()
        if self.args.naivety == 'M':  # read shm info if non-naive is requested
            # NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
            with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
                reader = csv.DictReader(gtrfile)
                for line in reader:
                    # the 'parameter' field is dot-separated: the fourth character of the first piece is the region, then the model, then the parameter name
                    parameters = line['parameter'].split('.')
                    region = parameters[0][3].lower()
                    assert region == 'v' or region == 'd' or region == 'j'
                    model = parameters[1].lower()
                    parameter_name = parameters[2]
                    assert model in self.mute_models[region]
                    self.mute_models[region][model][parameter_name] = line['value']
            treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
            self.treefname = self.workdir + '/trees.tre'
            treegen.generate_trees(seed, self.treefname)
            with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
                self.treeinfo = treefile.readlines()
            if not self.args.no_clean:
                os.remove(self.treefname)

        # start from a clean output file (and make sure its directory exists)
        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))