Exemple #1
0
    def __init__(self, base_indir, outdir, gene_name, naivety, glfo, args):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.cyst_positions = glfo['cyst-positions']
        self.tryp_positions = glfo['tryp-positions']

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.naivety = naivety
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}
        self.insertion_content_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
Exemple #2
0
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args):
        self.indir = base_indir
        self.args = args

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.allow_unphysical_insertions = self.args.allow_unphysical_insertions # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries

        self.v_3p_del_pseudocount_limit = 10  # add at least one entry 

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.region = utils.get_region(gene_name)
        self.naivety = naivety
        self.germline_seq = germline_seq
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            if self.allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if self.allow_unphysical_insertions:
                self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', list(utils.nukes))
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, {'nukes':list(utils.nukes)})  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
Exemple #3
0
    def read_mute_freq_stuff(self, gene):
        """ Fill <self.all_mute_freqs[gene]>, either with a flat scratch value or with weighted freqs read from <self.shm_parameter_dir>. """
        looks_like_insertion = gene[:2] in utils.boundaries
        assert not looks_like_insertion  # <gene> has to be an actual gene (we used to pass insertions in here separately, but now they're smooshed onto either end of d)

        if self.args.mutate_from_scratch:  # nothing to read, just use the flat value from the command line
            self.all_mute_freqs[gene] = {'overall_mean' : self.args.scratch_mute_freq}
            return

        # kind of dumb, but we need the number of counts for this gene even when we only have an shm parameter dir
        # (will crash if the shm parameter dir doesn't have gene count info... but we should only end up using it on data/recombinator/scratch-parameters)
        if self.reco_parameter_dir is not None:
            count_dir = self.reco_parameter_dir
        else:
            count_dir = self.shm_parameter_dir
        n_observed = utils.read_overall_gene_probs(count_dir, only_gene=gene, normalize=False, expect_zero_counts=True)

        genes_to_average = [gene]
        if n_observed < self.args.min_observations_per_gene:  # didn't see it enough, so also average over whatever find_replacement_genes() gives us
            genes_to_average.extend(utils.find_replacement_genes(count_dir, min_counts=self.args.min_observations_per_gene, gene_name=gene))

        self.all_mute_freqs[gene] = paramutils.read_mute_freqs_with_weights(self.shm_parameter_dir, genes_to_average)
Exemple #4
0
    def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False, plotdir=None):
        self.parameter_dir = parameter_dir
        self.plotdir = plotdir
        self.args = args
        self.input_info = input_info
        self.reco_info = reco_info
        self.germline_seqs = germline_seqs
        self.pcounter, self.true_pcounter = None, None
        if write_parameters:
            self.pcounter = ParameterCounter(self.germline_seqs)
            if not self.args.is_data:
                self.true_pcounter = ParameterCounter(self.germline_seqs)
        self.info = {}
        self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
        self.info['skipped_unproductive_queries'] = []  # list of unproductive queries
        if self.args.apply_choice_probs_in_sw:
            if self.args.debug:
                print '  reading gene choice probs from',parameter_dir
            self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)

        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0]:row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        self.outfile = None
        if self.args.outfname != None:
            self.outfile = open(self.args.outfname, 'a')

        self.n_unproductive = 0
        self.n_total = 0
Exemple #5
0
 def read_mute_freq_stuff(self, gene_or_insert_name):
     """ Fill <self.all_mute_freqs[gene_or_insert_name]>, either with a flat value or with freqs read from <self.parameter_dir>. """
     if self.args.mutate_from_scratch:  # NOTE the original author flagged this branch with an 'XXX', but the reason wasn't recorded
         self.all_mute_freqs[gene_or_insert_name] = {'overall_mean' : self.args.flat_mute_freq}
         return

     # first work out which genes (if any) to average over
     if gene_or_insert_name[:2] in utils.boundaries:  # it's an insertion, so ask for all the v genes
         genes_to_average = utils.find_replacement_genes(self.parameter_dir, min_counts=-1, all_from_region='v')
     else:
         genes_to_average = None
         n_observed = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene_or_insert_name, normalize=False, expect_zero_counts=True)
         # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
         # NOTE if <gene_or_insert_name> isn't in the dict, it's because it's in <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
         if n_observed < self.args.min_observations_to_write:
             genes_to_average = utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_or_insert_name)

     # then read the freqs (the second return value isn't needed here)
     freqs, _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, locus=self.args.locus, approved_genes=genes_to_average)
     self.all_mute_freqs[gene_or_insert_name] = freqs
Exemple #6
0
 def read_mute_freq_stuff(self, gene_or_insert_name):
     """ Read mute freqs for <gene_or_insert_name> from <self.parameter_dir> into <self.all_mute_freqs>. """
     # first work out which genes (if any) to average over
     if gene_or_insert_name[:2] in utils.boundaries:  # it's an insertion, so ask for all the v genes
         genes_to_average = utils.find_replacement_genes(self.parameter_dir, min_counts=-1, all_from_region='v')
     else:
         genes_to_average = None
         n_observed = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene_or_insert_name, normalize=False, expect_zero_counts=True)
         # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
         # NOTE if <gene_or_insert_name> isn't in the dict, it's because it's in <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
         if n_observed < self.args.min_observations_to_write:
             genes_to_average = utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_or_insert_name, single_gene=False)

     # then read the freqs (the second return value isn't needed here)
     freqs, _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, approved_genes=genes_to_average)
     self.all_mute_freqs[gene_or_insert_name] = freqs
Exemple #7
0
 def read_mute_freq_stuff(self, gene_or_insert_name):
     """ Fill <self.all_mute_freqs[gene_or_insert_name]>, either with a flat value or with freqs read from <self.parameter_dir>. """
     if self.args.mutate_from_scratch:  # NOTE the original author flagged this branch with an 'XXX', but the reason wasn't recorded
         self.all_mute_freqs[gene_or_insert_name] = {'overall_mean' : self.args.flat_mute_freq}
         return

     # first work out which genes (if any) to average over
     if gene_or_insert_name[:2] in utils.boundaries:  # it's an insertion, so ask for all the v genes
         genes_to_average = utils.find_replacement_genes(self.parameter_dir, min_counts=-1, all_from_region='v')
     else:
         genes_to_average = None
         n_observed = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene_or_insert_name, normalize=False, expect_zero_counts=True)
         # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
         # NOTE if <gene_or_insert_name> isn't in the dict, it's because it's in <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
         if n_observed < self.args.min_observations_to_write:
             genes_to_average = utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_or_insert_name)

     # then read the freqs (the second return value isn't needed here)
     freqs, _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, chain=self.args.chain, approved_genes=genes_to_average)
     self.all_mute_freqs[gene_or_insert_name] = freqs
Exemple #8
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.chain].items()}

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

        self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
Exemple #9
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.locus].items()}

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

        self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, locus=self.args.locus, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
Exemple #10
0
    def __init__(self, args, input_info, reco_info, glfo, parameter_dir, write_parameters, genes_to_use):
        print 'smith-waterman'
        sys.stdout.flush()

        self.parameter_dir = parameter_dir
        self.args = args
        self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug

        self.max_insertion_length = 35  # if vdjalign reports an insertion longer than this, rerun the query (typically with different match/mismatch ratio)
        self.absolute_max_insertion_length = 200  # just ignore them if it's longer than this

        self.input_info = input_info
        self.remaining_queries = set([q for q in self.input_info.keys()])  # we remove queries from this set when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores)
        self.new_indels = 0  # number of new indels that were kicked up this time through

        self.reco_info = reco_info
        self.glfo = glfo
        self.pcounter, self.true_pcounter, self.perfplotter = None, None, None
        if write_parameters:
            self.pcounter = ParameterCounter(self.glfo['seqs'])
            if not self.args.is_data:
                self.true_pcounter = ParameterCounter(self.glfo['seqs'])
        if self.args.plot_performance:
            self.perfplotter = PerformancePlotter(self.glfo['seqs'], 'sw')
        self.info = {}
        self.info['queries'] = []  # list of queries that *passed* sw, i.e. for which we have information
        self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
        self.info['indels'] = {}
        if self.args.apply_choice_probs_in_sw:
            if self.debug:
                print '  reading gene choice probs from', parameter_dir
            self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)

        self.outfile = None
        if self.args.outfname is not None:
            self.outfile = open(self.args.outfname, 'a')

        self.nth_try = 1
        self.unproductive_queries = set()

        # rewrite input germline sets (if needed)
        self.genes_to_use = genes_to_use  # if None, we use all of 'em. NOTE do *not* use self.args.only_genes in this file (see partitiondriver)
        self.my_datadir = self.args.datadir  # make sure to use *only* use <self.my_datadir> elsewhere
        if self.genes_to_use is not None:
            self.my_datadir = self.args.workdir + '/germline-sets'
            self.rewritten_files = utils.rewrite_germline_fasta(self.args.datadir, self.my_datadir, self.genes_to_use)

        if not os.path.exists(self.args.ighutil_dir + '/bin/vdjalign'):
            raise Exception('ERROR ighutil path d.n.e: ' + self.args.ighutil_dir + '/bin/vdjalign')
Exemple #11
0
    def __init__(self,
                 args,
                 input_info,
                 reco_info,
                 germline_seqs,
                 parameter_dir,
                 write_parameters=False,
                 plotdir=None):
        self.parameter_dir = parameter_dir
        self.plotdir = plotdir
        self.args = args
        self.input_info = input_info
        self.reco_info = reco_info
        self.germline_seqs = germline_seqs
        self.pcounter, self.true_pcounter = None, None
        if write_parameters:
            self.pcounter = ParameterCounter(self.germline_seqs)
            if not self.args.is_data:
                self.true_pcounter = ParameterCounter(self.germline_seqs)
        self.info = {}
        self.info['all_best_matches'] = set(
        )  # set of all the matches we found (for *all* queries)
        self.info['skipped_unproductive_queries'] = [
        ]  # list of unproductive queries
        if self.args.apply_choice_probs_in_sw:
            if self.args.debug:
                print '  reading gene choice probs from', parameter_dir
            self.gene_choice_probs = utils.read_overall_gene_probs(
                parameter_dir)

        with opener('r')(
                self.args.datadir + '/v-meta.json'
        ) as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(
                self.args.datadir + '/j_tryp.csv'
        ) as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {
                row[0]: row[1]
                for row in tryp_reader
            }  # WARNING: this doesn't filter out the header line

        self.outfile = None
        if self.args.outfname != None:
            self.outfile = open(self.args.outfname, 'a')

        self.n_unproductive = 0
        self.n_total = 0
Exemple #12
0
    def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False):
        self.parameter_dir = parameter_dir
        self.args = args
        self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug

        self.input_info = input_info
        self.remaining_queries = [query for query in self.input_info.keys()]  # we remove queries from this list when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores)
        self.new_indels = 0  # number of new indels that were kicked up this time through

        self.reco_info = reco_info
        self.germline_seqs = germline_seqs
        self.pcounter, self.true_pcounter, self.perfplotter = None, None, None
        if write_parameters:
            self.pcounter = ParameterCounter(self.germline_seqs)
            if not self.args.is_data:
                self.true_pcounter = ParameterCounter(self.germline_seqs)
        if self.args.plot_performance:
            self.perfplotter = PerformancePlotter(self.germline_seqs, 'sw')
        self.info = {}
        self.info['queries'] = []
        self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
        self.info['skipped_unproductive_queries'] = []  # list of unproductive queries
        # self.info['skipped_indel_queries'] = []  # list of queries that had indels
        self.info['skipped_unknown_queries'] = []
        self.info['indels'] = {}
        if self.args.apply_choice_probs_in_sw:
            if self.debug:
                print '  reading gene choice probs from', parameter_dir
            self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)

        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0]:row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        self.outfile = None
        if self.args.outfname is not None:
            self.outfile = open(self.args.outfname, 'a')

        self.n_unproductive = 0
        self.n_total = 0

        print 'smith-waterman'
Exemple #13
0
    def write_mute_freqs(self, region, gene_name, seq, reco_event, reco_seq_fname, is_insertion=False):
        """
        Read position-by-position mute freqs from disk for <gene_name>, renormalize, then write to a file for bppseqgen.

        region: used to look up the '<region>_5p' erosion, which offsets positions in <seq> to positions in the mute freq info
        gene_name: gene whose mute freqs we read from <self.args.parameter_dir>
        seq: sequence to write rates for (one output line per base)
        reco_event: needs <erosions> and <effective_erosions> dicts
        reco_seq_fname: output file name
        is_insertion: if set, average over all v genes rather than looking at how often we saw <gene_name>

        Positions that aren't in the mute freq info get the 'overall_mean' value.
        Fails an assertion if the freqs sum to zero (which includes an empty <seq>).
        """
        replacement_genes = None
        if is_insertion:
            replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=-1, all_from_region='v')
        else:
            n_occurences = utils.read_overall_gene_probs(self.args.parameter_dir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
            if n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
                replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_name, single_gene=False)

        mute_freqs, _ = paramutils.read_mute_info(self.args.parameter_dir, this_gene=gene_name, approved_genes=replacement_genes)  # the counts aren't needed here

        # merge the two erosion dicts *once* (effective erosions take precedence), rather than rebuilding the merged dict for every base
        all_erosions = dict(reco_event.erosions.items())
        all_erosions.update(reco_event.effective_erosions.items())
        offset = all_erosions[region + '_5p']

        # relative mutation rate for each position in <seq> (NOTE <mute_freqs> only covers areas we could align to, so it isn't necessarily the same length as <seq>)
        rates = []
        for inuke in range(len(seq)):
            position = inuke + offset
            rates.append(mute_freqs[position] if position in mute_freqs else mute_freqs['overall_mean'])
        total = sum(rates, 0.0)

        # normalize to the number of sites (i.e. so an average site is given value 1.0)
        assert total != 0.0  # can't divide by zero
        scale = float(len(seq)) / total
        rates = [rate * scale for rate in rates]

        # and double check it
        total = sum(rates, 0.0)
        assert utils.is_normed(total / float(len(seq)))
        assert len(rates) == len(seq)

        # write the input file for bppseqgen, one base per line
        with opener('w')(reco_seq_fname) as reco_seq_file:
            reco_seq_file.write('state\trate\n')
            for nuke, rate in zip(seq, rates):
                reco_seq_file.write('%s\t%.15f\n' % (nuke, rate))
    def write_mute_freqs(self, region, gene_name, seq, reco_event, reco_seq_fname, is_insertion=False):
        """
        Read position-by-position mute freqs from disk for <gene_name>, renormalize, then write to a file for bppseqgen.

        region: used to look up the '<region>_5p' erosion, which offsets positions in <seq> to positions in the mute freq info
        gene_name: gene whose mute freqs we read from <self.args.parameter_dir>
        seq: sequence to write rates for (one output line per base)
        reco_event: needs <erosions> and <effective_erosions> dicts
        reco_seq_fname: output file name
        is_insertion: if set, average over all v genes rather than looking at how often we saw <gene_name>

        Positions that aren't in the mute freq info get the 'overall_mean' value.
        Fails an assertion if the freqs sum to zero (which includes an empty <seq>).
        """
        # NOTE(review): a definition with the same name and signature appears directly above this one -- if they're in the same class, this later one wins, so confirm whether the duplicate is intended
        replacement_genes = None
        if is_insertion:
            replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=-1, all_from_region='v')
        else:
            n_occurences = utils.read_overall_gene_probs(self.args.parameter_dir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
            if n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
                replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_name, single_gene=False)

        mute_freqs, _ = paramutils.read_mute_info(self.args.parameter_dir, this_gene=gene_name, approved_genes=replacement_genes)  # the counts aren't needed here

        # merge the two erosion dicts *once* (effective erosions take precedence), rather than rebuilding the merged dict for every base
        all_erosions = dict(reco_event.erosions.items())
        all_erosions.update(reco_event.effective_erosions.items())
        offset = all_erosions[region + '_5p']

        # relative mutation rate for each position in <seq> (NOTE <mute_freqs> only covers areas we could align to, so it isn't necessarily the same length as <seq>)
        rates = []
        for inuke in range(len(seq)):
            position = inuke + offset
            rates.append(mute_freqs[position] if position in mute_freqs else mute_freqs['overall_mean'])
        total = sum(rates, 0.0)

        # normalize to the number of sites (i.e. so an average site is given value 1.0)
        assert total != 0.0  # can't divide by zero
        scale = float(len(seq)) / total
        rates = [rate * scale for rate in rates]

        # and double check it
        total = sum(rates, 0.0)
        assert utils.is_normed(total / float(len(seq)))
        assert len(rates) == len(seq)

        # write the input file for bppseqgen, one base per line
        with opener('w')(reco_seq_fname) as reco_seq_file:
            reco_seq_file.write('state\trate\n')
            for nuke, rate in zip(seq, rates):
                reco_seq_file.write('%s\t%.15f\n' % (nuke, rate))
Exemple #15
0
def print_data_table(dsetfos, method, latex=False, emph_genes=['IGHV1-2*02+G35A', 'IGHD3-10*01', 'IGHJ4*02', 'IGKV3-15*01', 'IGKJ3*01']):
    """Print, for each region, a table of genes ranked by overall probability, with one column group per data set.

    For every region in <utils.regions> this reads the normalized overall gene probabilities from each
    data set's parameter directory, sorts each data set's genes by decreasing probability, and prints one
    row per rank. Data sets that have run out of genes are padded with ' - ' entries.

    Args:
        dsetfos: sequence (e.g. list) of (study, dset) pairs. It is traversed several times, so it can't be a one-shot iterator.
            An empty sequence now just prints the (empty) headers instead of looping forever.
        method: passed through to get_param_dir() when building each parameter directory.
        latex: NOTE currently has no effect, since it is unconditionally reset to True below.
        emph_genes: genes that get written in bold red in latex mode (None for no emphasis). Only read, never modified.
    """
    latex = True  # NOTE(review): hard-coded override of the <latex> argument. Left in place since callers currently get latex output no matter what they pass -- confirm before removing

    cstr = '&' if latex else ''  # column separator
    estr = '$' if latex else ''  # wraps the numerical value
    lstr = '\\\\' if latex else ''  # line terminator

    def getvalstr(gene, val):
        # placeholder entry if this data set has no gene at this rank, or if it's a d gene for a locus that utils.has_d_gene() says has no d
        if gene is None or (utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene))):
            return '%s  %5.2s  %s  %-16s%s' % (cstr, ' - ', cstr, ' - ', 4 * ' ' if latex else '')
        if latex:
            gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
            if emph_genes is not None and gene in emph_genes:
                gstr = '\\color{red}{\\textbf{%s}}' % gstr
        else:
            gstr = utils.color_gene(gene, width=18)
        return '%s  %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr)  # <val> is printed as a percentage

    def print_line(rfos):
        print('  %s%s' % ('   '.join([getvalstr(g, v) for g, v in rfos]), lstr))

    def ds_str(ds, region):
        if not latex:
            return ds
        locus_chr = ds.split('-')[1]  # second '-'-separated piece of the data set name
        if locus_chr in ['g', 'm']:  # 'g' and 'm' are both labeled as 'h'
            locus_chr = 'h'
        return 'IG%s%s' % (locus_chr.upper(), region.upper())

    for region in utils.regions:
        param_dirs = [get_param_dir(heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset, method) for study, dset in dsetfos]
        countfos = [utils.read_overall_gene_probs(pdir, normalize=True)[region] for pdir in param_dirs]

        # header line, with one label per data set
        gene_val_str = (' %s   ' % cstr).join([('  %s   %s   %-20s' % ('\\%' if latex else '', cstr, ds_str(ds, region))) for _, ds in dsetfos])
        tmpline = '  %s   %s  %s' % (cstr, gene_val_str, lstr)
        if latex:
            hstr = '\\hline'
            tmpline = '  %s\n%s\n  %s' % (hstr, tmpline, hstr)
        print(tmpline)

        # one list of (gene, prob) pairs per data set, most probable gene first
        rowfos = [sorted(cfo.items(), key=operator.itemgetter(1), reverse=True) for cfo in countfos]
        irow = 0
        while True:
            rfos = [rfo[irow] if irow < len(rfo) else (None, None) for rfo in rowfos]
            # stop once every data set is exhausted. all() is also True for an empty list, so having no data sets can't spin forever (comparing sets, as was done before, never matched in that case)
            if all(rfo == (None, None) for rfo in rfos):
                break
            print_line(rfos)
            irow += 1
Exemple #16
0
 def read_mute_freq_stuff(self, gene):
     """Fill self.all_mute_freqs[gene] with the mutation frequency info for <gene>.

     If <args.mutate_from_scratch> is set, the entry is just a flat 'overall_mean' value.
     Otherwise the frequencies are read from self.parameter_dir, averaging over the genes from
     utils.find_replacement_genes() as well if <gene> was observed fewer than
     <args.min_observations_to_write> times.
     """
     # make sure <gene> isn't actually an insertion (we used to pass insertions in here separately, but now they're smooshed onto either end of d)
     assert gene[:2] not in utils.boundaries

     if self.args.mutate_from_scratch:
         # <args.flat_mute_freq> wins if it was specified, otherwise fall back to the default from-scratch value
         flat_freq = self.args.flat_mute_freq
         if flat_freq is None:
             flat_freq = self.args.default_scratch_mute_freq
         self.all_mute_freqs[gene] = {'overall_mean': flat_freq}
         return

     n_observed = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene, normalize=False, expect_zero_counts=True)
     genes_to_average = [gene]
     # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
     # NOTE if <gene> isn't in the dict, it's because it's <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
     if n_observed < self.args.min_observations_to_write:
         genes_to_average.extend(utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene))
     self.all_mute_freqs[gene] = paramutils.read_mute_freqs_with_weights(self.parameter_dir, genes_to_average)
    def __init__(self,
                 args,
                 input_info,
                 reco_info,
                 germline_seqs,
                 parameter_dir,
                 write_parameters=False):
        self.parameter_dir = parameter_dir
        self.args = args
        self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug

        self.input_info = input_info
        self.remaining_queries = [
            query for query in self.input_info.keys()
        ]  # we remove queries from this list when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores)
        self.new_indels = 0  # number of new indels that were kicked up this time through

        self.reco_info = reco_info
        self.germline_seqs = germline_seqs
        self.pcounter, self.true_pcounter, self.perfplotter = None, None, None
        if write_parameters:
            self.pcounter = ParameterCounter(self.germline_seqs)
            if not self.args.is_data:
                self.true_pcounter = ParameterCounter(self.germline_seqs)
        if self.args.plot_performance:
            self.perfplotter = PerformancePlotter(self.germline_seqs, 'sw')
        self.info = {}
        self.info['queries'] = []
        self.info['all_best_matches'] = set(
        )  # set of all the matches we found (for *all* queries)
        self.info['skipped_unproductive_queries'] = [
        ]  # list of unproductive queries
        # self.info['skipped_indel_queries'] = []  # list of queries that had indels
        self.info['skipped_unknown_queries'] = []
        self.info['indels'] = {}
        if self.args.apply_choice_probs_in_sw:
            if self.debug:
                print '  reading gene choice probs from', parameter_dir
            self.gene_choice_probs = utils.read_overall_gene_probs(
                parameter_dir)

        with opener('r')(
                self.args.datadir + '/v-meta.json'
        ) as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(
                self.args.datadir + '/j_tryp.csv'
        ) as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {
                row[0]: row[1]
                for row in tryp_reader
            }  # WARNING: this doesn't filter out the header line

        self.outfile = None
        if self.args.outfname is not None:
            self.outfile = open(self.args.outfname, 'a')

        self.n_unproductive = 0
        self.n_total = 0

        print 'smith-waterman'
Exemple #18
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        """Gather everything that's needed to write the hmm for <gene_name>.

        Reads how often the gene was observed from <base_indir>, and if that's fewer than
        <args.min_observations_to_write> times, adds the genes from utils.find_replacement_genes() to the
        list that the erosion, insertion, and mutation frequency info is read for. Then creates the
        HMM object and sets its 'gene_prob', 'overall_mute_freq', and 'per_gene_mute_freq' extras.

        base_indir: directory that all the parameter info is read from (stored as self.indir)
        outdir: stored as self.outdir
        gene_name: unsanitized gene name
        glfo: germline info, of which only the 'seqs' and '<codon>-positions' entries are used here
        args: command line arguments (reads locus and min_observations_to_write)
        debug: print some info about what's happening
        """
        self.raw_name = gene_name  # i.e. unsanitized
        self.region = utils.get_region(gene_name)
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.outdir = outdir
        self.args = args
        self.debug = debug

        # look up the '<codon>-positions' entry in <glfo> for each (region, codon) pair that this locus has
        self.codon_positions = {}
        for tmpreg, codon in utils.conserved_codons[args.locus].items():
            self.codon_positions[tmpreg] = glfo[codon + '-positions']

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
        self.min_mean_unphysical_insertion_length = {'fv': 1.5, 'jf': 25}
        # don't let any position mutate less frequently than 1% of the time, or more frequently than half the time
        self.mute_freq_bounds = {'lo': 0.01, 'hi': 0.5}
        # i.e. distance over which the mute freqs are typically screwed up. I'm not really sure why these vary so much, but it's probably to do with how the s-w step works
        self.enforced_flat_mfreq_length = {'v_3p': 9, 'd_5p': 9, 'd_3p': 9, 'j_5p': 20}
        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # which insertions get attached to a gene from each region (none, if it's some other region)
        insertions_by_region = {'v': ['fv'], 'd': ['vd'], 'j': ['dj', 'jf']}
        self.insertions = list(insertions_by_region.get(self.region, []))

        # maybe need to update some stuff below if this changes
        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'

        if self.debug:
            print('%s' % utils.color_gene(gene_name))

        # how many times did we observe this gene in data?
        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)
        min_obs = self.args.min_observations_to_write
        approved_genes = [gene_name]
        # NOTE this never happens any more, since partitiondriver.cache_parameters() resets <args.min_observations_to_write> if it's larger than 10*(number of sequences)
        if self.n_occurences < min_obs:  # if we didn't see it enough, average also over all the genes that find_replacement_genes() gives us
            if self.debug:
                print('      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, min_obs))
            approved_genes.extend(utils.find_replacement_genes(self.indir, min_obs, gene_name, debug=self.debug))

        self.erosion_probs = self.read_erosion_info(approved_genes)
        insert_probs, insert_content_probs = self.read_insertion_info(approved_genes)
        self.insertion_probs = insert_probs
        self.insertion_content_probs = insert_content_probs
        # NOTE i'm using a hybrid approach with mute_freqs and mute_counts -- the only thing I get from mute_counts is the ratios of the different bases, whereas the actual freq comes from mute_freqs (which has all the corrections/smooth/bullshit)
        self.mute_freqs = paramutils.read_mute_freqs_with_weights(self.indir, approved_genes)  # weighted averages over genes
        self.mute_counts = paramutils.read_mute_counts(self.indir, gene_name, self.args.locus)  # raw per-{ACGT} counts
        self.process_mutation_info()  # smooth/interpolation/whatnot for <self.mute_freqs> and <self.mute_counts>

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm = HMM(self.saniname, self.track.getdict())

        # if we really didn't see this gene at all, take pity on it and kick it an eps
        gene_prob = utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
        self.hmm.extras['gene_prob'] = max(self.eps, gene_prob)
        mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
        # the other (weighted) one might be technically more accurate, depending on what you want, but it's probably not what anyone is expecting, so we write the unweighted one
        self.hmm.extras['per_gene_mute_freq'] = self.mute_freqs['unweighted_overall_mean']
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args):
        """Read the parameter info for <gene_name> from <base_indir>, and create its HMM object.

        base_indir: directory that the parameter info is read from (stored as self.indir)
        outdir: stored as self.outdir
        gene_name: the gene that this hmm is for
        naivety: mutation frequencies are only read if this is 'M'
        germline_seq: stored as self.germline_seq
        args: command line arguments (reads allow_unphysical_insertions, min_observations_to_write and debug)
        """
        self.indir = base_indir
        self.outdir = outdir
        self.args = args
        self.naivety = naivety
        self.germline_seq = germline_seq
        self.region = utils.get_region(gene_name)

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.allow_unphysical_insertions = self.args.allow_unphysical_insertions  # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so
        self.v_3p_del_pseudocount_limit = 10  # add at least one entry

        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # fv and jf only get included if unphysical insertions are allowed
        unphysical_ok = self.allow_unphysical_insertions
        if self.region == 'v':
            self.insertions = ['fv'] if unphysical_ok else []
        elif self.region == 'd':
            self.insertions = ['vd']
        elif self.region == 'j':
            self.insertions = ['dj', 'jf'] if unphysical_ok else ['dj']
        else:
            self.insertions = []

        self.erosion_probs = {}
        self.insertion_probs = {}

        # how many times did we observe this gene in data?
        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)
        min_obs = self.args.min_observations_to_write
        replacement_genes = None
        if self.n_occurences < min_obs:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print('    only saw it %d times, use info from other genes' % self.n_occurences)
            replacement_genes = utils.find_replacement_genes(self.indir, min_obs, gene_name, single_gene=False, debug=self.args.debug)

        # both of these get this exact gene plus the replacement genes (if there are any)
        self.read_erosion_info(gene_name, replacement_genes)
        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', list(utils.nukes))
        self.saniname = utils.sanitize_name(gene_name)
        # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        track_dict = {'nukes': list(utils.nukes)}
        self.hmm = HMM(self.saniname, track_dict)
        # if we really didn't see this gene at all, take pity on it and kick it an eps
        gene_prob = utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
        self.hmm.extras['gene_prob'] = max(self.eps, gene_prob)
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seqs, args, cyst_positions, tryp_positions):
        """Read the parameter info for <gene_name> from <base_indir>, and create its HMM object.

        base_indir: directory that the parameter info is read from (stored as self.indir)
        outdir: stored as self.outdir
        gene_name: unsanitized gene name
        naivety: mutation info is only read if this is 'M'
        germline_seqs: all germline alleles, indexed as germline_seqs[region][gene_name]
        args: command line arguments (reads dont_allow_unphysical_insertions, min_observations_to_write and debug)
        cyst_positions: stored as self.cyst_positions
        tryp_positions: stored as self.tryp_positions
        """
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = germline_seqs  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.cyst_positions = cyst_positions
        self.tryp_positions = tryp_positions

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.naivety = naivety
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        # names of the insertions that go with this region (fv and jf are skipped if <args.dont_allow_unphysical_insertions> is set)
        self.insertions = []
        if self.region == 'v':
            if not self.args.dont_allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if not self.args.dont_allow_unphysical_insertions:
                self.insertions.append('jf')

        # these start out empty here (presumably filled in by the read_*_info() calls below -- TODO confirm)
        self.erosion_probs = {}
        self.insertion_probs = {}
        self.insertion_content_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None  # stays None if we saw this gene often enough
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        # overall mean mutation frequency, taken from the histogram in <indir>/all-mean-mute-freqs.csv
        mean_freq_hist = plotting.make_hist_from_bin_entry_file(self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = mean_freq_hist.GetMean()