Exemple #1
0
def write_glfo(output_dir, glfo, only_genes=None, debug=False):
    if debug:
        print '  writing glfo to %s%s' % (output_dir, '' if only_genes is None else ('  (restricting to %d genes)' % len(only_genes)))
    if os.path.exists(output_dir + '/' + glfo['chain']):
        remove_glfo_files(output_dir, glfo['chain'])  # also removes output_dir
    os.makedirs(output_dir + '/' + glfo['chain'])

    for fname in glfo_fasta_fnames(glfo['chain']):
        with open(output_dir + '/' + glfo['chain'] + '/' + fname, 'w') as outfile:
            for gene in glfo['seqs'][utils.get_region(fname)]:
                if only_genes is not None and gene not in only_genes:
                    continue
                outfile.write('>' + gene + '\n')
                outfile.write(glfo['seqs'][utils.get_region(fname)][gene] + '\n')
    for fname in glfo_csv_fnames():
        with open(output_dir + '/' + glfo['chain'] + '/' + fname, 'w') as codonfile:
            writer = csv.DictWriter(codonfile, ('gene', 'istart'))
            writer.writeheader()
            for gene, istart in glfo[utils.get_codon(fname) + '-positions'].items():
                if only_genes is not None and gene not in only_genes:
                    continue
                writer.writerow({'gene' : gene, 'istart' : istart})

    # make sure there weren't any files lingering in the output dir when we started
    # NOTE this will ignore the dirs corresponding to any *other* chains (which is what we want now, I think)
    unexpected_files = set(glob.glob(output_dir + '/' + glfo['chain'] + '/*')) - set([output_dir + '/' + glfo['chain'] + '/' + fn for fn in glfo_fnames(glfo['chain'])])
    if len(unexpected_files) > 0:
        raise Exception('unexpected file(s) while writing germline set: %s' % (' '.join(unexpected_files)))
Exemple #2
0
def write_glfo(output_dir, glfo, only_genes=None, debug=False):
    if debug:
        print '  writing glfo to %s%s' % (output_dir, '' if only_genes is None else ('  (restricting to %d genes)' % len(only_genes)))
    if os.path.exists(output_dir + '/' + glfo['locus']):
        remove_glfo_files(output_dir, glfo['locus'])  # also removes output_dir
    os.makedirs(output_dir + '/' + glfo['locus'])

    for fname in glfo_fasta_fnames(glfo['locus']):
        with open(output_dir + '/' + glfo['locus'] + '/' + fname, 'w') as outfile:
            for gene in glfo['seqs'][utils.get_region(fname)]:
                if only_genes is not None and gene not in only_genes:
                    continue
                outfile.write('>' + gene + '\n')
                outfile.write(glfo['seqs'][utils.get_region(fname)][gene] + '\n')

    with open(output_dir + '/' + glfo['locus'] + '/' + extra_fname, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, csv_headers)
        writer.writeheader()
        for region, codon in utils.conserved_codons[glfo['locus']].items():
            for gene, istart in glfo[codon + '-positions'].items():
                if only_genes is not None and gene not in only_genes:
                    continue
                writer.writerow({'gene' : gene, codon + '_position' : istart})

    # make sure there weren't any files lingering in the output dir when we started
    # NOTE this will ignore the dirs corresponding to any *other* loci (which is what we want now, I think)
    unexpected_files = set(glob.glob(output_dir + '/' + glfo['locus'] + '/*')) - set([output_dir + '/' + glfo['locus'] + '/' + fn for fn in glfo_fnames(glfo['locus'])])
    if len(unexpected_files) > 0:
        raise Exception('unexpected file(s) while writing germline set: %s' % (' '.join(unexpected_files)))
Exemple #3
0
def add_some_snps(snps_to_add, glfo, remove_template_genes=False, debug=False):
    """
    Generate some snp'd genes and add them to glfo, specified with <snps_to_add>.
    e.g. [{'gene' : 'IGHV3-71*01', 'positions' : (35, None)}, ] will add a snp at position 35 and at a random location.
    The resulting snp'd gene will have a name like IGHV3-71*01+C35T.T47G
    """

    templates_to_remove = set()

    for isnp in range(len(snps_to_add)):
        snpinfo = snps_to_add[isnp]
        gene, positions = snpinfo['gene'], snpinfo['positions']
        print '    adding %d %s to %s' % (len(positions), utils.plural_str('snp', len(positions)), gene)
        seq = glfo['seqs'][utils.get_region(gene)][gene]
        assert utils.get_region(gene) == 'v'
        cpos = glfo['cyst-positions'][gene]

        snpfo = None
        itry = 0
        while snpfo is None or snpfo['gene'] in glfo['seqs'][utils.get_region(gene)]:
            if itry > 0:
                print '      already in glfo, try again'
                if itry > 99:
                    raise Exception('too many tries while trying to generate new snps -- did you specify a lot of snps on the same position?')
            snpfo = generate_snpd_gene(gene, cpos, seq, positions)
            itry += 1

        if remove_template_genes:
            templates_to_remove.add(gene)
        add_new_allele(glfo, snpfo, remove_template_genes=False, debug=debug)  # *don't* remove the templates here, since we don't know if there's another snp later that needs them

    remove_the_stupid_godamn_template_genes_all_at_once(glfo, templates_to_remove)  # works fine with zero-length <templates_to_remove>
Exemple #4
0
def write_glfo(output_dir, glfo, only_genes=None, debug=False):
    if debug:
        print "  writing glfo to %s%s" % (
            output_dir,
            "" if only_genes is None else ("  (restricting to %d genes)" % len(only_genes)),
        )
    if os.path.exists(output_dir + "/" + glfo["chain"]):
        remove_glfo_files(output_dir, glfo["chain"])  # also removes output_dir
    os.makedirs(output_dir + "/" + glfo["chain"])

    for fname in glfo_fasta_fnames(glfo["chain"]):
        with open(output_dir + "/" + glfo["chain"] + "/" + fname, "w") as outfile:
            for gene in glfo["seqs"][utils.get_region(fname)]:
                if only_genes is not None and gene not in only_genes:
                    continue
                outfile.write(">" + gene + "\n")
                outfile.write(glfo["seqs"][utils.get_region(fname)][gene] + "\n")

    with open(output_dir + "/" + glfo["chain"] + "/" + extra_fname, "w") as csvfile:
        writer = csv.DictWriter(csvfile, csv_headers)
        writer.writeheader()
        for region, codon in utils.conserved_codons[glfo["chain"]].items():
            for gene, istart in glfo[codon + "-positions"].items():
                if only_genes is not None and gene not in only_genes:
                    continue
                writer.writerow({"gene": gene, codon + "_position": istart})

    # make sure there weren't any files lingering in the output dir when we started
    # NOTE this will ignore the dirs corresponding to any *other* chains (which is what we want now, I think)
    unexpected_files = set(glob.glob(output_dir + "/" + glfo["chain"] + "/*")) - set(
        [output_dir + "/" + glfo["chain"] + "/" + fn for fn in glfo_fnames(glfo["chain"])]
    )
    if len(unexpected_files) > 0:
        raise Exception("unexpected file(s) while writing germline set: %s" % (" ".join(unexpected_files)))
Exemple #5
0
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False):
    n_skipped_pseudogenes = 0
    for seq_record in SeqIO.parse(fname, 'fasta'):
        linefo = [p.strip() for p in seq_record.description.split('|')]

        # first get gene name
        if linefo[0][:2] != 'IG':  # if it's an imgt file, with a bunch of header info (and the accession number first)
            gene = linefo[imgt_info_indices.index('gene')]
            functionality = linefo[imgt_info_indices.index('functionality')]
            if functionality not in functionalities:
                raise Exception('unexpected functionality %s in %s' % (functionality, fname))
            if skip_pseudogenes and functionality == 'P':
                n_skipped_pseudogenes += 1
                continue
        else:  # plain fasta with just the gene name after the '>'
            gene = linefo[0]
        utils.split_gene(gene)  # just to check if it's a valid gene name
        if not aligned and utils.get_region(gene) != utils.get_region(os.path.basename(fname)):  # if <aligned> is True, file name is expected to be whatever
            raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene)))

        # then the sequence
        seq = str(seq_record.seq).upper()
        if not aligned:
            seq = utils.remove_gaps(seq)
        if len(seq.strip(''.join(utils.expected_characters))) > 0:  # return the empty string if it only contains expected characters
            raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters)))

        seqs[utils.get_region(gene)][gene] = seq

    if n_skipped_pseudogenes > 0:
        print '    skipped %d pseudogenes' % n_skipped_pseudogenes
Exemple #6
0
def convert_to_duplicate_name(glfo, gene):
    for equivalence_class in duplicate_names[utils.get_region(gene)]:
        if gene in equivalence_class:
            for alternate_name in equivalence_class:
                if alternate_name != gene and alternate_name in glfo["seqs"][utils.get_region(gene)]:
                    # print 'converting %s --> %s' % (gene, alternate_name)
                    return alternate_name
    raise Exception("couldn't find alternate name for %s" % gene)
Exemple #7
0
def convert_to_duplicate_name(glfo, gene):
    for equivalence_class in duplicate_names[utils.get_region(gene)]:
        if gene in equivalence_class:
            for alternate_name in equivalence_class:
                if alternate_name != gene and alternate_name in glfo['seqs'][utils.get_region(gene)]:
                    # print 'converting %s --> %s' % (gene, alternate_name)
                    return alternate_name
    raise Exception('couldn\'t find alternate name for %s' % gene)
Exemple #8
0
 def simcountstr(
     gene, ws
 ):  # counts in simulation for <gene> (note that this is _not_ the same as sim_gene_count_str(), since this takes no account of _which_ queries these counts occur in [plus it's coming from the opposite point of view])
     if self.simglfo is None:
         rstr = ''
     elif gene in self.simglfo['seqs'][utils.get_region(gene)]:
         rstr = utils.color(
             'blue', (' %' + ws + 'd') %
             self.simcounts[utils.get_region(gene)][gene])
     else:
         rstr = utils.color('red', (' %' + ws + 's') % 'x')
     return rstr
Exemple #9
0
    def prepare_bppseqgen(self, seq, chosen_tree, n_leaf_nodes, gene, reco_event, seed):
        """ write input files and get command line options necessary to run bppseqgen on <seq> (which is a part of the full query sequence) """
        if len(seq) == 0:
            return None

        # write the tree to a tmp file
        workdir = self.workdir + '/' + utils.get_region(gene)
        os.makedirs(workdir)
        treefname = workdir + '/tree.tre'
        reco_seq_fname = workdir + '/start-seq.txt'
        leaf_seq_fname = workdir + '/leaf-seqs.fa'
        if n_leaf_nodes == 1:  # add an extra leaf to one-leaf trees so bppseqgen doesn't barf (when we read the output, we ignore the second leaf)
            lreg = re.compile('t1:[0-9]\.[0-9][0-9]*')
            leafstr = lreg.findall(chosen_tree)
            assert len(leafstr) == 1
            leafstr = leafstr[0]
            chosen_tree = chosen_tree.replace(leafstr, '(' + leafstr + ',' + leafstr + '):0.0')
        with opener('w')(treefname) as treefile:
            treefile.write(chosen_tree)
        self.write_mute_freqs(gene, seq, reco_event, reco_seq_fname)

        env = os.environ.copy()
        env["LD_LIBRARY_PATH"] += ':' + self.args.partis_dir + '/packages/bpp/lib'

        # build up the command line
        # docs: http://biopp.univ-montp2.fr/apidoc/bpp-phyl/html/classbpp_1_1GTR.html that page is too darn hard to google
        bpp_binary = self.args.partis_dir + '/packages/bpp/bin/bppseqgen'
        if not os.path.exists(bpp_binary):
            raise Exception('bpp not found in %s' % os.path.dirname(bpp_binary))

        command = bpp_binary  # NOTE should I use the "equilibrium frequencies" option?
        command += ' alphabet=DNA'
        command += ' --seed=' + str(seed)
        command += ' input.infos=' + reco_seq_fname  # input file (specifies initial "state" for each position, and possibly also the mutation rate at that position)
        command += ' input.infos.states=state'  # column name in input file BEWARE bio++ undocumented defaults (i.e. look in the source code)
        command += ' input.tree.file=' + treefname
        command += ' input.tree.format=Newick'
        command += ' output.sequence.file=' + leaf_seq_fname
        command += ' output.sequence.format=Fasta'
        if self.args.mutate_from_scratch:
            command += ' model=JC69'
            command += ' input.infos.rates=none'  # BEWARE bio++ undocumented defaults (i.e. look in the source code)
            if self.args.flat_mute_freq is not None:
                command += ' rate_distribution=Constant'
            else:
                command += ' rate_distribution=Gamma(n=4,alpha=' + self.mute_models[utils.get_region(gene)]['gamma']['alpha']+ ')'
        else:
            command += ' input.infos.rates=rate'  # column name in input file
            pvpairs = [p + '=' + v for p, v in self.mute_models[utils.get_region(gene)]['gtr'].items()]
            command += ' model=GTR(' + ','.join(pvpairs) + ')'

        return {'cmd_str' : command, 'outfname' : leaf_seq_fname, 'workdir' : workdir, 'other-files' : [reco_seq_fname, treefname], 'env' : env}
Exemple #10
0
    def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None):
        if not self.finalized:
            self.finalize()

        plotdir = base_plotdir + '/mute-freqs'
        utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
        for region in utils.regions:
            utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
            utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

        for gene in self.counts:
            counts, plotting_info = self.counts[gene], self.plotting_info[gene]
            sorted_positions = sorted(counts)
            hist = TH1D('hist_' + utils.sanitize_name(gene), '',
                        sorted_positions[-1] - sorted_positions[0] + 1,
                        sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5)
            for position in sorted_positions:
                hist.SetBinContent(hist.FindBin(position), counts[position]['freq'])
                hi_diff = abs(counts[position]['freq'] - counts[position]['freq_hi_err'])
                lo_diff = abs(counts[position]['freq'] - counts[position]['freq_lo_err'])
                err = 0.5*(hi_diff + lo_diff)
                hist.SetBinError(hist.FindBin(position), err)
            plotfname = plotdir + '/' + utils.get_region(gene) + '/plots/' + utils.sanitize_name(gene) + '.svg'
            xline = None
            if utils.get_region(gene) == 'v' and cyst_positions is not None:
                xline = cyst_positions[gene]['cysteine-position']
            elif utils.get_region(gene) == 'j' and tryp_positions is not None:
                xline = int(tryp_positions[gene])
            plotting.draw(hist, 'int', plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, draw_str='e')  #, cwidth=4000, cheight=1000)
            paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)

        # for region in utils.regions:
        #     utils.prep_dir(plotdir + '/' + region + '/tmp/plots', multilings=('*.csv', '*.svg'))
        # for gene in self.tmpcounts:
        #     for position in self.tmpcounts[gene]:
        #         roothist = plotting.make_hist_from_my_hist_class(self.tmpcounts[gene][position]['muted'], gene + '_' + str(position))
        #         plotting.draw(roothist, 'int', plotdir=plotdir + '/' + utils.get_region(gene) + '/tmp', plotname=utils.sanitize_name(gene) + '_' + str(position), errors=True, write_csv=True)  #, cwidth=4000, cheight=1000)

        # make mean mute freq hists
        hist = plotting.make_hist_from_my_hist_class(self.mean_rates['all'], 'all-mean-freq')
        plotting.draw(hist, 'float', plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
        for region in utils.regions:
            hist = plotting.make_hist_from_my_hist_class(self.mean_rates[region], region+'-mean-freq')
            plotting.draw(hist, 'float', plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
        check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])

        # then write html file and fix permissiions
        for region in utils.regions:
            check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
            check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
        check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
Exemple #11
0
    def plot(self, plotdir, only_csv=False, only_overall=False):
        if not self.finalized:
            self.finalize()

        overall_plotdir = plotdir + '/overall'

        for gene in self.freqs:
            if only_overall:
                continue
            freqs = self.freqs[gene]
            if len(freqs) == 0:
                if gene not in glutils.dummy_d_genes.values():
                    print '    %s no mutefreqer obs for %s' % (utils.color('red', 'warning'), utils.color_gene(gene))
                continue
            sorted_positions = sorted(freqs.keys())
            genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='position', ytitle='mut freq', title=gene)
            for position in sorted_positions:
                hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err'])
                lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err'])
                err = 0.5*(hi_diff + lo_diff)
                genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err)
            xline = None
            figsize = [7, 4]
            if utils.get_region(gene) in utils.conserved_codons[self.glfo['chain']]:
                codon = utils.conserved_codons[self.glfo['chain']][utils.get_region(gene)]
                xline = self.glfo[codon + '-positions'][gene]
            if utils.get_region(gene) == 'v':
                figsize[0] *= 3.5
            elif utils.get_region(gene) == 'j':
                figsize[0] *= 2
            plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True)
            # per-position plots:
            plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True)
            # # per-position, per-base plots:
            # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment

        # make mean mute freq hists
        for rstr in ['all', 'cdr3'] + utils.regions:
            if rstr == 'all':
                bounds = (0.0, 0.4)
            else:
                bounds = (0.0, 0.6 if rstr == 'd' else 0.4)
            plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr+'_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True)
            plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr+'_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True)

        if not only_csv:  # write html file and fix permissiions
            for substr in self.subplotdirs:
                plotting.make_html(plotdir + '/' + substr)
Exemple #12
0
    def prepare_bppseqgen(self, seq, chosen_tree, n_leaf_nodes, gene, reco_event, seed):
        """ write input files and get command line options necessary to run bppseqgen on <seq> (which is a part of the full query sequence) """
        if len(seq) == 0:
            return None

        # write the tree to a tmp file
        workdir = self.workdir + '/' + utils.get_region(gene)
        os.makedirs(workdir)
        treefname = workdir + '/tree.tre'
        reco_seq_fname = workdir + '/start-seq.txt'
        leaf_seq_fname = workdir + '/leaf-seqs.fa'
        # add dummy leaf that we'll subsequently ignore (such are the vagaries of bppseqgen)
        chosen_tree = '(%s,%s:%.15f):0.0;' % (chosen_tree.rstrip(';'), dummy_name_so_bppseqgen_doesnt_break, treeutils.get_mean_leaf_height(treestr=chosen_tree))
        with open(treefname, 'w') as treefile:
            treefile.write(chosen_tree)
        self.write_mute_freqs(gene, seq, reco_event, reco_seq_fname)

        env = os.environ.copy()
        env['LD_LIBRARY_PATH'] = env.get('LD_LIBRARY_PATH', '') + ':' + self.args.partis_dir + '/packages/bpp/lib'

        # build up the command line
        # docs: http://biopp.univ-montp2.fr/apidoc/bpp-phyl/html/classbpp_1_1GTR.html that page is too darn hard to google
        bpp_binary = self.args.partis_dir + '/packages/bpp/bin/bppseqgen'
        if not os.path.exists(bpp_binary):
            raise Exception('bpp not found in %s' % os.path.dirname(bpp_binary))

        command = bpp_binary  # NOTE should I use the "equilibrium frequencies" option?
        command += ' alphabet=DNA'
        command += ' --seed=' + str(seed)
        command += ' input.infos=' + reco_seq_fname  # input file (specifies initial "state" for each position, and possibly also the mutation rate at that position)
        command += ' input.infos.states=state'  # column name in input file BEWARE bio++ undocumented defaults (i.e. look in the source code)
        command += ' input.tree.file=' + treefname
        command += ' input.tree.format=Newick'
        command += ' output.sequence.file=' + leaf_seq_fname
        command += ' output.sequence.format=Fasta'
        if self.args.mutate_from_scratch:
            command += ' model=JC69'
            command += ' input.infos.rates=none'  # BEWARE bio++ undocumented defaults (i.e. look in the source code)
            if self.args.flat_mute_freq:
                command += ' rate_distribution=Constant'
            else:
                command += ' rate_distribution=Gamma(n=4,alpha=' + self.mute_models[utils.get_region(gene)]['gamma']['alpha']+ ')'
        else:
            command += ' input.infos.rates=rate'  # column name in input file
            pvpairs = [p + '=' + v for p, v in self.mute_models[utils.get_region(gene)]['gtr'].items()]
            command += ' model=GTR(' + ','.join(pvpairs) + ')'

        return {'cmd_str' : command, 'outfname' : leaf_seq_fname, 'workdir' : workdir, 'other-files' : [reco_seq_fname, treefname], 'env' : env}
Exemple #13
0
def get_token(username, password, domain):
    region_name = utils.get_region()

    auth_data = {
        "auth": {
            "identity": {
                "password": {
                    "user": {
                        "name": username,
                        "password": password,
                        "domain": {
                            "name": domain
                        }
                    }
                },
                "methods": ["password"]
            },
            "scope": {
                "project": {
                    "name": region_name
                }
            }
        }
    }

    _url = 'https://%s/v3/auth/tokens' % ais.AisEndpoint.IAM_ENPOINT

    req = urllib2.Request(url=_url)
    req.add_header('Content-Type', 'application/json')
    req.add_data(json.dumps(auth_data))
    r = urllib2.urlopen(req)
    X_TOKEN = r.headers['X-Subject-Token']
    return X_TOKEN
Exemple #14
0
def generate_message(update: dict):
    """
    generates telegram message
    """
    android = update['android']
    codename = update['codename']
    device = update['device']
    download = update['download']
    filename = update['filename']
    filesize = update['size']
    version = update['version']
    branch = get_branch(version).capitalize()
    region = get_region(filename, codename, version)
    rom_type = get_type(filename)
    codename = codename.split('_')[0]
    message = f"New {branch} {rom_type} update available!\n"
    message += f"*Device:* {device} \n" \
        f"*Codename:* #{codename} \n" \
        f"*Region:* {region} \n" \
        f"*Version:* `{version}` \n" \
        f"*Android:* {android} \n" \
        f"*Size*: {filesize} \n" \
        f"*Download*: [Here]({download})\n\n" \
        f"[Latest Updates](https://xiaomifirmwareupdater.com/miui/{codename}) - " \
               f"[All Updates](https://xiaomifirmwareupdater.com/archive/miui/{codename})\n" \
        "@MIUIUpdatesTracker | @XiaomiFirmwareUpdater"
    return message
Exemple #15
0
def read_ramesh_file(fname, outdir, debug=False):
    seqfos = utils.read_fastx(fname)
    glseqs = {
        l: {r: {}
            for r in utils.loci[l]}
        for l in utils.loci if 'ig' in l
    }
    for sfo in seqfos:
        if os.path.basename(fname) == 'coding.fa':
            meta = [x.strip('[]').split('=') for x in sfo['infostrs']]
            mdict = {m[0]: m[1] for m in meta if len(m) == 2}
            if 'gene' not in mdict:
                print 'no gene for %s' % sfo['infostrs']
                continue
            gene = mdict['gene']
        else:
            mdict = {}
            gene = sfo['name']
        if debug:
            print gene
        if utils.is_constant_gene(gene):
            if debug:
                print '  constant'
            continue
        region = utils.get_region(gene)
        utils.split_gene(gene)
        # if 'partial' in mdict:
        #     gene += '_partial_%s' % mdict['partial'].replace('\'', '').replace(',', '')
        if sfo['seq'] in glseqs[utils.get_locus(gene)][region].values():
            if debug:
                print '  duplicate'
            continue
        glseqs[utils.get_locus(gene)][region][gene] = sfo['seq']

    return glseqs
Exemple #16
0
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the
    form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'}
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    """

    template_gene = newfo['template-gene']
    region = utils.get_region(template_gene)
    if template_gene not in glfo['seqs'][region]:
        raise Exception('unknown template gene %s' % template_gene)

    new_gene = newfo['gene']

    if region == 'v':
        glfo['cyst-positions'][new_gene] = glfo['cyst-positions'][template_gene]
    elif region == 'j':
        glfo['tryp-positions'][new_gene] = glfo['tryp-positions'][template_gene]

    glfo['seqs'][region][new_gene] = newfo['seq']

    if debug:
        print '    adding new allele to glfo:'
        print '      template %s   %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene))
        print '           new %s   %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene))

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
Exemple #17
0
    def add_new_allele(self, gene, fitfo, n_candidate_snps, debug=False):
        # figure out what the new nukes are
        old_seq = self.glfo['seqs'][utils.get_region(gene)][gene]
        new_seq = old_seq
        mutfo = {}
        for pos in sorted(fitfo['candidates'][n_candidate_snps]):
            obs_counts = {nuke : self.counts[gene][pos][n_candidate_snps][nuke] for nuke in utils.nukes}  # NOTE it's super important to only use the counts from sequences with <n_candidate_snps> total mutations
            sorted_obs_counts = sorted(obs_counts.items(), key=operator.itemgetter(1), reverse=True)
            original_nuke = self.mfreqer.counts[gene][pos]['gl_nuke']
            new_nuke = None
            for nuke, _ in sorted_obs_counts:  # take the most common one that isn't the existing gl nuke
                if nuke != original_nuke:
                    new_nuke = nuke
                    break
            print '   %3d  (%s --> %s)' % (pos, original_nuke, new_nuke),
            assert old_seq[pos] == original_nuke
            mutfo[pos] = {'original' : original_nuke, 'new' : new_nuke}
            new_seq = new_seq[:pos] + new_nuke + new_seq[pos+1:]

        new_name, mutfo = glutils.get_new_allele_name_and_change_mutfo(gene, mutfo)
        print ''
        print '          %s   %s' % (old_seq, utils.color_gene(gene))
        print '          %s   %s' % (utils.color_mutants(old_seq, new_seq), utils.color_gene(new_name))

        # and add it to the set of new alleles for this gene
        self.new_allele_info.append({
            'template-gene' : gene,
            'gene' : new_name,
            'seq' : new_seq,
            'aligned-seq' : None
        })
Exemple #18
0
def add_new_allele(glfo, newfo, remove_template_genes, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the
    form: {'template-gene' : 'IGHV3-71*01', 'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT'}
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    """

    template_gene = newfo['template-gene']
    region = utils.get_region(template_gene)
    if template_gene not in glfo['seqs'][region]:
        raise Exception('unknown template gene %s' % template_gene)

    new_gene = newfo['gene']

    if region == 'v':
        glfo['cyst-positions'][new_gene] = glfo['cyst-positions'][template_gene]
    elif region == 'j':
        glfo['tryp-positions'][new_gene] = glfo['tryp-positions'][template_gene]

    glfo['seqs'][region][new_gene] = newfo['seq']

    if debug:
        print '    adding new allele to glfo:'
        print '      template %s   %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene))
        print '           new %s   %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene))

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
Exemple #19
0
    def make_transition_plot(self, gene_name, model):
        """ NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """
        fig, ax = plotting.mpl_init()
        fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

        ibin = 0
        print utils.color_gene(utils.unsanitize_name(gene_name))
        legend_colors = set()  # add a color to this the first time you plot it
        for state in model.states:

            # bin label
            ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8)

            sorted_to_states = {}
            for name in state.transitions.keys():
                if name.find('IG') == 0:
                    sorted_to_states[name] = int(paramutils.simplify_state_name(name))
                else:
                    sorted_to_states[name] = name
            sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1))

            total = 0.0
            for to_state, simple_to_state in sorted_to_states:

                prob = state.transitions[to_state]

                alpha = 0.6
                width = 3

                if 'insert' in str(simple_to_state):
                    label = 'insert'
                    color = '#3498db'  # blue
                elif str(simple_to_state) == 'end':
                    label = 'end'
                    color = 'red'
                else:  # regional/internal states
                    assert to_state.find('IG') == 0
                    label = 'internal'
                    color = 'green'

                label_to_use = None
                if color not in legend_colors:
                    label_to_use = label
                    legend_colors.add(color)

                # horizontal line at height total+prob
                ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use)

                # vertical line from total to total + prob
                ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width)

                midpoint = 0.5*(prob + 2*total)
                # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state))  # nicely labels the midpoint of the chunk between lines, but there isn't really room for it

                total += prob
    
            ibin += 1

        ax.get_xaxis().set_visible(False)
        plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
Exemple #20
0
def generate_snpd_gene(gene, cpos, seq, positions):
    assert utils.get_region(gene) == 'v'  # others not yet handled
    def choose_position():
        snp_pos = None
        while snp_pos is None or snp_pos in snpd_positions or not utils.codon_unmutated('cyst', tmpseq, cpos, debug=True):
            snp_pos = random.randint(0, len(seq) - 1)  # note that randint() is inclusive
            tmpseq = seq[: snp_pos] + 'X' + seq[snp_pos + 1 :]  # for checking cyst position
        return snp_pos

    snpd_positions = set()  # only used if a position wasn't specified (i.e. was None) in <snps_to_add>
    mutfo = OrderedDict()
    for snp_pos in positions:
        if snp_pos is None:
            snp_pos = choose_position()
        snpd_positions.add(snp_pos)
        new_base = None
        while new_base is None or new_base == seq[snp_pos]:
            new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)]
        print '        %3d   %s --> %s' % (snp_pos, seq[snp_pos], new_base)
        mutfo[snp_pos] = {'original' : seq[snp_pos], 'new' : new_base}

        seq = seq[: snp_pos] + new_base + seq[snp_pos + 1 :]

    assert utils.codon_unmutated('cyst', seq, cpos, debug=True)  # this is probably unnecessary
    snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo)
    return {'template-gene' : gene, 'gene' : snpd_name, 'seq' : seq}
Exemple #21
0
    def plot(self, base_plotdir, only_csv=False):
        if not self.finalized:
            self.finalize(debug=debug)

        plotdir = base_plotdir + '/allele-finding'

        for old_gene_dir in glob.glob(plotdir + '/*'):  # has to be a bit more hackey than elsewhere, since we have no way of knowing what genes might have had their own directories written last time we wrote to this dir
            if not os.path.isdir(old_gene_dir):
                raise Exception('not a directory: %s' % old_gene_dir)
            utils.prep_dir(old_gene_dir, wildlings=('*.csv', '*.svg'))
            os.rmdir(old_gene_dir)
        utils.prep_dir(plotdir, wildlings=('*.csv', '*.svg'))

        if only_csv:  # not implemented
            return

        start = time.time()
        for gene in self.plotvals:
            if utils.get_region(gene) != 'v':
                continue

            for position in self.plotvals[gene]:
                if position not in self.fitted_positions[gene]:  # we can make plots for the positions we didn't fit, but there's a *lot* of them and they're slow
                    continue
                # if 'allele-finding' not in self.TMPxyvals[gene][position] or self.TMPxyvals[gene][position]['allele-finding'] is None:
                #     continue
                plotting.make_allele_finding_plot(plotdir + '/' + utils.sanitize_name(gene), gene, position, self.plotvals[gene][position])
        print '      allele finding plot time: %.1f' % (time.time()-start)
Exemple #22
0
    def make_transition_plot(self, gene_name, model):
        """ NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """
        fig, ax = plotting.mpl_init()
        fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

        ibin = 0
        print utils.color_gene(utils.unsanitize_name(gene_name))
        legend_colors = set()  # add a color to this the first time you plot it
        for state in model.states:

            # bin label
            ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8)

            sorted_to_states = {}
            for name in state.transitions.keys():
                if name.find('IG') == 0 or name.find('TR') == 0:
                    sorted_to_states[name] = int(paramutils.simplify_state_name(name))
                else:
                    sorted_to_states[name] = name
            sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1))

            total = 0.0
            for to_state, simple_to_state in sorted_to_states:

                prob = state.transitions[to_state]

                alpha = 0.6
                width = 3

                if 'insert' in str(simple_to_state):
                    label = 'insert'
                    color = '#3498db'  # blue
                elif str(simple_to_state) == 'end':
                    label = 'end'
                    color = 'red'
                else:  # regional/internal states
                    assert to_state.find('IG') == 0 or to_state.find('TR') == 0
                    label = 'internal'
                    color = 'green'

                label_to_use = None
                if color not in legend_colors:
                    label_to_use = label
                    legend_colors.add(color)

                # horizontal line at height total+prob
                ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use)

                # vertical line from total to total + prob
                ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width)

                midpoint = 0.5*(prob + 2*total)
                # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state))  # nicely labels the midpoint of the chunk between lines, but there isn't really room for it

                total += prob
    
            ibin += 1

        ax.get_xaxis().set_visible(False)
        plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
 async def send_message(self, update: dict):
     """
     Generates and sends a Discord message
     """
     android = update['android']
     codename = update['codename']
     device = update['device']
     filename = update['filename']
     filesize = update['size']
     version = update['version']
     download = update['download']
     branch = get_branch(version)
     region = get_region(filename, codename, version)
     rom_type = get_type(filename)
     codename = codename.split('_')[0]
     device = device.replace(f' {region}', '')
     desc = f"**Device**: {device} \n" \
         f"**Codename**: `{codename}` \n" \
         f"**Region**: {region} \n" \
         f"**Version**: `{version} | {android}` \n" \
         f"**Size**: {filesize} \n" \
         f"**Download**: [Here]({download})"
     embed = discord.Embed(title=f"New {branch} {rom_type} update available!",
                           color=discord.Colour.orange(), description=desc)
     embed.set_footer(text=f"https://xiaomifirmwareupdater.com/miui/{codename}")
     device = device.lower()
     for name, channel in self.channels.items():
         if device.startswith(name):
             await channel.send(embed=embed)
             print(f"Posted update for {codename} in Discord")
             return
     await self.channels['other'].send(embed=embed)
     print(f"Posted update for {codename} in Discord")
Exemple #24
0
def bundle(rootfs, size=10, filesystem='ext4'):
    log.debug('getting unique snapshot name')
    app = utils.get_turnkey_version(rootfs)
    snapshot_name = utils.get_uniquename(utils.get_region(), app + '.ebs')
    log.info('target snapshot - %s ', snapshot_name)

    log.info('creating volume, attaching, formatting and mounting')
    volume = Volume()
    volume.create(size)

    device = Device()
    volume.attach(utils.get_instanceid(), device)

    device.mkfs(filesystem)
    mount_path = rootfs + '.mount'
    device.mount(mount_path)

    log.info('syncing rootfs to volume')
    utils.rsync(rootfs, mount_path)

    device.umount()
    volume.detach()
    os.removedirs(mount_path)

    log.info('creating snapshot from volume')
    snapshot = Snapshot()
    snapshot.create(volume.vol.id, snapshot_name)
    volume.delete()

    log.info("complete - %s %s", snapshot.snap.id, snapshot.snap.description)
    return snapshot.snap.id, snapshot.snap.description
Exemple #25
0
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the
    form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'}
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    """

    template_gene = newfo["template-gene"]
    region = utils.get_region(template_gene)
    if template_gene not in glfo["seqs"][region]:
        raise Exception("unknown template gene %s" % template_gene)

    new_gene = newfo["gene"]

    if region == "v":
        glfo["cyst-positions"][new_gene] = glfo["cyst-positions"][template_gene]
    elif region == "j":
        glfo["tryp-positions"][new_gene] = glfo["tryp-positions"][template_gene]

    glfo["seqs"][region][new_gene] = newfo["seq"]

    if debug:
        print "    adding new allele to glfo:"
        print "      template %s   %s" % (glfo["seqs"][region][template_gene], utils.color_gene(template_gene))
        print "           new %s   %s" % (
            utils.color_mutants(glfo["seqs"][region][template_gene], newfo["seq"]),
            utils.color_gene(new_gene),
        )

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
Exemple #26
0
def generate_snpd_gene(gene, cpos, seq, positions):
    assert utils.get_region(gene) == 'v'  # others not yet handled
    def choose_position():
        snp_pos = None
        while snp_pos is None or snp_pos in snpd_positions or not utils.check_conserved_cysteine(tmpseq, cpos, debug=True, assert_on_fail=False):
            snp_pos = random.randint(10, len(seq) - 15)  # note that randint() is inclusive
            tmpseq = seq[: snp_pos] + 'X' + seq[snp_pos + 1 :]  # for checking cyst position
        return snp_pos

    snpd_positions = set()  # only used if a position wasn't specified (i.e. was None) in <snps_to_add>
    mutfo = OrderedDict()
    for snp_pos in positions:
        if snp_pos is None:
            snp_pos = choose_position()
        snpd_positions.add(snp_pos)
        new_base = None
        while new_base is None or new_base == seq[snp_pos]:
            new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)]
        print '        %3d   %s --> %s' % (snp_pos, seq[snp_pos], new_base)
        mutfo[snp_pos] = {'original' : seq[snp_pos], 'new' : new_base}

        seq = seq[: snp_pos] + new_base + seq[snp_pos + 1 :]

    utils.check_conserved_cysteine(seq, cpos)
    snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo)
    return {'template-gene' : gene, 'gene' : snpd_name, 'seq' : seq}
Exemple #27
0
def generate_snpd_gene(gene, cpos, seq, positions):
    assert utils.get_region(gene) == "v"  # others not yet handled

    def choose_position():
        snp_pos = None
        while snp_pos is None or snp_pos in snpd_positions or not utils.codon_ok("cyst", tmpseq, cpos, debug=True):
            snp_pos = random.randint(10, len(seq) - 15)  # note that randint() is inclusive
            tmpseq = seq[:snp_pos] + "X" + seq[snp_pos + 1 :]  # for checking cyst position
        return snp_pos

    snpd_positions = set()  # only used if a position wasn't specified (i.e. was None) in <snps_to_add>
    mutfo = OrderedDict()
    for snp_pos in positions:
        if snp_pos is None:
            snp_pos = choose_position()
        snpd_positions.add(snp_pos)
        new_base = None
        while new_base is None or new_base == seq[snp_pos]:
            new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)]
        print "        %3d   %s --> %s" % (snp_pos, seq[snp_pos], new_base)
        mutfo[snp_pos] = {"original": seq[snp_pos], "new": new_base}

        seq = seq[:snp_pos] + new_base + seq[snp_pos + 1 :]

    assert utils.codon_ok("cyst", seq, cpos, debug=True)  # this is probably unnecessary
    snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo)
    return {"template-gene": gene, "gene": snpd_name, "seq": seq}
Exemple #28
0
    def finalize(self, debug=False):
        assert not self.finalized

        self.mfreqer.finalize()

        start = time.time()
        gene_results = {'not_enough_obs_to_fit' : set(), 'didnt_find_anything_with_fit' : set(), 'new_allele' : set()}
        if debug:
            print '\nlooking for new alleles:'
        for gene in sorted(self.mfreqer.counts):
            if utils.get_region(gene) != 'v':
                continue
            if debug:
                print '\n%s (observed %d %s)' % (utils.color_gene(gene), self.gene_obs_counts[gene], utils.plural_str('time', self.gene_obs_counts[gene]))

            positions_to_try_to_fit, xyvals = self.get_positions_to_fit(gene, gene_results, debug=debug)
            if positions_to_try_to_fit is None:
                continue

            fitfo = {n : {} for n in ('min_snp_ratios', 'candidates')}
            for istart in range(1, self.n_max_snps):
                if debug:
                    if istart == 1:
                        print '                                 resid. / ndof'
                        print '             position   ratio   (m=0 / m>%5.2f)       muted / obs ' % self.big_y_icpt_bounds[0]
                    print '  %d %s' % (istart, utils.plural_str('snp', istart))

                subxyvals = {pos : {k : v[istart : istart + self.max_fit_length] for k, v in xyvals[pos].items()} for pos in positions_to_try_to_fit}
                self.fit_istart(gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=debug)
                if istart not in fitfo['candidates']:  # if it didn't get filled, we didn't have enough observations to do the fit
                    break

            istart_candidates = []
            if debug:
                print '  evaluating each snp hypothesis'
                print '    snps       min ratio'
            for istart in fitfo['candidates']:
                if debug:
                    print '    %2d     %9s' % (istart, fstr(fitfo['min_snp_ratios'][istart])),
                if self.is_a_candidate(gene, fitfo, istart, debug=debug):
                    istart_candidates.append(istart)

            if len(istart_candidates) > 0:
                n_candidate_snps = min(istart_candidates)  # add the candidate with the smallest number of snps to the germline set, and run again (if the firs
                gene_results['new_allele'].add(gene)
                print '\n    found a new allele candidate separated from %s by %d %s at %s:' % (utils.color_gene(gene), n_candidate_snps,
                                                                                                utils.plural_str('snp', n_candidate_snps), utils.plural_str('position', n_candidate_snps)),
                self.add_new_allele(gene, fitfo, n_candidate_snps, debug=debug)
            else:
                gene_results['didnt_find_anything_with_fit'].add(gene)
                if debug:
                    print '  no new alleles'

        if debug:
            print 'found new alleles for %d %s (there were also %d without new alleles, and %d without enough observations to fit)' % (len(gene_results['new_allele']), utils.plural_str('gene', len(gene_results['new_allele'])),
                                                                                                                                       len(gene_results['didnt_find_anything_with_fit']), len(gene_results['not_enough_obs_to_fit']))
            print '      allele finding time: %.1f' % (time.time()-start)

        self.finalized = True
Exemple #29
0
    def __init__(self, base_indir, outdir, gene_name, naivety, glfo, args):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.cyst_positions = glfo['cyst-positions']
        self.tryp_positions = glfo['tryp-positions']

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.naivety = naivety
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}
        self.insertion_content_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
Exemple #30
0
def remove_gene(glfo, gene, debug=False):
    """ remove <gene> from <glfo> """
    if debug:
        print '  removing %s from glfo' % utils.color_gene(gene)
    region = utils.get_region(gene)
    if region in utils.conserved_codons:
        del glfo[utils.conserved_codons[region] + '-positions'][gene]
    del glfo['seqs'][region][gene]
Exemple #31
0
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False):
    n_skipped_pseudogenes = 0
    seq_to_gene_map = {}
    for seqfo in utils.read_fastx(fname):
        # first get gene name
        if seqfo['name'][:2] != 'IG' and seqfo['name'][:2] != 'TR':  # if it's an imgt file, with a bunch of header info (and the accession number first)
            gene = seqfo['infostrs'][imgt_info_indices.index('gene')]
            functionality = seqfo['infostrs'][imgt_info_indices.index('functionality')]
            if functionality not in functionalities:
                raise Exception('unexpected functionality %s in %s' % (functionality, fname))
            if skip_pseudogenes and functionality in pseudogene_funcionalities:
                n_skipped_pseudogenes += 1
                continue
        else:  # plain fasta with just the gene name after the '>'
            gene = seqfo['name']
        utils.split_gene(gene)  # just to check if it's a valid gene name
        if not aligned and utils.get_region(gene) != utils.get_region(os.path.basename(fname)):  # if <aligned> is True, file name is expected to be whatever
            raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene)))
        if gene in seqs[utils.get_region(gene)]:
            raise Exception('gene name %s appears twice in %s' % (gene, fname))

        # then the sequence
        seq = seqfo['seq']
        if not aligned:
            seq = utils.remove_gaps(seq)
        if 'Y' in seq:
            print '      replacing Y --> N (%d of \'em) in %s' % (seq.count('Y'), utils.color_gene(gene))
            seq = seq.replace('Y', 'N')
        if len(seq.strip(''.join(utils.expected_characters))) > 0:  # return the empty string if it only contains expected characters
            raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters)))
        if seq not in seq_to_gene_map:
            seq_to_gene_map[seq] = []
        seq_to_gene_map[seq].append(gene)

        seqs[utils.get_region(gene)][gene] = seq

    tmpcounts = [len(gl) for gl in seq_to_gene_map.values()]  # number of names corresponding to each sequence (should all be ones)
    if tmpcounts.count(1) != len(tmpcounts):
        print '  mutliple names in %s for the following sequences:' % fname
        for seq, genelist in seq_to_gene_map.items():
            if len(genelist) > 1:
                print '    %-50s   %s' % (' '.join(genelist), seq)
        raise Exception('please de-duplicate the fasta and re-run.')

    if n_skipped_pseudogenes > 0:
        print '    skipped %d %s pseudogenes (leaving %d)' % (n_skipped_pseudogenes, utils.get_region(os.path.basename(fname)), len(seqs[utils.get_region(os.path.basename(fname))]))
Exemple #32
0
    def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
        if not self.finalized:
            self.finalize()

        plotdir = base_plotdir + '/mute-freqs'
        overall_plotdir = plotdir + '/overall'
        utils.prep_dir(overall_plotdir, multilings=('*.csv', '*.svg'))
        for region in utils.regions:
            utils.prep_dir(plotdir + '/' + region, multilings=('*.csv', '*.svg'))
            # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))
        if self.tigger:
            utils.prep_dir(plotdir + '/tigger', multilings=('*.csv', '*.svg'))

        for gene in self.freqs:
            freqs = self.freqs[gene]
            sorted_positions = sorted(freqs.keys())
            genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
            for position in sorted_positions:
                hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err'])
                lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err'])
                err = 0.5*(hi_diff + lo_diff)
                genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err)
            xline = None
            figsize = [3, 3]
            if utils.get_region(gene) == 'v' and cyst_positions is not None:
                xline = cyst_positions[gene]
                figsize[0] *= 3.5
            elif utils.get_region(gene) == 'j' and tryp_positions is not None:
                xline = tryp_positions[gene]
                figsize[0] *= 2
            plotting.draw_no_root(genehist, plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
            # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

        # make mean mute freq hists
        plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
        for region in utils.regions:
            plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

        if self.tigger:
            self.tigger_plot(only_csv)

        if not only_csv:  # write html file and fix permissiions
            plotting.make_html(overall_plotdir)
            for region in utils.regions:
                plotting.make_html(plotdir + '/' + region, n_columns=1)
Exemple #33
0
def figure_out_which_damn_gene(germline_seqs, gene_name, seq, debug=False):
    region = utils.get_region(gene_name)
    seq = seq.replace(' ', '')
    if gene_name in germline_seqs[region]:  # already have it, but maybe when we added it before it was a shorter match, so substitute with the new longer match
        if len(seq) > len(germline_seqs[region][gene_name]):
            print '      gl match longer than gl!'
            print '       ', seq
            print '       ', germline_seqs[region][gene_name]
            germline_seqs[region][gene_name] = seq
        return gene_name
    candidates = []

    # if it doesn't specify an allele, see if any of the alleles we've got have the same sequence in the match region
    if gene_name.find('*') == -1:
        for candidate_gene in germline_seqs[region]:
            if candidate_gene.find(gene_name) == 0:
                if seq in germline_seqs[region][candidate_gene]:
                     candidates.append(candidate_gene)

    # if it *does* specify an allele, see if any of the other allele have the same sequence in the match region
    if len(candidates) == 0:  # didn't find anything... try other alleles
        for candidate_gene in germline_seqs[region]:
            if utils.are_alleles(candidate_gene, gene_name):
                if seq in germline_seqs[region][candidate_gene]:
                    candidates.append(candidate_gene)

    # sometimes it's 3-9, but sometimes 3-09. *grrrrrr*.
    if len(candidates) == 0:
        for candidate_gene in germline_seqs[region]:
            if gene_name.replace('-0', '-') == candidate_gene:
                if seq in germline_seqs[region][candidate_gene]:
                    candidates.append(candidate_gene)

    # try adding _F and _P to the end of j names
    if len(candidates) == 0:
        for candidate_gene in germline_seqs[region]:
            if gene_name + '_F' == candidate_gene or gene_name + '_P' == candidate_gene:
                if seq[ : len(germline_seqs[region][candidate_gene])] in germline_seqs[region][candidate_gene]:  # shorten <seq> to account for extra bases on right of imgt j versions
                    candidates.append(candidate_gene)

    # try removing the darn R at the end (and remove the zero). I hope it doesn't mean anything important
    if len(candidates) == 0:
        for candidate_gene in germline_seqs[region]:
            if gene_name.replace('R', '').replace('-0', '-') == candidate_gene:
                if seq in germline_seqs[region][candidate_gene]:
                    candidates.append(candidate_gene)
        
    if len(candidates) == 0:
        print '    ERROR didn\'t find jack for', gene_name, seq
        assert False
    # elif len(candidates) > 1:
    #     print 'NOTE found',len(candidates),'candidates, just using the first one'

    if debug:
        print '     swapping', gene_name, '-->', candidates[0]

    return candidates[0]
Exemple #34
0
def add_some_snps(snps_to_add, glfo, remove_template_genes=False, debug=False):
    """
    Generate some snp'd genes and add them to glfo, specified with <snps_to_add>.
    e.g. [{'gene' : 'IGHV3-71*01', 'positions' : (35, None)}, ] will add a snp at position 35 and at a random location.
    The resulting snp'd gene will have a name like IGHV3-71*01+C35T.T47G
    """

    templates_to_remove = set()

    added_snp_names = []
    for isnp in range(len(snps_to_add)):
        snpinfo = snps_to_add[isnp]
        gene, positions = snpinfo["gene"], snpinfo["positions"]
        print "    adding %d %s to %s" % (len(positions), utils.plural_str("snp", len(positions)), gene)
        seq = glfo["seqs"][utils.get_region(gene)][gene]
        assert utils.get_region(gene) == "v"
        cpos = glfo["cyst-positions"][gene]

        snpfo = None
        itry = 0
        while snpfo is None or snpfo["gene"] in glfo["seqs"][utils.get_region(gene)]:
            if itry > 0:
                print "      already in glfo, try again"
                if itry > 99:
                    raise Exception(
                        "too many tries while trying to generate new snps -- did you specify a lot of snps on the same position?"
                    )
            snpfo = generate_snpd_gene(gene, cpos, seq, positions)
            itry += 1

        if remove_template_genes:
            templates_to_remove.add(gene)
        add_new_allele(
            glfo, snpfo, remove_template_genes=False, debug=debug
        )  # *don't* remove the templates here, since we don't know if there's another snp later that needs them
        added_snp_names.append(snpfo["gene"])

    remove_the_stupid_godamn_template_genes_all_at_once(
        glfo, templates_to_remove
    )  # works fine with zero-length <templates_to_remove>

    return (
        added_snp_names
    )  # need the order of the names so we can get allele prevalence freqs from the command line right
Exemple #35
0
    def process_query(self, bam, reads, perfplotter=None):
        primary = next((r for r in reads if not r.is_secondary), None)
        query_seq = primary.seq
        try:
            query_name = int(
                primary.qname
            )  # if it's just one of my hashes, we want it as an int
        except ValueError:
            query_name = primary.qname  # but if it's someone else's random-ass alphasymbolonumeric string we'll just leave it as-is
        raw_best = {}
        all_match_names = {}
        warnings = {}  # ick, this is a messy way to pass stuff around
        for region in utils.regions:
            all_match_names[region] = []
        all_query_bounds, all_germline_bounds = {}, {}
        for read in reads:  # loop over the matches found for each query sequence
            read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
            gene = bam.references[read.tid]
            region = utils.get_region(gene)
            warnings[gene] = ''

            if region not in raw_best:  # best v, d, and j before multiplying by gene choice probs. needed 'cause *these* are the v and j that get excised
                raw_best[region] = gene

            raw_score = read.tags[0][
                1]  # raw because they don't include the gene choice probs
            score = raw_score
            if self.args.apply_choice_probs_in_sw:  # NOTE I stopped applying the gene choice probs here because the smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well)
                score = self.get_choice_prob(
                    region, gene
                ) * raw_score  # multiply by the probability to choose this gene
            # set bounds
            qrbounds = (read.qstart, read.qend)
            glbounds = (read.pos, read.aend)

            assert qrbounds[1] - qrbounds[0] == glbounds[1] - glbounds[0]

            assert qrbounds[1] <= len(query_seq)
            if glbounds[1] > len(self.germline_seqs[region][gene]):
                print '  ', gene
                print '  ', glbounds[1], len(self.germline_seqs[region][gene])
                print '  ', self.germline_seqs[region][gene]
            assert glbounds[1] <= len(self.germline_seqs[region][gene])

            assert qrbounds[1] - qrbounds[0] == glbounds[1] - glbounds[0]

            all_match_names[region].append(
                (score, gene)
            )  # NOTE it is important that this is ordered such that the best match is first
            all_query_bounds[gene] = qrbounds
            all_germline_bounds[gene] = glbounds

        self.summarize_query(query_name, query_seq, raw_best, all_match_names,
                             all_query_bounds, all_germline_bounds,
                             perfplotter, warnings)
Exemple #36
0
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args):
        self.indir = base_indir
        self.args = args

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.allow_unphysical_insertions = self.args.allow_unphysical_insertions # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries

        self.v_3p_del_pseudocount_limit = 10  # add at least one entry 

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.region = utils.get_region(gene_name)
        self.naivety = naivety
        self.germline_seq = germline_seq
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            if self.allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if self.allow_unphysical_insertions:
                self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', list(utils.nukes))
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, {'nukes':list(utils.nukes)})  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
Exemple #37
0
 def getvalstr(gene, val):
     if gene is None or (utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene))):
         return '%s  %5.2s  %s  %-16s%s' % (cstr, ' - ', cstr, ' - ', 4 * ' ' if latex else '')
     else:
         if latex:
             gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
             if emph_genes is not None and gene in emph_genes:
                 gstr = '\\color{red}{\\textbf{%s}}' % gstr
         else:
             gstr = utils.color_gene(gene, width=18)
         return '%s  %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr)
Exemple #38
0
    def fit_istart(self, gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=False):
        residuals = {}
        for pos in positions_to_try_to_fit:
            # skip positions that are too close to the 5' end of V (misassigned insertions look like snps)
            if pos > len(self.glfo['seqs'][utils.get_region(gene)][gene]) - self.n_five_prime_positions_to_exclude - 1:
                continue

            # as long as we already have a few non-candidate positions, skip positions that have no frequencies greater than the min y intercept (note that they could in principle still have a large y intercept, but we don't really care)
            if len(residuals) > istart + self.min_non_candidate_positions_to_fit and len([f for f in subxyvals[pos]['freqs'] if f > self.min_y_intercept]) == 0:
                continue

            if sum(subxyvals[pos]['total']) < self.n_total_min:
                continue

            # also skip positions that only have a few points to fit (i.e. genes that were very rare, or I guess maybe if they were always eroded past this position)
            if len(subxyvals[pos]['n_mutelist']) < 3:
                continue

            zero_icpt_fit = self.get_curvefit(subxyvals[pos]['n_mutelist'], subxyvals[pos]['freqs'], subxyvals[pos]['errs'], y_icpt_bounds=(0. - self.small_number, 0. + self.small_number))
            big_icpt_fit = self.get_curvefit(subxyvals[pos]['n_mutelist'], subxyvals[pos]['freqs'], subxyvals[pos]['errs'], y_icpt_bounds=self.big_y_icpt_bounds)

            residuals[pos] = {'zero_icpt' : zero_icpt_fit['residuals_over_ndof'], 'big_icpt' : big_icpt_fit['residuals_over_ndof']}

            self.fitted_positions[gene].add(pos)  # if we already did the fit for another <istart>, it'll already be in there

        if len(residuals) <= istart:  # needs to be at least one longer, so we have the first-non-snp
            if debug:
                print '      not enough observations to fit more than %d snps' % (istart - 1)
            return

        residual_ratios = {pos : float('inf') if r['big_icpt'] == 0. else r['zero_icpt'] / r['big_icpt'] for pos, r in residuals.items()}
        sorted_ratios = sorted(residual_ratios.items(), key=operator.itemgetter(1), reverse=True)  # sort the positions in decreasing order of residual ratio
        candidate_snps = [pos for pos, _ in sorted_ratios[:istart]]  # the first <istart> positions are the "candidate snps"
        max_non_snp, max_non_snp_ratio = sorted_ratios[istart]  # position and ratio for largest non-candidate
        min_candidate_ratio = min([residual_ratios[cs] for cs in candidate_snps])

        # fitfo['scores'][istart] = (min_candidate_ratio - max_non_snp_ratio) / max(self.small_number, max_non_snp_ratio)
        fitfo['min_snp_ratios'][istart] = min([residual_ratios[cs] for cs in candidate_snps])
        fitfo['candidates'][istart] = {cp : residual_ratios[cp] for cp in candidate_snps}

        if debug:
            # if debug > 1:
            #     print '%70s %s' % ('', ''.join(['%11d' % nm for nm in subxyvals[max_non_snp]['n_mutelist']]))
            for pos in candidate_snps + [max_non_snp, ]:
                xtrastrs = ('[', ']') if pos == max_non_snp else (' ', ' ')
                pos_str = '%3s' % str(pos)
                if residual_ratios[pos] > self.min_min_candidate_ratio:
                    pos_str = utils.color('yellow', pos_str)
                print '               %s %s    %5s   (%5s / %-5s)       %4d / %-4d %s' % (xtrastrs[0], pos_str, fstr(residual_ratios[pos]),
                                                                                       fstr(residuals[pos]['zero_icpt']), fstr(residuals[pos]['big_icpt']),
                                                                                       sum(subxyvals[pos]['obs']), sum(subxyvals[pos]['total']), xtrastrs[1]),
                # if debug > 1:
                #     print '      ', ''.join(['%4d / %-4d' % (subxyvals[pos]['obs'][inm], subxyvals[pos]['total'][inm]) for inm in range(len(subxyvals[pos]['n_mutelist']))])
                print ''
Exemple #39
0
    def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
        if not self.finalized:
            self.finalize()

        plotdir = base_plotdir + '/mute-freqs'
        utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
        for region in utils.regions:
            utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
            # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

        for gene in self.counts:
            counts, plotting_info = self.counts[gene], self.plotting_info[gene]
            sorted_positions = sorted(counts)
            genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
            for position in sorted_positions:
                hi_diff = abs(counts[position]['freq'] - counts[position]['freq_hi_err'])
                lo_diff = abs(counts[position]['freq'] - counts[position]['freq_lo_err'])
                err = 0.5*(hi_diff + lo_diff)
                genehist.set_ibin(genehist.find_bin(position), counts[position]['freq'], error=err)
            xline = None
            figsize = [3, 3]
            if utils.get_region(gene) == 'v' and cyst_positions is not None:
                xline = cyst_positions[gene]['cysteine-position']
                figsize[0] *= 3.5
            elif utils.get_region(gene) == 'j' and tryp_positions is not None:
                xline = int(tryp_positions[gene])
                figsize[0] *= 2
            plotting.draw_no_root(genehist, plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
            # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

        # make mean mute freq hists
        plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
        for region in utils.regions:
            plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

        if not only_csv:  # write html file and fix permissiions
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            for region in utils.regions:
                check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
                # check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
Exemple #40
0
def remove_gene(glfo, gene, debug=False):
    """ remove <gene> from <glfo> """
    region = utils.get_region(gene)
    if gene in glfo['seqs'][region]:
        if debug:
            print '  removing %s from glfo' % utils.color_gene(gene)
        del glfo['seqs'][region][gene]
        if region in utils.conserved_codons[glfo['locus']]:
            del glfo[utils.conserved_codons[glfo['locus']][region] + '-positions'][gene]
    else:
        if debug:
            print '  can\'t remove %s from glfo, it\'s not there' % utils.color_gene(gene)
Exemple #41
0
def remove_gene(glfo, gene, debug=False):
    """ remove <gene> from <glfo> """
    region = utils.get_region(gene)
    if gene in glfo["seqs"][region]:
        if debug:
            print "  removing %s from glfo" % utils.color_gene(gene)
        del glfo["seqs"][region][gene]
        if region in utils.conserved_codons[glfo["chain"]]:
            del glfo[utils.conserved_codons[glfo["chain"]][region] + "-positions"][gene]
    else:
        if debug:
            print "  can't remove %s from glfo, it's not there" % utils.color_gene(gene)
Exemple #42
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.locus].items()}

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

        self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, locus=self.args.locus, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
Exemple #43
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.chain].items()}

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

        self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
Exemple #44
0
 def finalize_tigger(self):
     utils.prep_dir(os.getenv('www') + '/partis/tmp', wildling='*.svg')
     for gene in self.counts:
         if utils.get_region(gene) != 'v':
             continue
         print '\n%s' % gene
         print ' position         x-icpt                  y-icpt                   slope              mut / total'
         mean_x_icpt = {'sum' : 0., 'total' : 0.}
         for position in sorted(self.counts[gene].keys()):
             self.freqs[gene][position]['tigger'] = self.tigger_calcs(position, self.counts[gene][position], mean_x_icpt)
         print mean_x_icpt
         if mean_x_icpt['total'] > 0.:
             print mean_x_icpt['sum'] / mean_x_icpt['total']
     assert False
     for gene in self.freqs:
         if utils.get_region(gene) != 'v':
             continue
         info = {p : self.freqs[gene][p]['tigger-fits'] for p in self.freqs[gene]}
         x_intercepts = [-v['intercept'] / v['slope'] for k, v in info.items() if v['intercept'] is not None and v['intercept'] < 0.3]
         print sorted(x_intercepts)
         print sum(x_intercepts) / float(len(x_intercepts))
         print numpy.median(x_intercepts)
Exemple #45
0
    def write_mute_freqs(
        self, gene, seq, reco_event, reco_seq_fname
    ):  # TODO unsurprisingly, this function profiles out to be kind of a dumb way to do it, in terms of run time
        """ Read position-by-position mute freqs from disk for <gene>, renormalize, then write to a file for bppseqgen. """
        mute_freqs = self.get_mute_freqs(gene)

        rates = [
        ]  # list with a relative mutation rate for each position in <seq>
        total = 0.0
        # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
        left_erosion_length = dict(reco_event.erosions.items() +
                                   reco_event.effective_erosions.items())[
                                       utils.get_region(gene) + '_5p']
        for inuke in range(len(seq)):  # append a freq for each nuke
            position = inuke + left_erosion_length
            freq = 0.0
            if position in mute_freqs:
                freq = mute_freqs[position]
            else:
                freq = mute_freqs['overall_mean']
            rates.append(freq)
            total += freq

        # normalize to the number of sites (i.e. so an average site is given value 1.0)
        assert total != 0.0  # I am not hip enough to divide by zero
        for inuke in range(len(seq)):
            rates[inuke] *= float(len(seq)) / total
        total = 0.0

        # and... double check it, just for shits and giggles
        for inuke in range(len(seq)):
            total += rates[inuke]
        assert utils.is_normed(total / float(len(seq)))
        assert len(rates) == len(
            seq
        )  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

        # write the input file for bppseqgen, one base per line
        with open(reco_seq_fname, 'w') as reco_seq_file:
            # NOTE really not sure why this doesn't really [seems to require an "extra" column] work with csv.DictWriter, but it doesn't -- bppseqgen barfs (I think maybe it expects a different newline character? don't feel like working it out)
            headstr = 'state'
            if not self.args.mutate_from_scratch:
                headstr += '\trate'
            reco_seq_file.write(headstr + '\n')
            for inuke in range(len(seq)):
                linestr = seq[inuke]
                if not self.args.mutate_from_scratch:
                    linestr += '\t%f' % rates[inuke]
                reco_seq_file.write(linestr + '\n')
Exemple #46
0
def add_some_snps(snps_to_add, glfo, remove_template_genes=False, debug=False):
    """
    Generate some snp'd genes and add them to glfo, specified with <snps_to_add>.
    e.g. [{'gene' : 'IGHV3-71*01', 'positions' : (35, None)}, ] will add a snp at position 35 and at a random location.
    The resulting snp'd gene will have a name like IGHV3-71*01+C35T.T47G
    """

    templates_to_remove = set()

    added_snp_names = []
    for isnp in range(len(snps_to_add)):
        snpinfo = snps_to_add[isnp]
        gene, positions = snpinfo['gene'], snpinfo['positions']
        print '    adding %d %s to %s' % (len(positions), utils.plural_str('snp', len(positions)), gene)
        seq = glfo['seqs'][utils.get_region(gene)][gene]
        assert utils.get_region(gene) == 'v'
        cpos = glfo['cyst-positions'][gene]

        snpfo = None
        itry = 0
        while snpfo is None or snpfo['gene'] in glfo['seqs'][utils.get_region(gene)]:
            if itry > 0:
                print '      already in glfo, try again'
                if itry > 99:
                    raise Exception('too many tries while trying to generate new snps -- did you specify a lot of snps on the same position?')
            snpfo = generate_snpd_gene(gene, cpos, seq, positions)
            itry += 1

        if remove_template_genes:
            templates_to_remove.add(gene)
        add_new_allele(glfo, snpfo, remove_template_genes=False, debug=debug)  # *don't* remove the templates here, since we don't know if there's another snp later that needs them
        added_snp_names.append(snpfo['gene'])

    remove_the_stupid_godamn_template_genes_all_at_once(glfo, templates_to_remove)  # works fine with zero-length <templates_to_remove>

    return added_snp_names  # need the order of the names so we can get allele prevalence freqs from the command line right
Exemple #47
0
def read_allele_prevalence_freqs(fname, debug=False):
    # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
    allele_prevalence_freqs = {r: {} for r in utils.regions}
    with open(fname) as pfile:
        reader = csv.DictReader(pfile)
        for line in reader:
            allele_prevalence_freqs[utils.get_region(line["gene"])][line["gene"]] = float(line["freq"])
    for region in utils.regions:
        if len(allele_prevalence_freqs[region]) == 0:
            continue
        if debug:
            for gene, freq in allele_prevalence_freqs[region].items():
                print "%14.8f   %s" % (freq, utils.color_gene(gene))
        assert utils.is_normed(allele_prevalence_freqs[region])
    return allele_prevalence_freqs
Exemple #48
0
def read_allele_prevalence_freqs(fname, debug=False):
    # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
    allele_prevalence_freqs = {r : {} for r in utils.regions}
    with open(fname) as pfile:
        reader = csv.DictReader(pfile)
        for line in reader:
            allele_prevalence_freqs[utils.get_region(line['gene'])][line['gene']] = float(line['freq'])
    for region in utils.regions:
        if len(allele_prevalence_freqs[region]) == 0:
            continue
        if debug:
            for gene, freq in allele_prevalence_freqs[region].items():
                print '%14.8f   %s' % (freq, utils.color_gene(gene))
        assert utils.is_normed(allele_prevalence_freqs[region])
    return allele_prevalence_freqs
Exemple #49
0
 def get_allowed_genes(self, parameter_dir):
     allowed_genes = {}
     for region in [r for r in utils.regions if r != 'd']:
         genes_in_file = set()
         with open(parameter_dir + '/' + utils.get_parameter_fname(column=region + '_gene', deps=utils.column_dependencies[region + '_gene'])) as csvfile:
             reader = csv.DictReader(csvfile)
             for line in reader:
                 genes_in_file.add(line[region + '_gene'])
         allowed_genes[region] = genes_in_file
         if self.args.only_genes is not None:  # if --only-genes was specified, not only does the gene have to be in the parameter file, but it has to be among --only-genes
             regional_only_genes = set(g for g in self.args.only_genes if utils.get_region(g) == region)
             if len(regional_only_genes - genes_in_file) > 0:  # if command line asked for genes that aren't in the file
                 raise Exception('genes %s specified with --only-genes are not present in %s, so there\'s no information with which to simulate' % (' '.join(regional_only_genes - genes_in_file), parameter_dir))
             allowed_genes[region] &= regional_only_genes
     return allowed_genes
Exemple #50
0
def make_allele_finding_plot(plotdir, gene, position, values, xmax, fitfos=None):
    xmin, xmax = -0.3, xmax
    fig, ax = mpl_init()

    ax.errorbar(values['n_mutelist'], values['freqs'], yerr=values['errs'], markersize=15, linewidth=2, marker='.')  #, title='position ' + str(position))

    if fitfos is not None:  # fitted lines
        colors = {'prefo' : 'red', 'postfo' : 'red', 'onefo' : 'green'}
        for ftype in colors:
            if fitfos[ftype]['xvals'] is None:  # not really sure why this happens... probably zero-point fits?
                continue
            linevals = [fitfos[ftype]['slope']*x + fitfos[ftype]['y_icpt'] for x in fitfos[ftype]['xvals']]
            ax.plot(fitfos[ftype]['xvals'], linevals, color=colors[ftype])

    ax.plot([xmin, xmax], [0, 0], linestyle='dashed', alpha=0.5, color='black')
    ymax = max(values['freqs']) + max(values['errs'])
    mpl_finish(ax, plotdir, str(position), xlabel='mutations in %s segment' % utils.get_region(gene), ylabel='position\'s mut freq', xbounds=(xmin, xmax), ybounds=(-0.01, ymax), leg_loc=(0.95, 0.1), adjust={'right' : 0.85}, title='position ' + str(position) + ' in ' + gene)
Exemple #51
0
def make_allele_finding_plot(plotdir, gene, position, values, xmax, fitfos=None):
    xmin, xmax = -0.3, xmax
    fig, ax = mpl_init()

    ax.errorbar(values['n_mutelist'], values['freqs'], yerr=values['errs'], markersize=15, linewidth=2, marker='.')  #, title='position ' + str(position))

    if fitfos is not None:  # fitted lines
        colors = {'prefo' : 'red', 'postfo' : 'red', 'onefo' : 'green'}
        for ftype in colors:
            if fitfos[ftype]['xvals'] is None:  # not really sure why this happens... probably zero-point fits?
                continue
            linevals = [fitfos[ftype]['slope']*x + fitfos[ftype]['y_icpt'] for x in fitfos[ftype]['xvals']]
            ax.plot(fitfos[ftype]['xvals'], linevals, color=colors[ftype])

    ax.plot([xmin, xmax], [0, 0], linestyle='dashed', alpha=0.5, color='black')
    ymax = max(values['freqs']) + max(values['errs'])
    mpl_finish(ax, plotdir, str(position), xlabel='mutations in %s segment' % utils.get_region(gene), ylabel='position\'s mut freq', xbounds=(xmin, xmax), ybounds=(-0.01, ymax), leg_loc=(0.95, 0.1), adjust={'right' : 0.85}, title='position ' + str(position) + ' in ' + gene)
Exemple #52
0
def restrict_to_genes(glfo, only_genes, debug=False):
    """
    Remove from <glfo> any genes which are not in <only_genes>.
    Any regions which are not represented in in a non-None <only_genes> will be unrestricted (i.e. any gene from that region is fair game).
    """
    if only_genes is None:
        return

    restricted_regions = set([utils.get_region(g) for g in only_genes])
    unrestricted_regions = set(utils.regions) - restricted_regions

    only_genes_not_in_glfo = set(only_genes) - set([g for r in restricted_regions for g in glfo['seqs'][r]])
    if len(only_genes_not_in_glfo) > 0:
        print '  %s genes %s in <only_genes> aren\'t in glfo to begin with' % (utils.color('red', 'warning'), ' '.join(only_genes_not_in_glfo))

    genes_to_remove = set([g for r in restricted_regions for g in glfo['seqs'][r]]) - set(only_genes)
    if debug:
        print '    removing %d genes from glfo' % len(genes_to_remove)
    remove_genes(glfo, genes_to_remove)
Exemple #53
0
def make_mutefreq_plot(plotdir, gene_name, positions):
    import plotting
    """ NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py. """
    nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'}
    fig, ax = plotting.mpl_init()
    fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

    ibin = 0
    print utils.color_gene(utils.unsanitize_name(gene_name))
    legend_colors = set()
    for info in positions:
        posname = info['name']

        # make label below bin
        ax.text(-0.5 + ibin, -0.075, simplify_state_name(posname), rotation='vertical', size=8)

        total = 0.0
        alpha = 0.6
        for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True):
            color = nuke_colors[nuke]

            label_to_use = None
            if color not in legend_colors:
                label_to_use = nuke
                legend_colors.add(color)

            # horizontal line at height total+prob
            ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use)

            # vertical line from total to total + prob
            ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3)

            # # write [ACGT] at midpoint between total and total+prob
            # midpoint = 0.5*(prob + 2*total)
            # ... *redacted*

            total += prob

        ibin += 1

    ax.get_xaxis().set_visible(False)
    plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
    def write_hmms(self, parameter_dir, sw_matches):
        print 'writing hmms with info from %s' % parameter_dir
        start = time.time()
        from hmmwriter import HmmWriter
        hmm_dir = parameter_dir + '/hmms'
        utils.prep_dir(hmm_dir, '*.yaml')

        gene_list = self.args.only_genes
        if gene_list == None:  # if specific genes weren't specified, do the ones for which we have matches
            gene_list = []
            for region in utils.regions:
                for gene in self.germline_seqs[region]:
                    if sw_matches == None or gene in sw_matches:  # shouldn't be None really, but I'm testing something
                        gene_list.append(gene)

        for gene in gene_list:
            if self.args.debug:
                print '  %s' % utils.color_gene(gene)
            writer = HmmWriter(
                parameter_dir, hmm_dir, gene, self.args.naivety,
                self.germline_seqs[utils.get_region(gene)][gene], self.args)
            writer.write()

        print '    time to write hmms: %.3f' % (time.time() - start)
    def run_bppseqgen(self,
                      seq,
                      chosen_tree,
                      gene_name,
                      reco_event,
                      seed,
                      is_insertion=False):
        """ Run bppseqgen on sequence

        Note that this is in general a piece of the full sequence (say, the V region), since
        we have different mutation models for different regions. Returns a list of mutated
        sequences.
        """
        region = ''
        if is_insertion:
            region = 'v'  # NOTE should really do something other than just use the v model for insertion mutations
        else:
            region = utils.get_region(gene_name)

        if len(seq) == 0:  # zero length insertion (or d)
            treg = re.compile('t[0-9][0-9]*')  # find number of leaf nodes
            n_leaf_nodes = len(treg.findall(chosen_tree))
            return ['' for _ in range(n_leaf_nodes)
                    ]  # return an empty string for each leaf node

        # write the tree to a tmp file
        if is_insertion:
            label = gene_name[:2]
        else:
            label = utils.get_region(gene_name)
        treefname = self.workdir + '/' + label + '-tree.tre'
        reco_seq_fname = self.workdir + '/' + label + '-start-seq.txt'
        leaf_seq_fname = self.workdir + '/' + label + '-leaf-seqs.fa'
        with opener('w')(treefname) as treefile:
            treefile.write(chosen_tree)
        self.write_mute_freqs(region,
                              gene_name,
                              seq,
                              reco_event,
                              reco_seq_fname,
                              is_insertion=is_insertion)

        # build up the command line
        # docs: http://biopp.univ-montp2.fr/apidoc/bpp-phyl/html/classbpp_1_1GTR.html that page is too darn hard to google
        bpp_binary = os.getcwd() + '/packages/bpp/bin/bppseqgen'
        if not os.path.exists(bpp_binary):
            print 'ERROR bpp not found in %s' % os.path.dirname(bpp_binary)
            assert False
        command = 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:' + os.getcwd(
        ) + '/packages/bpp/lib\n'
        command += bpp_binary
        command += ' input.tree.file=' + treefname
        command += ' output.sequence.file=' + leaf_seq_fname
        command += ' number_of_sites=' + str(len(seq))
        command += ' input.tree.format=Newick'
        command += ' output.sequence.format=Fasta\(\)'
        command += ' alphabet=DNA'
        command += ' --seed=' + str(seed)
        command += ' model=GTR\('
        for par in self.mute_models[region]['gtr']:
            val = self.mute_models[region]['gtr'][par]
            command += par + '=' + val + ','
        command = command.rstrip(',')
        command += '\)'
        # NOTE should I use the "equilibrium frequencies" option?
        command += ' rate_distribution=\'Gamma(n=4,alpha=' + self.mute_models[
            region]['gamma']['alpha'] + ')\''
        command += ' input.infos.states=state'
        command += ' input.infos=' + reco_seq_fname
        command += ' input.infos.rates=rate'
        # print command
        check_output(command, shell=True)

        mutated_seqs = []
        for seq_record in SeqIO.parse(
                leaf_seq_fname, "fasta"
        ):  # get the leaf node sequences from the file that bppseqgen wrote
            mutated_seqs.append(str(seq_record.seq))

        # self.check_tree_simulation(leaf_seq_fname, chosen_tree)

        if not self.args.no_clean:
            os.remove(reco_seq_fname)  # clean up temp files
            os.remove(treefname)
            os.remove(leaf_seq_fname)

        return mutated_seqs
Exemple #56
0
def plot_single_variable(args, varname, hlist, outdir, pathnameclues):
    if varname in plotconfig.gene_usage_columns:
        hlist = plotting.add_bin_labels_not_in_all_hists(hlist)

    no_labels = False
    xline, bounds, figsize = None, None, None
    stats = args.extra_stats
    translegend = [0.0, -0.2]
    xtitle, ytitle = hlist[0].xtitle, hlist[0].ytitle
    if xtitle == '':  # arg, plotting.py thinks default should be None, hist.py thinks it's ''
        xtitle = None
    if '-mean-bins' in varname:
        raise Exception(
            'darn, I was hoping I wasn\'t making these plots any more')
    plottitle = plotconfig.plot_titles[
        varname] if varname in plotconfig.plot_titles else varname

    ytitle = 'frequency' if args.normalize else 'counts'

    if 'mute-freqs/v' in pathnameclues or 'mute-freqs/d' in pathnameclues or 'mute-freqs/j' in pathnameclues:
        assert not args.normalize
        ytitle = 'mutation freq'

    if varname in plotconfig.gene_usage_columns:
        xtitle = 'allele'
        if hlist[0].n_bins == 2:
            stats = ' 0-bin'  # print the fraction of entries in the zero bin into the legend (i.e. the fraction correct)
    # elif hlist[0].bin_labels.count('') == hlist[0].n_bins + 2:
    #     xtitle = '???'

    line_width_override = None
    if args.performance_plots:
        if 'hamming_to_true_naive' in varname:
            xtitle = 'hamming distance'
            if '_normed' in varname:
                xtitle = 'fractional ' + xtitle
        elif '_vs_mute_freq' in varname:
            xtitle = 'mutation freq'
            ytitle = 'fraction correct'
            if varname[0] == 'v' or varname[0] == 'j':
                translegend = [-0.4, -0.4]
        elif varname.find('_gene') == 1:
            xtitle = ''
            ytitle = 'fraction correct'
        else:
            xtitle = 'inferred - true'
        bounds = plotconfig.true_vs_inferred_hard_bounds.setdefault(
            varname, None)
    else:
        bounds = plotconfig.default_hard_bounds.setdefault(varname, None)
        if bounds is None and 'insertion' in varname:
            bounds = plotconfig.default_hard_bounds.setdefault(
                'all_insertions', None)
        if varname in plotconfig.gene_usage_columns:
            no_labels = True
            if 'j_' not in varname:
                figsize = (10, 5)
            line_width_override = 1
        elif 'per-gene-per-position/v' in pathnameclues:
            figsize = (20, 5)
            bounds = plotconfig.default_hard_bounds.setdefault(
                utils.unsanitize_name(varname), None)

    if 'IG' in varname or 'TR' in varname:
        if 'mute-freqs' in pathnameclues:
            gene = utils.unsanitize_name(varname)
            plottitle = gene  # + ' -- mutation frequency'
            xtitle = 'position'
            if utils.get_region(gene) == 'j':
                translegend = [0.1, 0.]  #(-0.35, -0.02)
            else:
                translegend = [0.15, -0.02]
            xline = None
            if args.glfo is not None:
                if utils.get_region(gene) in utils.conserved_codons[
                        args.locus]:
                    xline = args.glfo[utils.conserved_codons[args.locus][
                        utils.get_region(gene)] + '-positions'][gene]
        else:
            ilastdash = varname.rfind('-')
            gene = utils.unsanitize_name(varname[:ilastdash])
            base_varname = varname[ilastdash + 1:]
            base_plottitle = plotconfig.plot_titles[
                base_varname] if base_varname in plotconfig.plot_titles else ''
            plottitle = gene + ' -- ' + base_plottitle

    if len(hlist) > 9:  # skootch it down so they (maybe) all fit
        translegend[1] -= 0.5
    if args.translegend is not None:  # override with the command line
        translegend = args.translegend
    if args.extra_stats == 'auto':  # kind of hackey
        if xtitle == 'inferred - true':
            stats = 'absmean'
        else:
            stats = 'mean'
    # draw that little #$*(!
    linewidths = [
        line_width_override,
    ] if line_width_override is not None else args.linewidths
    alphas = [0.6 for _ in range(len(hlist))]
    plotting.draw_no_root(
        hlist[0],
        plotname=varname,
        plotdir=outdir,
        more_hists=hlist[1:],
        write_csv=False,
        stats=stats,
        bounds=bounds,
        shift_overflows=(os.path.basename(outdir) != 'gene-call'),
        plottitle=plottitle,
        colors=args.colors,
        xtitle=xtitle,
        ytitle=ytitle,
        xline=xline,
        normalize=(args.normalize and '_vs_mute_freq' not in varname),
        linewidths=linewidths,
        alphas=alphas,
        errors=True,
        figsize=figsize,
        no_labels=no_labels,
        log=args.log,
        translegend=translegend)
Exemple #57
0
        if opt == "--region":
            region = val

        if opt == "--size":
            kwargs['size'] = int(val)

        if opt == "--name":
            kwargs['name'] = val

        if opt == "--desc":
            kwargs['desc'] = val

    if len(args) != 2:
        usage("incorrect number of arguments")

    snapshot_id = args[0]
    virt = args[1]
    arch = arch if arch else utils.get_arch()
    region = region if region else utils.get_region()

    if not virt in ('hvm', 'pvm'):
        fatal("virtualization type not supported: %s" % virt)

    ami_id, ami_name = register(snapshot_id, region, virt, arch, **kwargs)

    print ami_id, ami_name


if __name__ == "__main__":
    main()
Exemple #58
0
    tree = input_tfile.Get('Nominal')
    entries = tree.GetEntries()
    for entry in range(entries):
        if entry % 10000 == 0:
            print('*** processed {0} out of {1}'.format(entry, entries))
        # get the next tree in the chain and verify
        ientry = tree.LoadTree(entry)
        if ientry < 0:
            break
        # copy next entry into memory and verify
        nb = tree.GetEntry(entry)
        if nb <= 0:
            continue

        for idv, nTracks in enumerate(tree.DV_nTracks):
            rIndex = utils.get_region(tree, idv)
            if rIndex < 0:
                continue
            if not tree.DV_passFidCuts[idv] or not tree.DV_passChisqCut[
                    idv] or not tree.DV_passDistCut[idv]:
                continue
            if nTracks == 2:
                h_mass_2[rIndex].Fill(tree.DV_m[idv])
            elif nTracks == 3:
                get_2track_mass_from_3track_dv(tree, idv,
                                               h_mass_2_in_3[rIndex])
            else:
                continue
            #print('3-track mass = ' + str(tree.DV_m[idv]))
    output_tfile = TFile('output_ks_method.root', 'recreate')
    for region in range(12):
Exemple #59
0
 def __init__(self, region=None):
     self.region = region if region else utils.get_region()
     self.conn = utils.connect(self.region)
     self.snap = None
Exemple #60
0
 def __init__(self, region=None):
     self.region = region if region else utils.get_region()
     self.conn = utils.connect(self.region)
     self.vol = None
     self.device = None