Example #1
0
def run_tigger(infname, outfname, outdir):
    """
    Run the tigger R package on the annotations in <infname>, then write the resulting germline set as a glfo dir.

    Steps: write an R command file to <args.workdir>, run it with R, read tigger's output fasta,
    add any alleles that aren't already in the initial glfo, remove any initial genes that tigger
    didn't report, and write the modified glfo.

    infname: tab-separated annotation file (it's read in R with read.csv)
    outfname: germline file that should exist when we're done; has to equal glutils.get_fname() for the
              output germline dir, <args.locus>, and <args.region> (this is asserted)
    outdir: directory in which tigger's own output ('tigger.fasta') is written

    Returns None. Returns immediately, without running anything, if utils.output_exists() is true for <outfname>.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]  #
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene' : seqfo['name'], 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:  # names with a '+' are treated as <template gene>+<something>, so take codon info from the template
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n    %s\n    %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq']))
    # remove them afterwards so we can use existing ones to get codon info
    # NOTE loop over a copy of the gene names, since we're removing genes from the thing we'd otherwise be iterating over
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # the germline dir is the parent of the locus subdir that <outfname> is in
    # NOTE don't use str.rstrip() for this: it strips any trailing chars that are in its argument (rather than a suffix), so it'd also eat the end of a parent dir name like 'high'
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[ : -len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
Example #2
0
    def get_alleles(self, swfo, plotdir=None, debug=False):
        """
        Cluster v sequences, and return info for each cluster that looks like a new allele.

        swfo: passed to self.choose_clonal_representatives() (and to the kmeans clustering, if that's turned on)
        plotdir: only passed to the kmeans clustering
        debug: if set, print info about each cluster

        Returns a dict from new allele name to a dict with keys 'template-gene', 'gene', and 'seq', plus
        'remove-template-gene' (set to True) if the template gene's fraction of the adjusted counts is less
        than <self.args.min_allele_prevalence_fraction>. Returns an empty dict if no representative sequences
        were chosen.
        """
        print('clustering for new alleles')

        # NOTE do *not* modify <self.glfo> (in the future it would be nice to just modify <self.glfo>, but for now we need it to be super clear in partitiondriver what is happening to <self.glfo>)
        default_initial_glfo = self.glfo
        if self.args.default_initial_germline_dir is not None:  # if this is set, we want to take any new allele names from this directory's glfo if they're in there
            default_initial_glfo = glutils.read_glfo(self.args.default_initial_germline_dir, self.glfo['locus'])
        else:
            print('  %s --default-initial-germline-dir isn\'t set, so new allele names won\'t correspond to existing names' % utils.color('yellow', 'warning'))
        # make the copy whether or not the default dir was set (otherwise it's undefined below if the dir isn't set), and always copy so we never touch <self.glfo>
        glfo_to_modify = copy.deepcopy(default_initial_glfo)  # so we can add new genes to it, so we can check for equivalency more easily TODO fix that shit, obviously

        qr_seqs, threshold = self.choose_clonal_representatives(swfo, debug=debug)
        if qr_seqs is None:
            return {}

        # self.check_for_donuts(debug=debug)

        if not self.args.kmeans_allele_cluster:
            clusterfos, msa_info = self.vsearch_cluster_v_seqs(qr_seqs, threshold, debug=debug)
        else:
            clusterfos = self.kmeans_cluster_v_seqs(qr_seqs, swfo, plotdir=plotdir, debug=debug)
            msa_info = clusterfos

        # and finally loop over each cluster, deciding if it corresponds to a new allele
        if debug:
            print('  looping over %d clusters with %d sequences' % (len(clusterfos), sum([len(cfo['seqfos']) for cfo in clusterfos])))
            print('   rank  seqs   v/j mfreq                 seqs      snps (%s)' % utils.color('blue', 'indels'))
        new_alleles = {}
        n_existing_gene_clusters = 0
        for iclust, clusterfo in enumerate(clusterfos):
            # dot_products = [utils.dot_product(clusterfo['cons_seq'], seq1, seq2) for seq1, seq2 in itertools.combinations([seqfo['seq'] for seqfo in clusterfo['seqfos']], 2)]
            # mean_dot_product = numpy.average(dot_products)

            # choose the most common existing gene to use as a template (the most similar gene might be a better choice, but deciding on "most similar" would involve adjudicating between snps and indels, and it shouldn't really matter)
            sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo)
            template_gene, template_counts = sorted_glcounts[0]
            template_seq = self.glfo['seqs'][self.region][template_gene]
            template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene)

            assert '.' not in clusterfo['cons_seq']  # make sure you haven't switched to something that doesn't use '-' for gap chars
            new_seq = clusterfo['cons_seq'].replace('-', '')  # I'm not sure that I completely understand the dashes in this sequence, but it seems to be right to just remove 'em

            aligned_template_seq, aligned_new_seq = utils.align_seqs(template_seq, clusterfo['cons_seq'])
            has_indels = '-' in aligned_template_seq.strip('-') or '-' in aligned_new_seq.strip('-')  # only counts internal indels
            cluster_mfreqs = {r : [self.mfreqs[r][seqfo['name']] for seqfo in clusterfo['seqfos']] for r in self.mfreqs}  # regional mfreqs for each sequence in the cluster corresponding to the initially-assigned existing gene
            mean_cluster_mfreqs = {r : numpy.mean(cluster_mfreqs[r]) for r in cluster_mfreqs}

            # if there's already an equivalent gene (either from the default initial glfo, or one we added on a previous iteration), use its name and sequence
            equiv_name, equiv_seq = glutils.find_equivalent_gene_in_glfo(glfo_to_modify, new_seq, template_cpos)
            if equiv_name is not None:
                new_name = equiv_name
                new_seq = equiv_seq
            else:
                new_name, _ = glutils.choose_new_allele_name(template_gene, new_seq, indelfo={'indels' : ['xxx', 'xxx', 'xxx']} if has_indels else None)  # the fcn just checks to see if it's non-None and of length greater than zero...TODO it would be nice to figure out actual snp and indel info

            if debug:
                self.print_cluster(iclust, clusterfo, sorted_glcounts, new_seq, true_sorted_glcounts, mean_cluster_mfreqs, has_indels)

            if new_name in self.glfo['seqs'][self.region]:  # note that this only looks in <self.glfo>, not in <new_alleles>
                n_existing_gene_clusters += 1
                if debug:
                    print('existing %s' % utils.color_gene(new_name))
                continue

            if new_name in new_alleles:  # already added it NOTE might make more sense to use <glfo_to_modify> here instead of <new_alleles> (or just not have freaking both of them)
                if debug:
                    print('%s (%s)' % (utils.color_gene(new_name), utils.color('red', 'new')))
                continue
            # NOTE(review): the values of <new_alleles> are dicts, while <new_seq> is a string, so as written this can never fail -- it probably was meant to compare against each value's 'seq' (left as-is, since making it effective could crash runs that currently finish)
            assert new_seq not in new_alleles.values()  # if it's the same seq, it should've got the same damn name

            if not has_indels:  # we assume that the presence of indels somewhat precludes false positives, which is equivalent to an assumption about the rarity of shm indels
                if self.too_close_to_existing_glfo_gene(clusterfo, new_seq, template_seq, template_cpos, template_gene, debug=debug):  # presumably if it were really close to another (non-template) existing glfo gene, that one would've been the template
                    continue

                # skip clusters whose v/j mutation frequency ratio is small compared to the overall ratio (only checked if both j mfreqs are nonzero, since we divide by them)
                if mean_cluster_mfreqs['j'] > 0. and self.mean_mfreqs['j'] > 0.:
                    this_cluster_ratio = mean_cluster_mfreqs['v'] / mean_cluster_mfreqs['j']
                    overall_ratio = self.mean_mfreqs['v'] / self.mean_mfreqs['j']
                    if this_cluster_ratio / overall_ratio < self.mfreq_ratio_threshold:
                        if debug:
                            print('v / j cluster mfreqs too small %6.3f / %6.3f = %6.3f < %6.3f' % (this_cluster_ratio, overall_ratio, this_cluster_ratio / overall_ratio, self.mfreq_ratio_threshold))
                        continue

            if self.too_close_to_already_added_gene(new_seq, new_alleles, debug=debug):  # this needs to be applied even if there are indels, since the indels are with respect to the (existing glfo) template gene, not to the [potentially] previously-added gene
                continue

            print('%s %s%s' % (utils.color('red', 'new'), utils.color_gene(new_name), ' (exists in default germline dir)' if new_name in default_initial_glfo['seqs'][self.region] else ''))
            new_alleles[new_name] = {'template-gene' : template_gene, 'gene' : new_name, 'seq' : new_seq}
            if new_alleles[new_name]['gene'] not in glfo_to_modify['seqs'][self.region]:  # if it's in <default_initial_glfo> it'll already be in there
                glutils.add_new_allele(glfo_to_modify, new_alleles[new_name])  # just so we can check for equivalency

        if debug:
            print('  %d / %d clusters consensed to existing genes' % (n_existing_gene_clusters, len(msa_info)))

        self.reassign_template_counts(msa_info, new_alleles, debug=False)
        for new_name, newfo in new_alleles.items():
            # print '%s  %s  %.1f / %.1f = %.4f' % (new_name, newfo['template-gene'], self.adjusted_glcounts[newfo['template-gene']], float(sum(self.adjusted_glcounts.values())), self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())))
            if self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())) < self.args.min_allele_prevalence_fraction:  # NOTE self.adjusted_glcounts only includes large clusters, and the constituents of those clusters are clonal representatives, so this isn't quite the same as in alleleremover
                newfo['remove-template-gene'] = True

        return new_alleles
Example #3
0
# read the germline info from each of the two dirs, printing each one's (colored) label first
for name, gldir in zip(args.names, (args.gldir1, args.gldir2)):
    print('%s:' % utils.color('yellow', name))
    loaded_glfo = glutils.read_glfo(gldir, args.locus, debug=True)
    glfos.append(loaded_glfo)

# for each region that's in the first glfo, work out which genes are in only one of the two germline sets
for region in [r for r in utils.regions if r in glfos[0]['seqs']]:
    aset, bset = [set(g['seqs'][region]) for g in glfos]
    only_in_a = aset - bset
    only_in_b = bset - aset

    # make a new glfo that will only have non-shared genes
    tmpfo = glutils.get_empty_glfo(args.locus)
    # <gset> is the genes that're only in <glabel>
    for glabel, gset, gfo in zip(args.names, [only_in_a, only_in_b], glfos):
        for ogene in gset:
            unshared_fo = {
                'gene': '+'.join([ogene, glabel]),  # tack on the label, so we know which set it came from
                'seq': gfo['seqs'][region][ogene],
                'cpos': utils.cdn_pos(gfo, region, ogene),
            }
            glutils.add_new_allele(tmpfo, unshared_fo, use_template_for_codon_info=False)

    # eh, maybe this doesn't really add anything?
    # # add the nearest genes that they both have for comparison NOTE this gives one comparison gene for *each* gene, so usually you get a bunch of comparison/'both' genes in each block in the ascii output
    # for bgene in aset & bset:
    #     _, nearest_gene, _ = glutils.find_nearest_gene_with_same_cpos(glfos[0], glfos[0]['seqs'][region][bgene], new_cpos=utils.cdn_pos(glfos[0], region, bgene))  # i think it doesn't matter which glfo we get it from, so arbitrarily choose the first one
    #     glutils.add_new_allele(tmpfo, {'gene' : '+'.join([nearest_gene, 'both']), 'seq' : glfos[0]['seqs'][region][nearest_gene], 'cpos' : utils.cdn_pos(glfos[0], region, bgene)}, use_template_for_codon_info=False)

    # summary: for each set, the number of genes that're only in that set, and their (sorted, colored) names
    summary_vals = (
        utils.color('green', region),
        args.names[0], len(only_in_a), utils.color_genes(sorted(only_in_a)),
        args.names[1], len(only_in_b), utils.color_genes(sorted(only_in_b)),
    )
    print('%s: only in:\n      %12s: %2d  %s\n      %12s: %2d  %s' % summary_vals)
    n_unshared = len(tmpfo['seqs'][region])
    if n_unshared > 0:
        print(' comparing to nearest genes that were in both (labeled \'both\'):')
Example #4
0
# read the germline info from each of the two dirs, printing each one's (colored) label first
for name, gldir in zip(args.names, (args.gldir1, args.gldir2)):
    print('%s:' % utils.color('yellow', name))
    loaded_glfo = glutils.read_glfo(gldir, args.locus, debug=True)
    glfos.append(loaded_glfo)

for region in [r for r in utils.regions if r in glfos[0]['seqs']]:
    tmp_glfo = copy.deepcopy(
        glfos[0])  # make a new glfo that will only have non-shared genes

    # first remove any that are in both
    for gene, seq in tmp_glfo['seqs'][region].items():
        if gene not in glfos[1]['seqs'][
                region]:  # keep it, under a new name, if it's not in <glfos[1]>
            glutils.add_new_allele(
                tmp_glfo, {
                    'gene': '+'.join([gene, args.names[0]]),
                    'seq': seq,
                    'template-gene': gene
                }
            )  # add an extra str to the name so we know which one it came from
        glutils.remove_gene(tmp_glfo, gene)

    # then add any that are only in the second one
    for gene, seq in glfos[1]['seqs'][region].items():
        if gene not in glfos[0]['seqs'][region]:
            cpos = glfos[1][utils.cdn(glfos[1], region) +
                            '-positions'][gene] if utils.cdn(
                                glfos[1], region) is not None else None
            glutils.add_new_allele(
                tmp_glfo, {
                    'gene': '+'.join([gene, args.names[1]]),
                    'seq': seq,