def run_tigger(infname, outfname, outdir):
    """Run tigger (through R) on <infname> and write the resulting germline set so that <outfname> exists.

    Steps:
      1. write an R script to <args.workdir>/tigger-in.cmd that reads the tab-separated annotations in <infname>,
         calls findNovelAlleles(), inferGenotype() and genotypeFasta(), and writes the genotype to <outdir>/tigger.fasta
      2. run that script with R
      3. read the initial glfo (from <args.glfo_dir>, or 'data/germlines/human' if that's None), add every allele
         from tigger's fasta that isn't already in it, and remove every allele that tigger didn't report
      4. write the modified glfo to the germline dir that contains <outfname>, and remove the R script

    Does nothing if utils.output_exists() says <outfname> is already there.
    Reads the module-level <args> (n_procs, workdir, glfo_dir, locus, region).
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    db_name = 'annotations'
    gls_name = 'gls'
    tigger_outfname = outdir + '/tigger.fasta'

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']
    rcmds.append('%s = read.csv("%s", sep="\t")' % (db_name, infname))
    rcmds.append('%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True)))
    rcmds.append('novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs))
    rcmds.append('geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name))
    rcmds.append('genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name))
    rcmds.append('writeFasta(genotype_seqs, "%s")' % tigger_outfname)

    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    region_seqs = glfo['seqs'][args.region]
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        name = seqfo['name']
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(name)
        if name not in region_seqs:
            newfo = {'gene' : name, 'seq' : seq}
            # a '+' in the name is taken to mean <template gene>+<modification>, in which case we get codon info from the template
            use_template_for_codon_info = '+' in newfo['gene']
            if use_template_for_codon_info:
                newfo['template-gene'] = newfo['gene'].split('+')[0]
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif region_seqs[name] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n %s\n %s' % (utils.color('red', 'error'), name, region_seqs[name], seqfo['seq']))

    # remove them afterwards so we can use existing ones to get codon info
    # NOTE loop over a snapshot of the names, since remove_gene() presumably deletes from the dict that we'd otherwise be iterating over
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # <outfname> should be <out_gldir>/<locus>/<file>, so strip the trailing '/<locus>' from its dir name
    # NOTE don't use str.rstrip() for this: it removes any trailing run of the *characters* in its argument, not a suffix, so it can also eat the end of the parent dir's name
    locus_dir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    out_gldir = locus_dir[:-len(locus_suffix)] if locus_dir.endswith(locus_suffix) else locus_dir
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)
    os.remove(cmdfname)
def get_alleles(self, swfo, plotdir=None, debug=False):
    """Cluster the clonal representatives chosen from <swfo>, and decide for each cluster whether it's a new allele.

    Clustering is done with self.vsearch_cluster_v_seqs(), or with self.kmeans_cluster_v_seqs() if
    <self.args.kmeans_allele_cluster> is set (<plotdir> is only passed to the latter).

    Returns a dict keyed by new allele name, where each value is a dict with keys 'template-gene', 'gene' and
    'seq', plus 'remove-template-gene' (set to True) if the template gene's share of <self.adjusted_glcounts>
    is below <self.args.min_allele_prevalence_fraction>. Returns an empty dict if
    self.choose_clonal_representatives() doesn't give us any sequences.

    <self.glfo> is not modified: new alleles are only added to a private deep copy.
    """
    print('clustering for new alleles')

    # NOTE do *not* modify <self.glfo> (in the future it would be nice to just modify <self.glfo>, but for now we need it to be super clear in partitiondriver what is happening to <self.glfo>)
    default_initial_glfo = self.glfo
    if self.args.default_initial_germline_dir is not None:  # if this is set, we want to take any new allele names from this directory's glfo if they're in there
        default_initial_glfo = glutils.read_glfo(self.args.default_initial_germline_dir, self.glfo['locus'])
    else:
        print(' %s --default-initial-germline-dir isn\'t set, so new allele names won\'t correspond to existing names' % utils.color('yellow', 'warning'))
    # so we can add new genes to it, so we can check for equivalency more easily TODO fix that shit, obviously
    # NOTE this has to happen for both branches above: it used to be set only when the default dir was given, so the equivalency check below crashed on an unbound name otherwise
    glfo_to_modify = copy.deepcopy(default_initial_glfo)

    qr_seqs, threshold = self.choose_clonal_representatives(swfo, debug=debug)
    if qr_seqs is None:
        return {}

    # self.check_for_donuts(debug=debug)

    if self.args.kmeans_allele_cluster:
        clusterfos = self.kmeans_cluster_v_seqs(qr_seqs, swfo, plotdir=plotdir, debug=debug)
        msa_info = clusterfos
    else:
        clusterfos, msa_info = self.vsearch_cluster_v_seqs(qr_seqs, threshold, debug=debug)

    # and finally loop over each cluster, deciding if it corresponds to a new allele
    if debug:
        print(' looping over %d clusters with %d sequences' % (len(clusterfos), sum(len(cfo['seqfos']) for cfo in clusterfos)))
        print(' rank seqs v/j mfreq seqs snps (%s)' % utils.color('blue', 'indels'))
    new_alleles = {}
    n_existing_gene_clusters = 0
    for iclust, clusterfo in enumerate(clusterfos):
        # dot_products = [utils.dot_product(clusterfo['cons_seq'], seq1, seq2) for seq1, seq2 in itertools.combinations([seqfo['seq'] for seqfo in clusterfo['seqfos']], 2)]
        # mean_dot_product = numpy.average(dot_products)

        # choose the most common existing gene to use as a template (the most similar gene might be a better choice, but deciding on "most similar" would involve adjudicating between snps and indels, and it shouldn't really matter)
        sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo)
        template_gene, template_counts = sorted_glcounts[0]
        template_seq = self.glfo['seqs'][self.region][template_gene]
        template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene)

        assert '.' not in clusterfo['cons_seq']  # make sure you haven't switched to something that doesn't use '-' for gap chars
        new_seq = clusterfo['cons_seq'].replace('-', '')  # I'm not sure that I completely understand the dashes in this sequence, but it seems to be right to just remove 'em

        aligned_template_seq, aligned_new_seq = utils.align_seqs(template_seq, clusterfo['cons_seq'])
        has_indels = '-' in aligned_template_seq.strip('-') or '-' in aligned_new_seq.strip('-')  # only counts internal indels

        cluster_mfreqs = {r : [self.mfreqs[r][seqfo['name']] for seqfo in clusterfo['seqfos']] for r in self.mfreqs}  # regional mfreqs for each sequence in the cluster corresponding to the initially-assigned existing gene
        mean_cluster_mfreqs = {r : numpy.mean(cluster_mfreqs[r]) for r in cluster_mfreqs}

        equiv_name, equiv_seq = glutils.find_equivalent_gene_in_glfo(glfo_to_modify, new_seq, template_cpos)
        if equiv_name is not None:
            new_name = equiv_name
            new_seq = equiv_seq
        else:
            new_name, _ = glutils.choose_new_allele_name(template_gene, new_seq, indelfo={'indels' : ['xxx', 'xxx', 'xxx']} if has_indels else None)  # the fcn just checks to see if it's non-None and of length greater than zero...TODO it would be nice to figure out actual snp and indel info

        if debug:
            self.print_cluster(iclust, clusterfo, sorted_glcounts, new_seq, true_sorted_glcounts, mean_cluster_mfreqs, has_indels)

        if new_name in self.glfo['seqs'][self.region]:  # note that this only looks in <self.glfo>, not in <new_alleles>
            n_existing_gene_clusters += 1
            if debug:
                print('existing %s' % utils.color_gene(new_name))
            continue

        if new_name in new_alleles:  # already added it NOTE might make more sense to use <glfo_to_modify> here instead of <new_alleles> (or just not have freaking both of them)
            if debug:
                print('%s (%s)' % (utils.color_gene(new_name), utils.color('red', 'new')))
            continue
        # NOTE(review): the values of <new_alleles> are dicts, not sequences, so as written this can never fail (it'd need to look at each value's 'seq') -- left as-is so that we don't start crashing on cases that the checks below currently skip
        assert new_seq not in new_alleles.values()  # if it's the same seq, it should've got the same damn name

        if not has_indels:  # we assume that the presence of indels somewhat precludes false positives, which is equivalent to an assumption about the rarity of shm indels
            if self.too_close_to_existing_glfo_gene(clusterfo, new_seq, template_seq, template_cpos, template_gene, debug=debug):  # presumably if it were really close to another (non-template) existing glfo gene, that one would've been the template
                continue

            if mean_cluster_mfreqs['j'] > 0. and self.mean_mfreqs['j'] > 0.:
                this_cluster_ratio = mean_cluster_mfreqs['v'] / mean_cluster_mfreqs['j']
                overall_ratio = self.mean_mfreqs['v'] / self.mean_mfreqs['j']
                if this_cluster_ratio / overall_ratio < self.mfreq_ratio_threshold:
                    if debug:
                        print('v / j cluster mfreqs too small %6.3f / %6.3f = %6.3f < %6.3f' % (this_cluster_ratio, overall_ratio, this_cluster_ratio / overall_ratio, self.mfreq_ratio_threshold))
                    continue

        if self.too_close_to_already_added_gene(new_seq, new_alleles, debug=debug):  # this needs to be applied even if there are indels, since the indels are with respect to the (existing glfo) template gene, not to the [potentially] previously-added gene
            continue

        print('%s %s%s' % (utils.color('red', 'new'), utils.color_gene(new_name), ' (exists in default germline dir)' if new_name in default_initial_glfo['seqs'][self.region] else ''))
        new_alleles[new_name] = {'template-gene' : template_gene, 'gene' : new_name, 'seq' : new_seq}
        if new_alleles[new_name]['gene'] not in glfo_to_modify['seqs'][self.region]:  # if it's in <default_initial_glfo> it'll already be in there
            glutils.add_new_allele(glfo_to_modify, new_alleles[new_name])  # just so we can check for equivalency

    if debug:
        print(' %d / %d clusters consensed to existing genes' % (n_existing_gene_clusters, len(msa_info)))

    self.reassign_template_counts(msa_info, new_alleles, debug=False)
    for new_name, newfo in new_alleles.items():
        # print '%s %s %.1f / %.1f = %.4f' % (new_name, newfo['template-gene'], self.adjusted_glcounts[newfo['template-gene']], float(sum(self.adjusted_glcounts.values())), self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())))
        if self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())) < self.args.min_allele_prevalence_fraction:  # NOTE self.adjusted_glcounts only includes large clusters, and the constituents of those clusters are clonal representatives, so this isn't quite the same as in alleleremover
            newfo['remove-template-gene'] = True

    return new_alleles
# read both germline sets, printing each one's label before it's read
for name, gldir in zip(args.names, [args.gldir1, args.gldir2]):
    print('%s:' % utils.color('yellow', name))
    glfos.append(glutils.read_glfo(gldir, args.locus, debug=True))

# then, for each region that the first germline set has, work out which genes are in only one of the two sets
for region in [r for r in utils.regions if r in glfos[0]['seqs']]:
    aset, bset = [set(g['seqs'][region]) for g in glfos]
    only_in_a = aset - bset
    only_in_b = bset - aset

    tmpfo = glutils.get_empty_glfo(args.locus)  # make a new glfo that will only have non-shared genes
    for glabel, gset, gfo in zip(args.names, [only_in_a, only_in_b], glfos):  # <gset> is the genes that're only in <glabel>
        for ogene in gset:
            # tack the label onto the gene name so we can tell which germline set it came from
            newfo = {
                'gene': '+'.join([ogene, glabel]),
                'seq': gfo['seqs'][region][ogene],
                'cpos': utils.cdn_pos(gfo, region, ogene),
            }
            glutils.add_new_allele(tmpfo, newfo, use_template_for_codon_info=False)

    # eh, maybe this doesn't really add anything?
    # # add the nearest genes that they both have for comparison NOTE this gives one comparison gene for *each* gene, so usually you get a bunch of comparison/'both' genes in each block in the ascii output
    # for bgene in aset & bset:
    #     _, nearest_gene, _ = glutils.find_nearest_gene_with_same_cpos(glfos[0], glfos[0]['seqs'][region][bgene], new_cpos=utils.cdn_pos(glfos[0], region, bgene))  # i think it doesn't matter which glfo we get it from, so arbitrarily choose the first one
    #     glutils.add_new_allele(tmpfo, {'gene' : '+'.join([nearest_gene, 'both']), 'seq' : glfos[0]['seqs'][region][nearest_gene], 'cpos' : utils.cdn_pos(glfos[0], region, bgene)}, use_template_for_codon_info=False)

    summary_fields = (
        utils.color('green', region),
        args.names[0], len(only_in_a), utils.color_genes(sorted(only_in_a)),
        args.names[1], len(only_in_b), utils.color_genes(sorted(only_in_b)),
    )
    print('%s: only in:\n %12s: %2d %s\n %12s: %2d %s' % summary_fields)
    if tmpfo['seqs'][region]:
        print(' comparing to nearest genes that were in both (labeled \'both\'):')
for name, gldir in zip(args.names, [args.gldir1, args.gldir2]): print '%s:' % utils.color('yellow', name) glfos.append(glutils.read_glfo(gldir, args.locus, debug=True)) for region in [r for r in utils.regions if r in glfos[0]['seqs']]: tmp_glfo = copy.deepcopy( glfos[0]) # make a new glfo that will only have non-shared genes # first remove any that are in both for gene, seq in tmp_glfo['seqs'][region].items(): if gene not in glfos[1]['seqs'][ region]: # keep it, under a new name, if it's not in <glfos[1]> glutils.add_new_allele( tmp_glfo, { 'gene': '+'.join([gene, args.names[0]]), 'seq': seq, 'template-gene': gene } ) # add an extra str to the name so we know which one it came from glutils.remove_gene(tmp_glfo, gene) # then add any that are only in the second one for gene, seq in glfos[1]['seqs'][region].items(): if gene not in glfos[0]['seqs'][region]: cpos = glfos[1][utils.cdn(glfos[1], region) + '-positions'][gene] if utils.cdn( glfos[1], region) is not None else None glutils.add_new_allele( tmp_glfo, { 'gene': '+'.join([gene, args.names[1]]), 'seq': seq,