# Optional list of [contig, start, end] loci restricted to the current
# contig.  Stays None when no loci file was supplied.
target_loci = None
if o.fn_target_loci is not None:
    target_loci = []
    # Context manager so the loci file handle is released once parsed
    # (the bare `for ... in open(...)` form leaked it).
    with open(o.fn_target_loci) as loci_file:
        for line in loci_file:
            fields = line.split()
            if not fields:
                # Blank lines would otherwise crash the unpack below.
                continue
            c, s, e = fields
            # Convert before filtering so malformed coordinates are
            # reported regardless of which contig the row belongs to.
            s, e = int(s), int(e)
            if c == contig:
                target_loci.append([c, s, e])

tbx_dups = pysam.Tabixfile(o.fn_dup_tabix)
callset_clust = cluster.cluster_callsets(o.fn_call_table, contig)

g = gt.genotyper(
    contig,
    gglob_dir=o.gglob_dir,
    plot_dir=o.out_viz_dir,
    subset_indivs=subset_indivs,
    fn_fa=o.fn_fa,
    dup_tabix=tbx_dups,
    GC_inf=GC_inf,
)

# Second genotype table path: same as the main one with the suffix swapped.
fn_sunk_gt_out = o.fn_gt_out.replace(".genotypes", ".sunk_genotypes")

# These handles are handed to the genotyper via setup_output() below, so
# they intentionally stay open here.
# NOTE(review): closing them is not visible in this fragment - confirm it
# happens once genotyping is finished.
F_gt = open(o.fn_gt_out, 'w')
F_sunk_gt = open(fn_sunk_gt_out, 'w')
F_VCF = open(o.fn_vcf_out, 'w')
F_call = open(o.fn_call_out, 'w')
FINF = open("%s.info" % o.fn_call_out, 'w')
info_ob = info_io.info_io(FINF)

g.setup_output(F_gt, F_sunk_gt, F_VCF, F_call, info_ob)
if o.fn_target_loci is not None:
    # Collect [contig, start, end] rows for the current contig only.
    target_loci = []
    # Context manager so the loci file handle is released once parsed
    # (the bare `for ... in open(...)` form leaked it).
    with open(o.fn_target_loci) as loci_file:
        for line in loci_file:
            fields = line.split()
            if not fields:
                # Blank lines would otherwise crash the unpack below.
                continue
            c, s, e = fields
            # Convert before filtering so malformed coordinates are
            # reported regardless of which contig the row belongs to.
            s, e = int(s), int(e)
            if c == contig:
                target_loci.append([c, s, e])

tbx_dups = pysam.Tabixfile(o.fn_dup_tabix)
callset_clust = cluster.cluster_callsets(o.fn_call_table, contig)

g = gt.genotyper(
    contig,
    gglob_dir=o.gglob_dir,
    plot_dir=o.out_viz_dir,
    subset_indivs=subset_indivs,
    fn_fa=o.fn_fa,
    dup_tabix=tbx_dups,
    GC_inf=GC_inf,
)

# Second genotype table path: same as the main one with the suffix swapped.
fn_sunk_gt_out = o.fn_gt_out.replace(".genotypes", ".sunk_genotypes")

# Output handles are opened for writing and left open for the code that
# follows this fragment.
# NOTE(review): their consumer and the matching close() calls are outside
# this view - confirm they are closed once writing is finished.
F_gt = open(o.fn_gt_out, "w")
F_sunk_gt = open(fn_sunk_gt_out, "w")
F_VCF = open(o.fn_vcf_out, "w")
F_call = open(o.fn_call_out, "w")
FINF = open("%s.info" % o.fn_call_out, "w")
info_ob = info_io.info_io(FINF)
def resolve_overlapping_clusters(self, ll_cutoff, tbx_dups, indiv_id,
                                 indiv_DTS, ref_DTSs, dCGHs, gglob_dir,
                                 out_viz_dir, verbose=False,
                                 min_overlapping=2, subset_indivs=None,
                                 min_d=0):
    """Resolve overlapping call clusters into individual final calls.

    Outline of the approach (from the original notes):
      1. do the recip overlap cluster - clusters very similar calls
         w/ similar break-points
      2. make sure there are at least 3 calls in there
         (w/ similar breakpoints)
      3. make sure those calls sum to a log likelihood of <3
      4. collapse overlaps
      5. get the best breakpoints

    Args:
        ll_cutoff: likelihood cutoff forwarded to ``overlap_resolve`` and
            echoed in the final log line.
        tbx_dups: forwarded unchanged to ``overlap_resolve``.
        indiv_id: individual identifier passed to ``gX.is_var``.
        indiv_DTS: object providing ``get_cps_by_chr`` and
            ``get_wnds_by_chr`` for the individual being called.
        ref_DTSs: mapping of reference name to an object providing
            ``get_cps_by_chr``.
        dCGHs: forwarded to ``self.get_curr_chr_dCGHs``.
        gglob_dir: forwarded to ``genotyper`` as ``gglob_dir``.
        out_viz_dir: forwarded to ``genotyper`` as ``plot_dir``.
        verbose: accepted for interface compatibility; not used here.
        min_overlapping: forwarded to ``overlap_resolve``.
        subset_indivs: forwarded to ``genotyper``.
        min_d: lower bound on the delta for which a cluster is still
            checked by GMM genotyping.

    Returns:
        list: one ``self.get_final_call(...)`` result per retained
        cluster, accumulated over every chromosome in
        ``self.overlapping_calls_by_chr``.
    """
    print("resolving breakpoints...", file=stderr)
    final_calls = []

    # Constant across all clusters, so it is set once.
    overlap_cutoff = 0.85
    #overlap_cutoff = 0.75

    # `contig` rather than `chr`, so the builtin chr() is not shadowed.
    for contig, overlapping_calls in self.overlapping_calls_by_chr.items():
        print(contig, "%d calls in this chr" % (len(overlapping_calls)),
              file=stderr)

        indiv_cps = indiv_DTS.get_cps_by_chr(contig)
        ref_cps = {ref: ref_dts.get_cps_by_chr(contig)
                   for ref, ref_dts in ref_DTSs.items()}

        g = genotyper(contig, gglob_dir=gglob_dir, plot_dir=out_viz_dir,
                      subset_indivs=subset_indivs)

        curr_dCGHs = self.get_curr_chr_dCGHs(contig, dCGHs)
        wnd_starts, wnd_ends = indiv_DTS.get_wnds_by_chr(contig)

        t = time.time()
        # Counts clusters actually visited; starting at 0 (not -1) makes
        # the reported number match the number assessed.
        n_assessed = 0
        for n_assessed, overlap_cluster in enumerate(overlapping_calls, 1):
            resolved_calls = overlap_cluster.overlap_resolve(
                overlap_cutoff, ll_cutoff, tbx_dups,
                min_overlapping=min_overlapping)
            if len(resolved_calls) == 0:
                continue

            # Take the resolved calls and genotype them to ensure the
            # segment is actually variable.
            variable_clusts = []
            for clust in resolved_calls:
                d = self.get_delta(clust, wnd_starts, wnd_ends,
                                   curr_dCGHs, indiv_cps, ref_cps)
                if clust.size == 1:
                    print("skipping single call cluster")
                    continue

                if d > 1.0:
                    # Large delta: accept without further checks.
                    variable_clusts.append(clust)
                elif d > min_d:
                    # Borderline delta: keep only if the GMM fit has more
                    # than one component and the individual is variable.
                    X, idx_s, idx_e = g.get_gt_matrix(
                        contig, clust.get_med_start(), clust.get_med_end())
                    gX = g.GMM_genotype(X)
                    if gX.gmm.n_components > 1 and gX.is_var(indiv_id, g):
                        variable_clusts.append(clust)

            #overlapping_call_clusts = get_overlapping_call_clusts(resolved_calls, flatten = True)
            for clust in get_overlapping_call_clusts(variable_clusts):
                final_calls.append(self.get_final_call(clust))

        # NOTE(review): emitted once per chromosome; the original layout was
        # lost, so confirm it was not meant to be a per-cluster progress line.
        print("n-final_calls:%d n_assessed:%d time_elapsed:%fs"
              % (len(final_calls), n_assessed, time.time() - t))

    print("done", file=stderr)
    print("%d calls with likelihood <%f" % (len(final_calls), ll_cutoff),
          file=stderr)
    return final_calls
#colors = ["red", "chocolate", "darkgreen", "gold", "darkcyan", "darkblue", "darkorchid"] + list(plt.cm.Greys(np.linspace(0.5, 1, 2))) elif args.pop_color_scheme == "primate": label = "species_label" n_colors = len(pop_data[label].unique().tolist()) colors = ["royalblue"] + list(plt.cm.Greys(np.linspace(1, 0.25, 4))) else: if args.custom_label is None: print "Must specify custom label for custom color scheme" sys.exit(1) label = args.custom_label n_colors = len(pop_data[label].unique().tolist()) colors = list(plt.cm.hsv(np.linspace(0, 0.9, n_colors-2))) + list(plt.cm.Greys(np.linspace(0.5, 1, 2))) indivs = pop_data.sample.unique().tolist() g = gt.genotyper(args.contig, gglob_dir = args.gglob_dir, plot_dir = args.plot_dir, subset_indivs = indivs, fn_fa=args.fn_fa, dup_tabix = args.fn_dup_tabix, GC_inf = args.fn_GC_DTS) X, idx_s, idx_e = g.get_gt_matrix(args.contig, args.start - args.pad, args.end + args.pad) starts = g.wnd_starts[idx_s:idx_e] data = pd.DataFrame(X, index = g.indivs, columns = map(str, starts)) pop_data = pop_data.loc[pop_data["sample"].isin(g.indivs)] unique_labels = pop_data[label].unique().tolist() pop_data["color"] = pop_data[label].map(lambda x: colors[unique_labels.index(x)]) pop_data["label"] = pop_data[label] alpha = 0.6 fig = plt.figure(figsize=(24,15)) if args.genes is not None: genes_h = 0.1
GC_inf = GC_data(args.fn_GC_DTS, args.contig, args.fn_DTS_contigs)

# Individuals to genotype, in priority order: explicit subset, then the
# manifest's "sample" column, then everything listed in the gglob index.
if args.subset_indivs is not None:
    indivs = args.subset_indivs
elif args.manifest is not None:
    # Item access is required here: `.sample` on a DataFrame is the
    # DataFrame.sample() method, not the "sample" column.
    indivs = pd.read_table(args.manifest, header=0)["sample"].unique().tolist()
else:
    indivs = list(pd.read_json("%s/gglob.idx" % args.gglob_dir).indivs)

# GENOTYPE TIME!

g = gt.genotyper(args.contig, gglob_dir=args.gglob_dir, plot_dir=args.plot_dir, subset_indivs=indivs, fn_fa=args.fn_fa, dup_tabix=tbx_dups, GC_inf=GC_inf)

# Headerless, tab-separated region table; the four columns are named
# here, which also fails loudly if the file has a different column count.
regions = pd.read_csv(args.fn_regions, header=None, delimiter="\t", index_col=None)
regions.columns = ["chr", "start", "end", "name"]
regions_by_contig = regions[regions['chr'] == args.contig]
nregions = regions_by_contig.shape[0]

# The handle is left open for writes that follow this fragment.
# NOTE(review): the matching close() is outside this view - confirm.
FOUT = open(args.fn_out, 'w')
# The header row is written only for the designated header contig and the
# first subset.
if args.contig == args.header_chr and subset == 0:
    FOUT.write("chr\tstart\tend\tname\t%s\n" % ("\t".join(indivs)))
def resolve_overlapping_clusters(self, ll_cutoff, tbx_dups, indiv_id, indiv_DTS, ref_DTSs, dCGHs, gglob_dir, out_viz_dir, verbose=False, min_overlapping=2, subset_indivs=None, min_d=0):
    """Resolve overlapping call clusters into individual final calls.

    Outline of the approach (from the original notes):
      1. do the recip overlap cluster - clusters very similar calls
         w/ similar break-points
      2. make sure there are at least 3 calls in there
         (w/ similar breakpoints)
      3. make sure those calls sum to a log likelihood of <3
      4. collapse overlaps
      5. get the best breakpoints

    Written with constructs that are valid on both Python 2.7 and
    Python 3: ``stderr.write`` instead of ``print >>stderr``,
    single-argument ``print(...)`` and ``.items()``.

    Args:
        ll_cutoff: likelihood cutoff forwarded to ``overlap_resolve`` and
            echoed in the final log line.
        tbx_dups: forwarded unchanged to ``overlap_resolve``.
        indiv_id: individual identifier passed to ``gX.is_var``.
        indiv_DTS: object providing ``get_cps_by_chr`` and
            ``get_wnds_by_chr`` for the individual being called.
        ref_DTSs: mapping of reference name to an object providing
            ``get_cps_by_chr``.
        dCGHs: forwarded to ``self.get_curr_chr_dCGHs``.
        gglob_dir: forwarded to ``genotyper`` as ``gglob_dir``.
        out_viz_dir: forwarded to ``genotyper`` as ``plot_dir``.
        verbose: accepted for interface compatibility; not used here.
        min_overlapping: forwarded to ``overlap_resolve``.
        subset_indivs: forwarded to ``genotyper``.
        min_d: lower bound on the delta for which a cluster is still
            checked by GMM genotyping.

    Returns:
        list: one ``self.get_final_call(...)`` result per retained
        cluster, accumulated over every chromosome in
        ``self.overlapping_calls_by_chr``.
    """
    stderr.write("resolving breakpoints...\n")
    final_calls = []

    # Constant across all clusters, so it is set once.
    overlap_cutoff = 0.85
    #overlap_cutoff = 0.75

    # `contig` rather than `chr`, so the builtin chr() is not shadowed.
    for contig, overlapping_calls in self.overlapping_calls_by_chr.items():
        stderr.write("%s %d calls in this chr\n" % (contig, len(overlapping_calls)))

        indiv_cps = indiv_DTS.get_cps_by_chr(contig)
        ref_cps = {}
        for ref, refDTS in ref_DTSs.items():
            ref_cps[ref] = refDTS.get_cps_by_chr(contig)

        g = genotyper(contig, gglob_dir=gglob_dir, plot_dir=out_viz_dir, subset_indivs=subset_indivs)

        curr_dCGHs = self.get_curr_chr_dCGHs(contig, dCGHs)
        wnd_starts, wnd_ends = indiv_DTS.get_wnds_by_chr(contig)

        t = time.time()
        # Starts at 0 (not -1) so the reported figure equals the number of
        # clusters actually assessed.
        n_assessed = 0
        for overlap_cluster in overlapping_calls:
            n_assessed += 1

            resolved_calls = overlap_cluster.overlap_resolve(overlap_cutoff, ll_cutoff, tbx_dups, min_overlapping=min_overlapping)
            if len(resolved_calls) == 0:
                continue

            # Take the resolved calls and genotype them to ensure the
            # segment is actually variable.
            variable_clusts = []
            for clust in resolved_calls:
                d = self.get_delta(clust, wnd_starts, wnd_ends, curr_dCGHs, indiv_cps, ref_cps)
                if clust.size == 1:
                    print("skipping single call cluster")
                    continue

                if d > 1.0:
                    # Large delta: accept without further checks.
                    variable_clusts.append(clust)
                elif d > min_d:
                    # Borderline delta: keep only if the GMM fit has more
                    # than one component and the individual is variable.
                    X, idx_s, idx_e = g.get_gt_matrix(contig, clust.get_med_start(), clust.get_med_end())
                    gX = g.GMM_genotype(X)
                    if gX.gmm.n_components > 1 and gX.is_var(indiv_id, g):
                        variable_clusts.append(clust)

            overlapping_call_clusts = get_overlapping_call_clusts(variable_clusts)
            #overlapping_call_clusts = get_overlapping_call_clusts(resolved_calls, flatten = True)

            for clust in overlapping_call_clusts:
                final_call = self.get_final_call(clust)
                final_calls.append(final_call)

        # NOTE(review): emitted once per chromosome; the original layout was
        # lost, so confirm it was not meant to be a per-cluster progress line.
        print("n-final_calls:%d n_assessed:%d time_elapsed:%fs" % (len(final_calls), n_assessed, time.time() - t))

    stderr.write("done\n")
    stderr.write("%d calls with likelihood <%f\n" % (len(final_calls), ll_cutoff))

    return final_calls