target_loci = None

    if o.fn_target_loci != None:
        target_loci = []
        for l in open(o.fn_target_loci):
            c, s, e = l.rstrip().split()
            s, e = int(s), int(e)
            if c == contig: target_loci.append([c, s, e])

    tbx_dups = pysam.Tabixfile(o.fn_dup_tabix)
    callset_clust = cluster.cluster_callsets(o.fn_call_table, contig)
    g = gt.genotyper(contig,
                     gglob_dir=o.gglob_dir,
                     plot_dir=o.out_viz_dir,
                     subset_indivs=subset_indivs,
                     fn_fa=o.fn_fa,
                     dup_tabix=tbx_dups,
                     GC_inf=GC_inf)

    fn_sunk_gt_out = o.fn_gt_out.replace(".genotypes", ".sunk_genotypes")

    F_gt = open(o.fn_gt_out, 'w')
    F_sunk_gt = open(fn_sunk_gt_out, 'w')
    F_VCF = open(o.fn_vcf_out, 'w')
    F_call = open(o.fn_call_out, 'w')
    FINF = open("%s.info" % o.fn_call_out, 'w')

    info_ob = info_io.info_io(FINF)

    g.setup_output(F_gt, F_sunk_gt, F_VCF, F_call, info_ob)
    if o.fn_target_loci != None:
        target_loci = []
        for l in open(o.fn_target_loci):
            c, s, e = l.rstrip().split()
            s, e = int(s), int(e)
            if c == contig:
                target_loci.append([c, s, e])

    tbx_dups = pysam.Tabixfile(o.fn_dup_tabix)
    callset_clust = cluster.cluster_callsets(o.fn_call_table, contig)
    g = gt.genotyper(
        contig,
        gglob_dir=o.gglob_dir,
        plot_dir=o.out_viz_dir,
        subset_indivs=subset_indivs,
        fn_fa=o.fn_fa,
        dup_tabix=tbx_dups,
        GC_inf=GC_inf,
    )

    fn_sunk_gt_out = o.fn_gt_out.replace(".genotypes", ".sunk_genotypes")

    F_gt = open(o.fn_gt_out, "w")
    F_sunk_gt = open(fn_sunk_gt_out, "w")
    F_VCF = open(o.fn_vcf_out, "w")
    F_call = open(o.fn_call_out, "w")
    FINF = open("%s.info" % o.fn_call_out, "w")

    info_ob = info_io.info_io(FINF)
Esempio n. 3
0
    def resolve_overlapping_clusters(self,
                                     ll_cutoff,
                                     tbx_dups,
                                     indiv_id,
                                     indiv_DTS,
                                     ref_DTSs,
                                     dCGHs,
                                     gglob_dir,
                                     out_viz_dir,
                                     verbose=False,
                                     min_overlapping=2,
                                     subset_indivs=None,
                                     min_d=0):
        """Resolve overlapping call clusters into a flat list of final calls.

        For every chromosome in ``self.overlapping_calls_by_chr``:
            1. each overlap cluster is split with ``overlap_resolve`` using a
               fixed overlap cutoff of 0.85 plus ``ll_cutoff``, ``tbx_dups``
               and ``min_overlapping``
            2. resolved clusters whose ``size`` is 1 are skipped
            3. a remaining cluster is kept when ``self.get_delta`` exceeds
               1.0, or when it exceeds ``min_d`` and GMM genotyping of its
               median interval finds more than one component and
               ``is_var(indiv_id, g)`` is true
            4. kept clusters are grouped by ``get_overlapping_call_clusts``
               and each group is collapsed with ``self.get_final_call``

        Args:
            ll_cutoff: forwarded to ``overlap_resolve``; also echoed in the
                final summary line.
            tbx_dups: forwarded to ``overlap_resolve``.
            indiv_id: forwarded to ``is_var`` on the genotyping result.
            indiv_DTS: object providing ``get_cps_by_chr`` and
                ``get_wnds_by_chr`` for the individual.
            ref_DTSs: mapping of reference name to an object providing
                ``get_cps_by_chr``.
            dCGHs: forwarded to ``self.get_curr_chr_dCGHs``.
            gglob_dir: passed to ``genotyper`` as ``gglob_dir``.
            out_viz_dir: passed to ``genotyper`` as ``plot_dir``.
            verbose: when true, print each genotype matrix that gets
                GMM-genotyped to stdout.
            min_overlapping: forwarded to ``overlap_resolve``.
            subset_indivs: passed to ``genotyper`` as ``subset_indivs``.
            min_d: lower bound on delta for a cluster to be genotyped.

        Returns:
            list of the values returned by ``self.get_final_call``,
            accumulated over all chromosomes.
        """
        print("resolving breakpoints...", file=stderr)
        final_calls = []
        overlap_cutoff = 0.85

        # `contig` rather than `chr` so the builtin chr() is not shadowed.
        for contig, overlapping_calls in self.overlapping_calls_by_chr.items():
            print(contig,
                  "%d calls in this chr" % (len(overlapping_calls)),
                  file=stderr)

            indiv_cps = indiv_DTS.get_cps_by_chr(contig)
            ref_cps = {}
            for ref, refDTS in ref_DTSs.items():
                ref_cps[ref] = refDTS.get_cps_by_chr(contig)

            g = genotyper(contig,
                          gglob_dir=gglob_dir,
                          plot_dir=out_viz_dir,
                          subset_indivs=subset_indivs)

            curr_dCGHs = self.get_curr_chr_dCGHs(contig, dCGHs)
            wnd_starts, wnd_ends = indiv_DTS.get_wnds_by_chr(contig)

            t = time.time()
            # Start at 0 so the summary reports the number of clusters
            # actually assessed (a -1 start under-reports by one).
            n_assessed = 0
            for overlap_cluster in overlapping_calls:
                n_assessed += 1

                resolved_calls = overlap_cluster.overlap_resolve(
                    overlap_cutoff,
                    ll_cutoff,
                    tbx_dups,
                    min_overlapping=min_overlapping)

                if len(resolved_calls) == 0:
                    continue

                # Genotype the resolved calls to make sure each segment is
                # actually variable before it can become a final call.
                variable_clusts = []
                for clust in resolved_calls:
                    # Check size first: single-call clusters are dropped
                    # regardless of delta, so don't compute it for them.
                    if clust.size == 1:
                        print("skipping single call cluster")
                        continue

                    d = self.get_delta(clust, wnd_starts, wnd_ends, curr_dCGHs,
                                       indiv_cps, ref_cps)
                    if d > 1.0:
                        variable_clusts.append(clust)
                    elif d > min_d:
                        X, _idx_s, _idx_e = g.get_gt_matrix(
                            contig, clust.get_med_start(), clust.get_med_end())
                        if verbose:
                            # Debug dump of the genotype matrix.
                            print(X)
                        gX = g.GMM_genotype(X)
                        if gX.gmm.n_components > 1 and gX.is_var(indiv_id, g):
                            variable_clusts.append(clust)

                for clust in get_overlapping_call_clusts(variable_clusts):
                    final_calls.append(self.get_final_call(clust))

            print("n-final_calls:%d n_assessed:%d time_elapsed:%fs" %
                  (len(final_calls), n_assessed, time.time() - t))

        print("done", file=stderr)
        print("%d calls with likelihood <%f" % (len(final_calls), ll_cutoff),
              file=stderr)
        return final_calls
Esempio n. 4
0
        #colors = ["red", "chocolate", "darkgreen", "gold", "darkcyan", "darkblue", "darkorchid"] + list(plt.cm.Greys(np.linspace(0.5, 1, 2)))
    elif args.pop_color_scheme == "primate":
        label = "species_label"
        n_colors = len(pop_data[label].unique().tolist())   
        colors = ["royalblue"] + list(plt.cm.Greys(np.linspace(1, 0.25, 4)))
    else:
        if args.custom_label is None:
            print "Must specify custom label for custom color scheme"
            sys.exit(1)
        label = args.custom_label
        n_colors = len(pop_data[label].unique().tolist())
        colors = list(plt.cm.hsv(np.linspace(0, 0.9, n_colors-2))) + list(plt.cm.Greys(np.linspace(0.5, 1, 2)))

    indivs = pop_data.sample.unique().tolist()

    g = gt.genotyper(args.contig, gglob_dir = args.gglob_dir, plot_dir = args.plot_dir, subset_indivs = indivs, fn_fa=args.fn_fa, dup_tabix = args.fn_dup_tabix, GC_inf = args.fn_GC_DTS)
    X, idx_s, idx_e = g.get_gt_matrix(args.contig, args.start - args.pad, args.end + args.pad)
    starts = g.wnd_starts[idx_s:idx_e]
    data = pd.DataFrame(X, index = g.indivs, columns = map(str, starts))

    pop_data = pop_data.loc[pop_data["sample"].isin(g.indivs)]
    unique_labels = pop_data[label].unique().tolist()
    pop_data["color"] = pop_data[label].map(lambda x: colors[unique_labels.index(x)])
    pop_data["label"] = pop_data[label]

    alpha = 0.6

    fig = plt.figure(figsize=(24,15))

    if args.genes is not None:
        genes_h = 0.1
Esempio n. 5
0
    GC_inf = GC_data(args.fn_GC_DTS, args.contig, args.fn_DTS_contigs)

    if args.subset_indivs is not None:
        indivs = args.subset_indivs
    elif args.manifest is not None:
        indivs = pd.read_table(args.manifest,
                               header=0).sample.unique().tolist()
    else:
        indivs = list(pd.read_json("%s/gglob.idx" % args.gglob_dir).indivs)

    # GENOTYPE TIME!

    g = gt.genotyper(args.contig,
                     gglob_dir=args.gglob_dir,
                     plot_dir=args.plot_dir,
                     subset_indivs=indivs,
                     fn_fa=args.fn_fa,
                     dup_tabix=tbx_dups,
                     GC_inf=GC_inf)

    regions = pd.read_csv(args.fn_regions,
                          header=None,
                          delimiter="\t",
                          index_col=None)
    regions.columns = ["chr", "start", "end", "name"]
    regions_by_contig = regions[regions['chr'] == args.contig]
    nregions = regions_by_contig.shape[0]

    FOUT = open(args.fn_out, 'w')
    if args.contig == args.header_chr and subset == 0:
        FOUT.write("chr\tstart\tend\tname\t%s\n" % ("\t".join(indivs)))
Esempio n. 6
0
    def resolve_overlapping_clusters(self, ll_cutoff, 
                                           tbx_dups, 
                                           indiv_id, 
                                           indiv_DTS, 
                                           ref_DTSs, 
                                           dCGHs, 
                                           gglob_dir, 
                                           out_viz_dir,
                                           verbose=False, 
                                           min_overlapping=2, 
                                           subset_indivs=None,
                                           min_d=0):
        """
        Resolve overlapping clusters into individual final calls.

        Per chromosome in self.overlapping_calls_by_chr:
            1. split each overlap cluster with overlap_resolve (fixed overlap
               cutoff 0.85, plus ll_cutoff, tbx_dups and min_overlapping)
            2. skip resolved clusters whose size is 1
            3. keep a cluster when self.get_delta is > 1.0, or when it is
               > min_d and GMM genotyping of its median interval yields more
               than one component and is_var(indiv_id, g) is true
            4. group kept clusters with get_overlapping_call_clusts and
               collapse each group with self.get_final_call

        Parameters:
            ll_cutoff        forwarded to overlap_resolve; echoed in the summary
            tbx_dups         forwarded to overlap_resolve
            indiv_id         forwarded to is_var on the genotyping result
            indiv_DTS        provides get_cps_by_chr / get_wnds_by_chr
            ref_DTSs         mapping ref name -> object with get_cps_by_chr
            dCGHs            forwarded to self.get_curr_chr_dCGHs
            gglob_dir        passed to genotyper as gglob_dir
            out_viz_dir      passed to genotyper as plot_dir
            verbose          accepted for interface compatibility; not used here
            min_overlapping  forwarded to overlap_resolve
            subset_indivs    passed to genotyper as subset_indivs
            min_d            lower bound on delta for a cluster to be genotyped

        Returns the list of final calls accumulated over all chromosomes.

        Progress messages use stderr.write() and single-argument print() so
        the method emits the same text on Python 2 and Python 3.
        """
        stderr.write("resolving breakpoints...\n")
        final_calls = []
        overlap_cutoff = 0.85

        # .items() exists on both Python 2 and 3 (iteritems() is Python 2 only);
        # the loop variable is not called `chr` so the builtin stays usable.
        for contig, overlapping_calls in self.overlapping_calls_by_chr.items():
            stderr.write("%s %d calls in this chr\n" % (contig, len(overlapping_calls)))

            indiv_cps = indiv_DTS.get_cps_by_chr(contig)
            ref_cps = {}
            for ref, refDTS in ref_DTSs.items():
                ref_cps[ref] = refDTS.get_cps_by_chr(contig)

            g = genotyper(contig, gglob_dir=gglob_dir, plot_dir=out_viz_dir, subset_indivs=subset_indivs)

            curr_dCGHs = self.get_curr_chr_dCGHs(contig, dCGHs)
            wnd_starts, wnd_ends = indiv_DTS.get_wnds_by_chr(contig)

            t = time.time()
            # Count from 0 so the summary shows how many clusters were assessed
            # (starting at -1 reported one too few).
            n_assessed = 0
            for overlap_cluster in overlapping_calls:
                n_assessed += 1

                resolved_calls = overlap_cluster.overlap_resolve(overlap_cutoff,
                                                                 ll_cutoff,
                                                                 tbx_dups,
                                                                 min_overlapping=min_overlapping)

                if len(resolved_calls) == 0:
                    continue

                # Take the resolved calls and genotype them to ensure each
                # segment is variable.
                variable_clusts = []
                for clust in resolved_calls:
                    # Single-call clusters are always skipped, so test the size
                    # before spending time on get_delta.
                    if clust.size == 1:
                        print("skipping single call cluster")
                        continue

                    d = self.get_delta(clust, wnd_starts, wnd_ends, curr_dCGHs, indiv_cps, ref_cps)
                    if d > 1.0:
                        variable_clusts.append(clust)
                    elif d > min_d:
                        X, _idx_s, _idx_e = g.get_gt_matrix(contig, clust.get_med_start(), clust.get_med_end())
                        gX = g.GMM_genotype(X)
                        if gX.gmm.n_components > 1 and gX.is_var(indiv_id, g):
                            variable_clusts.append(clust)

                overlapping_call_clusts = get_overlapping_call_clusts(variable_clusts)

                for clust in overlapping_call_clusts:
                    final_calls.append(self.get_final_call(clust))

            print("n-final_calls:%d n_assessed:%d time_elapsed:%fs" % (len(final_calls), n_assessed, time.time() - t))

        stderr.write("done\n")
        stderr.write("%d calls with likelihood <%f\n" % (len(final_calls), ll_cutoff))
        return final_calls