Ejemplo n.º 1
0
def genotype_DTS_regions(dts_list_file, regions_file, contigs, window):
    """
    Yield a table of copy numbers for each region in each DenseTrackSet.

    This is a generator. The first row yielded is the header
    ``["chromosome", "start", "end"]`` followed by one column per sample
    (the basename of each DTS path, in sorted path order). Every later row
    is a list of strings: the region's chromosome, start and end, followed
    by the copy number of that region in each sample, in header order.

    Arguments:
    dts_list_file -- path to a text file with one DTS path per line
    regions_file  -- path to the regions file; parsed by ``get_regions``
    contigs       -- path to a tab-delimited contigs file whose first column
                     holds the supported contig names; the same path is
                     also handed to ``wnd_cp_indiv``
    window        -- window size handed through to ``wnd_cp_indiv``

    Blank lines in ``dts_list_file`` and ``contigs`` are ignored, so a
    trailing newline does not produce an empty DTS path or contig name.
    """
    # Load supported contigs (first tab-delimited column of each line).
    with open(contigs, "r") as fh:
        supported_contigs = {
            line.strip().split("\t")[0] for line in fh if line.strip()
        }

    regions_by_chromosome = get_regions(regions_file, supported_contigs)

    # Load the DTS paths, skipping blank lines, and sort them so that the
    # sample columns come out in a deterministic order.
    with open(dts_list_file, "r") as fh:
        stripped_lines = (line.strip() for line in fh)
        dts_list = sorted(path for path in stripped_lines if path)

    sample_names = [os.path.basename(dts) for dts in dts_list]

    yield ["chromosome", "start", "end"] + sample_names

    # .items() (rather than the Python 2-only .iteritems()) keeps this
    # loop working on both Python 2 and Python 3 dictionaries.
    for chromosome, regions in regions_by_chromosome.items():
        starts = regions["starts"]
        ends = regions["ends"]

        # Keyed by sample name; if two DTS paths share a basename the later
        # one overwrites the earlier one.
        copies_by_sample = {}
        for dts, sample_name in zip(dts_list, sample_names):
            # NOTE(review): each DTS is re-opened for every chromosome. It is
            # deliberately not hoisted out of the loop because keeping every
            # sample open at once may be expensive -- confirm before changing.
            sample = wnd_cp_indiv(dts, contigs, window)
            copies_by_sample[sample_name] = sample.get_cp_by_regions(
                chromosome,
                starts,
                ends
            )

        for i, start in enumerate(starts):
            coordinates = [chromosome, str(start), str(ends[i])]
            copies = [str(copies_by_sample[name][i]) for name in sample_names]
            yield coordinates + copies
Ejemplo n.º 2
0
    # Remaining command-line options; the parser itself and any earlier
    # options are created above this excerpt.
    parser.add_argument('--window_size', type=int, default=None, help='Size of SUNK/wssd sliding windows')
    parser.add_argument('--min_ref_cp_delta', dest='min_d', type=float, default=0, help='Smallest difference in cp between ref and sample to consider (Default: %(default)s)')
    # store_false: o.P_adjust is True unless --no_P_value_adjust is given.
    parser.add_argument('--no_P_value_adjust', dest='P_adjust', action='store_false')
    parser.add_argument('--min_mu', default=0.5, type=float, help='Minimum cluster mean (Default: %(default)s)')
    parser.add_argument('--subset_indivs', default=None, help='Colon-separated list of individuals to consider (Default: %(default)s)')

    o = parser.parse_args()    


    subset_indivs = o.subset_indivs
    
    # --subset_indivs arrives as a single colon-separated string: split it and
    # drop duplicates. Going through set() does not preserve the input order.
    if subset_indivs != None:
        subset_indivs = subset_indivs.split(":")
        subset_indivs = list(set(subset_indivs))

    # wnd_cp_indiv object for the individual's own DTS.
    indiv_DTS = wnd_cp_indiv(o.fn_indiv_DTS, o.fn_contigs, o.window_size) 
    # Individual id = last path component of the DTS path with every
    # occurrence of "500_bp_" removed.
    indiv_id = o.fn_indiv_DTS.split("/")[-1].replace("500_bp_","")

    # o.fn_ref_DTS is a colon-separated list of reference DTS paths. Both
    # dictionaries are keyed by the reference's file name minus "500_bp_":
    #   dCGHs    -> dCGH object built from the individual's DTS and the reference
    #   ref_DTSs -> wnd_cp_indiv object for the reference alone
    ref_DTSs = {}
    dCGHs = {}
    for fn_ref in o.fn_ref_DTS.split(":"):
        dCGHs[fn_ref.split("/")[-1].replace("500_bp_","")] = dCGH(o.fn_indiv_DTS, 
                                                                  fn_ref,
                                                                  o.fn_contigs,
                                                                  o.window_size)
        
        ref_DTSs[fn_ref.split("/")[-1].replace("500_bp_","")] = wnd_cp_indiv(fn_ref, 
                                                                             o.fn_contigs, 
                                                                             o.window_size) 
        
    # Call table built from the path in o.fn_call_table.
    call_table = cluster.indiv_callset_table(o.fn_call_table) 
Ejemplo n.º 3
0
    # Command-line options (optparse); `opts` is created above this excerpt.
    opts.add_option('','--fn_DTS',dest='fn_DTS', default=None)
    opts.add_option('','--contigs',dest='fn_contigs', default=None)
    opts.add_option('','--wnd_size',dest='wnd_size', type=int, default=None)
    #opts.add_option('','--wnd_slide',dest='wnd_slide', type=int, default=None)
    opts.add_option('','--out_dir',dest='out_dir')
    opts.add_option('','--fn_out',dest='fn_out')
    opts.add_option('','--contig_prefix',dest='contig_prefix', default="")
    opts.add_option('','--DTS_prefix',dest='DTS_prefix', default="500_bp_")
    # NOTE(review): the default below is a site-specific absolute path.
    opts.add_option('','--output_contigs',dest='output_contigs', default="/net/eichler/vol7/home/psudmant/genomes/contigs/hg19_contigs.txt")

    
    (o, args) = opts.parse_args()
    #usage, init, then run
    
    # Sample name = last path component of the DTS path with every occurrence
    # of "500_bp_" removed.
    # NOTE(review): --DTS_prefix is parsed above, but this line strips the
    # literal "500_bp_" instead of o.DTS_prefix -- confirm this is intended.
    indiv = o.fn_DTS.split("/")[-1].replace("500_bp_","")
    wnd_cp = wnd_cp_indiv(o.fn_DTS, o.fn_contigs, o.wnd_size)
    """
    outstr:
    chr start end indiv 0 0 0 color
    """
    
    # `output` is a helper defined elsewhere; it receives the contig prefix
    # and the output-contigs path.
    c_out = output(o.contig_prefix, o.output_contigs) 
    for contig in wnd_cp.contigs:   
        # Python 2 print statement: progress message (contig name) to stderr.
        print  >>stderr, contig
        
        # Copy-number values and window start/end coordinates for this contig.
        cps = wnd_cp.get_cps_by_chr(contig)
        wnd_starts, wnd_ends = wnd_cp.get_wnds_by_chr(contig)
         
        # Reset for every contig.
        prev_start = 0
        # Visits every window index except the last one (shape[0]-1 is excluded).
        for i in xrange(0, cps.shape[0]-1):
            s, e = wnd_starts[i], wnd_ends[i]
Ejemplo n.º 4
0
    def init_from_DTS(cls, **kwargs):
        """
        Build an instance by loading one contig's copy numbers for every
        individual from the regular and the SUNK DenseTrackSets.

        All of the keyword arguments below are required; a missing one
        raises KeyError.

        gglob.init_from_DTS(DTS_dir = DTS_dir,
                            DTS_prefix = DTS_prefix,
                            sunk_DTS_dir = sunk_DTS_dir,
                            sunk_DTS_prefix = sunk_DTS_prefix,
                            wnd_size = wnd_size,
                            wnd_slide = wnd_slide,
                            indivs = indivs,
                            contig = contig,
                            fn_contigs = fn_contigs,
                            fn_sunk_contigs = fn_sunk_contigs)

        Each individual's files are looked up as
        "<DTS_dir>/<DTS_prefix><indiv>" and
        "<sunk_DTS_dir>/<sunk_DTS_prefix><indiv>".

        Returns cls(...) built from the window coordinates of the first
        individual and two matrices (regular and SUNK) with one row per
        individual and one column per window.

        Indexing indivs[0] fails (IndexError for an empty list) when no
        individuals are given.
        """

        DTS_dir = kwargs["DTS_dir"]
        DTS_prefix = kwargs["DTS_prefix"]

        sunk_DTS_dir = kwargs["sunk_DTS_dir"]
        sunk_DTS_prefix = kwargs["sunk_DTS_prefix"]

        wnd_size = kwargs['wnd_size']
        wnd_slide = kwargs['wnd_slide']

        indivs = kwargs['indivs']
        contig = kwargs['contig']

        fn_contigs = kwargs['fn_contigs']
        fn_sunk_contigs = kwargs['fn_sunk_contigs']

        DTS_pre = "%s/%s" % (DTS_dir, DTS_prefix)
        # The SUNK paths use the SUNK-specific prefix. Previously DTS_prefix
        # was used here, which left the required sunk_DTS_prefix argument
        # unused and pointed at the wrong files when the two prefixes differ.
        sunk_DTS_pre = "%s/%s" % (sunk_DTS_dir, sunk_DTS_prefix)

        n_indivs = len(indivs)

        # The first individual supplies the window coordinates and therefore
        # the number of columns in each matrix.
        first_wnd_cp = wnd_cp_indiv("%s%s" % (DTS_pre, indivs[0]), fn_contigs,
                                    wnd_size)
        wnd_starts, wnd_ends = first_wnd_cp.get_wnds_by_chr(contig)
        cp_matrix = np.zeros((n_indivs, wnd_starts.shape[0]))

        first_sunk_wnd_cp = wnd_cp_indiv("%s%s" % (sunk_DTS_pre, indivs[0]),
                                         fn_sunk_contigs, wnd_size)
        sunk_wnd_starts, sunk_wnd_ends = first_sunk_wnd_cp.get_wnds_by_chr(
            contig)
        sunk_cp_matrix = np.zeros((n_indivs, sunk_wnd_starts.shape[0]))

        # The `correct` flag of get_cps_by_chr is switched off for chrX/chrY.
        correct = contig not in ("chrY", "chrX")

        for i, indiv in enumerate(indivs):
            # Progress message: one line per individual on stderr.
            print(indiv, file=stderr)

            wnd_cp = wnd_cp_indiv("%s%s" % (DTS_pre, indiv), fn_contigs,
                                  wnd_size)
            cp_matrix[i, :] = wnd_cp.get_cps_by_chr(contig, correct=correct)

            sunk_wnd_cp = wnd_cp_indiv("%s%s" % (sunk_DTS_pre, indiv),
                                       fn_sunk_contigs, wnd_size)
            sunk_cp_matrix[i, :] = sunk_wnd_cp.get_cps_by_chr(contig,
                                                              correct=correct)

        return cls(indivs=indivs,
                   wnd_size=wnd_size,
                   wnd_slide=wnd_slide,
                   contig=contig,
                   wnd_starts=wnd_starts,
                   wnd_ends=wnd_ends,
                   cp_matrix=cp_matrix,
                   sunk_wnd_starts=sunk_wnd_starts,
                   sunk_wnd_ends=sunk_wnd_ends,
                   sunk_cp_matrix=sunk_cp_matrix)
Ejemplo n.º 5
0
    def init_from_DTS(cls, **kwargs):
        """
        Build an instance by loading one contig's copy numbers for every
        individual from the regular and the SUNK DenseTrackSets.

        All of the keyword arguments below are required; a missing one
        raises KeyError.

        gglob.init_from_DTS(DTS_dir = DTS_dir,
                            DTS_prefix = DTS_prefix,
                            sunk_DTS_dir = sunk_DTS_dir,
                            sunk_DTS_prefix = sunk_DTS_prefix,
                            wnd_size = wnd_size,
                            wnd_slide = wnd_slide,
                            indivs = indivs,
                            contig = contig,
                            fn_contigs = fn_contigs,
                            fn_sunk_contigs = fn_sunk_contigs)

        Each individual's files are looked up as
        "<DTS_dir>/<DTS_prefix><indiv>" and
        "<sunk_DTS_dir>/<sunk_DTS_prefix><indiv>".

        Returns cls(...) built from the window coordinates of the first
        individual and two matrices (regular and SUNK) with one row per
        individual and one column per window.

        Indexing indivs[0] fails (IndexError for an empty list) when no
        individuals are given.
        """

        DTS_dir = kwargs["DTS_dir"]
        DTS_prefix = kwargs["DTS_prefix"]

        sunk_DTS_dir = kwargs["sunk_DTS_dir"]
        sunk_DTS_prefix = kwargs["sunk_DTS_prefix"]

        wnd_size = kwargs['wnd_size']
        wnd_slide = kwargs['wnd_slide']

        indivs = kwargs['indivs']
        contig = kwargs['contig']

        fn_contigs = kwargs['fn_contigs']
        fn_sunk_contigs = kwargs['fn_sunk_contigs']

        DTS_pre = "%s/%s" % (DTS_dir, DTS_prefix)
        # The SUNK paths use the SUNK-specific prefix. Previously DTS_prefix
        # was used here, which left the required sunk_DTS_prefix argument
        # unused and pointed at the wrong files when the two prefixes differ.
        sunk_DTS_pre = "%s/%s" % (sunk_DTS_dir, sunk_DTS_prefix)

        n_indivs = len(indivs)

        # The first individual supplies the window coordinates and therefore
        # the number of columns in each matrix.
        first_wnd_cp = wnd_cp_indiv("%s%s" % (DTS_pre, indivs[0]),
                                    fn_contigs,
                                    wnd_size)
        wnd_starts, wnd_ends = first_wnd_cp.get_wnds_by_chr(contig)
        cp_matrix = np.zeros((n_indivs, wnd_starts.shape[0]))

        first_sunk_wnd_cp = wnd_cp_indiv("%s%s" % (sunk_DTS_pre, indivs[0]),
                                         fn_sunk_contigs,
                                         wnd_size)
        sunk_wnd_starts, sunk_wnd_ends = first_sunk_wnd_cp.get_wnds_by_chr(contig)
        sunk_cp_matrix = np.zeros((n_indivs, sunk_wnd_starts.shape[0]))

        # The `correct` flag of get_cps_by_chr is switched off for chrX/chrY.
        correct = contig not in ("chrY", "chrX")

        for i, indiv in enumerate(indivs):
            # Python 2 print statement: progress message to stderr.
            print >> stderr, indiv

            wnd_cp = wnd_cp_indiv("%s%s" % (DTS_pre, indiv),
                                  fn_contigs,
                                  wnd_size)
            cp_matrix[i, :] = wnd_cp.get_cps_by_chr(contig, correct=correct)

            sunk_wnd_cp = wnd_cp_indiv("%s%s" % (sunk_DTS_pre, indiv),
                                       fn_sunk_contigs,
                                       wnd_size)
            sunk_cp_matrix[i, :] = sunk_wnd_cp.get_cps_by_chr(contig,
                                                              correct=correct)

        return cls(indivs=indivs,
                   wnd_size=wnd_size,
                   wnd_slide=wnd_slide,
                   contig=contig,
                   wnd_starts=wnd_starts,
                   wnd_ends=wnd_ends,
                   cp_matrix=cp_matrix,
                   sunk_wnd_starts=sunk_wnd_starts,
                   sunk_wnd_ends=sunk_wnd_ends,
                   sunk_cp_matrix=sunk_cp_matrix)
Ejemplo n.º 6
0
    # Remaining command-line options (optparse); `opts` and the earlier
    # options are created above this excerpt.
    opts.add_option('', '--fn_out', dest='fn_out')
    opts.add_option('', '--contig_prefix', dest='contig_prefix', default="")
    opts.add_option('', '--DTS_prefix', dest='DTS_prefix', default="500_bp_")
    # NOTE(review): the default below is a site-specific absolute path.
    opts.add_option(
        '',
        '--output_contigs',
        dest='output_contigs',
        default=
        "/net/eichler/vol27/projects/human_diversity/nobackups/hsiehph/genomicData/EEElab/read_depth_genotyper/hg19_contigs.txt"
    )

    (o, args) = opts.parse_args()
    #usage, init, then run

    # Sample name = last path component of the DTS path with every occurrence
    # of "500_bp_" removed.
    # NOTE(review): --DTS_prefix is parsed above, but this line strips the
    # literal "500_bp_" instead of o.DTS_prefix -- confirm this is intended.
    indiv = o.fn_DTS.split("/")[-1].replace("500_bp_", "")
    wnd_cp = wnd_cp_indiv(o.fn_DTS, o.fn_contigs, o.wnd_size)
    """
    outstr:
    chr start end indiv 0 0 0 color
    """

    # `output` is a helper defined elsewhere; it receives the contig prefix
    # and the output-contigs path.
    c_out = output(o.contig_prefix, o.output_contigs)
    for contig in wnd_cp.contigs:
        # Progress message: contig name on stderr.
        print(contig, file=stderr)

        # Copy-number values and window start/end coordinates for this contig.
        cps = wnd_cp.get_cps_by_chr(contig)
        wnd_starts, wnd_ends = wnd_cp.get_wnds_by_chr(contig)

        # Reset for every contig.
        prev_start = 0
        # Visits every window index except the last one (shape[0] - 1 is excluded).
        for i in range(0, cps.shape[0] - 1):
            s, e = wnd_starts[i], wnd_ends[i]
Ejemplo n.º 7
0
    # Keep only the contigs whose name contains none of "random", "X" or "Y".
    # These are substring tests, so any contig name containing one of those
    # strings is dropped.
    chrs = []
    for c in tbx_gc.contigs:
        if (not "random" in c) and (not "X" in c) and (not "Y" in c):
            chrs.append(c)
    
    fn_contigs = o.fn_contigs
    wnd =  int(o.window_size)

    ##
    #cutoff_scale=float(bp_cutoff_scale/wnd)
    #max_merge = o.max_merge_dif 
    #cp_data = dCGH(o.fn_in_DTS,o.fn_ref_DTS,o.fn_contigs,wnd)    
    #segment_callset = callset()
    #caller_by_chr = {}
    #dCGH(o.fn_in_DTS,o.fn_ref_DTS,o.fn_contigs,wnd)    
    # wnd_cp_indiv object for the input DTS.
    cp_data = wnd_cp_indiv(o.fn_in_DTS,o.fn_contigs,wnd)
    
    # GC DenseTrackSet opened with openMode='r' and overwrite=False.
    GC_DTS = DenseTrackSet(o.fn_contigs,o.fn_GC_DTS,overwrite=False,openMode='r')
    null_dist = null_distribution(tbx_gc)
    
    # NOTE(review): the loop variable `chr` shadows the builtin chr().
    for chr in chrs:
        # Python 2 print statement: progress message to stderr.
        print >>stderr,"%s..."%chr
        magnitude_vect = cp_data.get_cps_by_chr(chr)
        starts_vect,ends_vect = cp_data.get_wnds_by_chr(chr) 

        #plot_GC(chr,tbx_gc,magnitude_vect,starts_vect,ends_vect)
        #print magnitude_vect[0:1000] 
        # Windows on this chromosome overlapping entries of tbx_gaps / tbx_dups.
        gapped_wnds = cp_data.get_overlapping_wnds(chr,tbx_gaps) 
        segdup_wnds = cp_data.get_overlapping_wnds(chr,tbx_dups) 
        # Feed this chromosome's values into the null distribution together
        # with the two window sets (presumably the windows to exclude --
        # confirm against null_distribution.add).
        null_dist.add(magnitude_vect,[gapped_wnds,segdup_wnds])
        
Ejemplo n.º 8
0
    # Last command-line option; the parser and the earlier options are
    # created above this excerpt.
    parser.add_argument(
        '--subset_indivs',
        default=None,
        help=
        'Colon-separated list of individuals to consider (Default: %(default)s)'
    )

    o = parser.parse_args()

    subset_indivs = o.subset_indivs

    # --subset_indivs arrives as a single colon-separated string: split it and
    # drop duplicates. Going through set() does not preserve the input order.
    if subset_indivs != None:
        subset_indivs = subset_indivs.split(":")
        subset_indivs = list(set(subset_indivs))

    # wnd_cp_indiv object for the individual's own DTS.
    indiv_DTS = wnd_cp_indiv(o.fn_indiv_DTS, o.fn_contigs, o.window_size)
    # Individual id = last path component of the DTS path with every
    # occurrence of "500_bp_" removed.
    indiv_id = o.fn_indiv_DTS.split("/")[-1].replace("500_bp_", "")

    # o.fn_ref_DTS is a colon-separated list of reference DTS paths. Both
    # dictionaries are keyed by the reference's file name minus "500_bp_":
    #   dCGHs    -> dCGH object built from the individual's DTS and the reference
    #   ref_DTSs -> wnd_cp_indiv object for the reference alone
    ref_DTSs = {}
    dCGHs = {}
    for fn_ref in o.fn_ref_DTS.split(":"):
        dCGHs[fn_ref.split("/")[-1].replace("500_bp_", "")] = dCGH(
            o.fn_indiv_DTS, fn_ref, o.fn_contigs, o.window_size)

        ref_DTSs[fn_ref.split("/")[-1].replace("500_bp_", "")] = wnd_cp_indiv(
            fn_ref, o.fn_contigs, o.window_size)

    # Call table built from the path in o.fn_call_table.
    call_table = cluster.indiv_callset_table(o.fn_call_table)

    # Optionally restrict the call table to the chromosome in o.limit_to_chr.
    if o.limit_to_chr:
        call_table.filter_by_chr(o.limit_to_chr)