Example #1
def main():
    """
    Calculate the average coverage of one or more beta files.
    Print the results.
    """

    args = parse_args()

    sites = GenomicRegion(args).sites

    blocks_df = load_blocks_file(args.bed_file) if args.bed_file else None

    params = [(beta, sites, blocks_df, False) for beta in args.betas]
    # covs = [beta_cov(*p) for p in params]
    # return
    p = Pool(args.threads)
    covs = p.starmap(beta_cov, params)
    p.close()
    p.join()

    for cov, beta_path in zip(covs, args.betas):
        print('{}\t{:.2f}'.format(pretty_name(beta_path), cov))

    if args.plot:
        plot_hist([pretty_name(b) for b in args.betas], covs)
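The parallel pattern in this example (one parameter tuple per beta file, fanned out with Pool.starmap) can be shown in isolation. A minimal sketch, where fake_beta_cov is a hypothetical stand-in for beta_cov and only the multiprocessing pattern is taken from the example:

from multiprocessing import Pool

def fake_beta_cov(beta_path, sites, blocks_df, verbose):
    # stand-in for beta_cov: pretend every file has 30x coverage
    return 30.0

def average_coverages(betas, sites=None, blocks_df=None, threads=4):
    # one parameter tuple per beta file, mirroring the params list above
    params = [(beta, sites, blocks_df, False) for beta in betas]
    with Pool(threads) as p:
        covs = p.starmap(fake_beta_cov, params)
    return covs

print(average_coverages(['a.beta', 'b.beta']))  # [30.0, 30.0]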
Example #2
File: homog.py  Project: nloyfer/wgbs_tools
def main():
    """
    Generate homog files. Given a blocks file and pat[s],
    count the number of U,X,M reads for each block for each file
    """

    args = parse_args()
    if args.nr_bits not in (8, 16):
        raise IllegalArgumentError('nr_bits must be in {8, 16}')
    if args.rlen < 3:
        raise IllegalArgumentError('rlen must be >= 3')
    if args.thresholds is not None:
        th = args.thresholds.split(',')
        if len(th) != 2:
            raise IllegalArgumentError('Invalid thresholds')
        th = float(th[0]), float(th[1])
        if not (1 > th[1] > th[0] > 0):
            raise IllegalArgumentError('Invalid thresholds')
    # make sure homog tool is valid:
    validate_local_exe(homog_tool)

    pats = args.input_files
    validate_file_list(pats, '.pat.gz')

    outdir, prefix = parse_outdir_prefix(args)

    # load blocks:
    blocks_df = load_blocks_file(args.blocks_file)
    is_nice, msg = is_block_file_nice(blocks_df)
    if not is_nice:
        homog_log(msg)
        raise IllegalArgumentError(f'Invalid blocks file: {args.blocks_file}')

    for pat in sorted(pats):
        homog_process(pat, blocks_df, args, outdir, prefix)
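The thresholds validation above can be factored into a small helper. A minimal sketch, using a hypothetical parse_thresholds name and a plain ValueError in place of the project's IllegalArgumentError; the rules (two comma-separated numbers with 0 < low < high < 1) are taken from the example:

def parse_thresholds(thresholds):
    # expects a string like '0.25,0.75'; returns (low, high) with 0 < low < high < 1
    parts = thresholds.split(',')
    if len(parts) != 2:
        raise ValueError('Invalid thresholds')
    low, high = float(parts[0]), float(parts[1])
    if not (0 < low < high < 1):
        raise ValueError('Invalid thresholds')
    return low, high

print(parse_thresholds('0.25,0.75'))  # (0.25, 0.75)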
Example #3
    def load_blocks(self):
        # load blocks file and filter it by CpG and bg length

        df = load_blocks_file(self.args.blocks_path)
        orig_nr_blocks = df.shape[0]

        # filter by lenCpG
        df['lenCpG'] = df['endCpG'] - df['startCpG']
        df = df[df['lenCpG'] >= self.args.min_cpg]
        df = df[df['lenCpG'] <= self.args.max_cpg]

        # filter by len in bp
        df['len'] = df['end'] - df['start']
        df = df[df['len'] >= self.args.min_bp]
        df = df[df['len'] <= self.args.max_bp]

        df.reset_index(drop=True, inplace=True)

        # print stats
        if self.verbose:
            eprint(f'loaded {orig_nr_blocks:,} blocks')
            if df.shape[0] != orig_nr_blocks:
                eprint(f'dropped to {df.shape[0]:,}')

        return df
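The length filters in load_blocks are ordinary pandas operations on the DataFrame returned by load_blocks_file. A minimal standalone sketch, assuming the start/end/startCpG/endCpG columns seen above; filter_blocks and its default limits are hypothetical:

import pandas as pd

def filter_blocks(df, min_cpg=1, max_cpg=10_000, min_bp=1, max_bp=10_000_000):
    # drop blocks whose CpG count or bp length falls outside the requested range
    df = df.copy()
    df['lenCpG'] = df['endCpG'] - df['startCpG']
    df['len'] = df['end'] - df['start']
    keep = df['lenCpG'].between(min_cpg, max_cpg) & df['len'].between(min_bp, max_bp)
    return df[keep].reset_index(drop=True)

blocks = pd.DataFrame({'start': [100, 500], 'end': [200, 5000],
                       'startCpG': [1, 10], 'endCpG': [5, 60]})
print(filter_blocks(blocks, min_cpg=10))  # keeps only the second block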
Example #4
    def break_to_chunks(self):
        """ Break range of sites to chunks of size 'step',
            while keeping chromosomes separated """
        # print a warning in case chunk size is too small
        step = self.args.chunk_size
        if step < self.args.max_cpg:
            msg = '[wt segment] WARNING: chunk_size is small compared to max_cpg and/or max_bp.\n' \
                  '                      It may cause wt segment to fail. It\'s best to set\n' \
                  '                      chunk_size > min{max_cpg, max_bp/2}'
            eprint(msg)

        if self.args.bed_file:
            df = load_blocks_file(self.args.bed_file)[['startCpG',
                                                       'endCpG']].dropna()
            # make sure bed file has no overlaps or duplicated regions
            is_nice, msg = is_block_file_nice(df)
            if not is_nice:
                msg = '[wt segment] ERROR: invalid bed file.\n' \
                      f'                    {msg}\n' \
                      f'                    Try: sort -k1,1 -k2,2n {self.args.bed_file} | ' \
                      'bedtools merge -i - | wgbstools convert --drop_empty -p -L -'
                eprint(msg)
                raise IllegalArgumentError('Invalid bed file')
            if df.shape[0] > 2 * 1e4:
                msg = '[wt segment] WARNING: bed file contains many regions.\n' \
                      '                      Segmentation will take a long time.\n' \
                      f'                      Consider running w/o -L flag and intersect the results\n'
                eprint(msg)

        else:  # No bed file provided
            gr = GenomicRegion(self.args)
            # whole genome - make a dummy "bed file" of the full chromosomes
            if gr.is_whole():
                cf = self.genome.get_chrom_cpg_size_table()
                cf['endCpG'] = np.cumsum(cf['size']) + 1
                cf['startCpG'] = cf['endCpG'] - cf['size']
                df = cf[['startCpG', 'endCpG']]
            # one region
            else:
                df = pd.DataFrame(columns=['startCpG', 'endCpG'],
                                  data=[gr.sites])

        # build a DataFrame of chunks, with a "tag"/label field,
        # so we know which chunks to merge later on.
        rf = pd.DataFrame()
        tags = []
        starts = []
        ends = []
        for ind, row in df.iterrows():
            start, end = row
            bords = list(range(start, end, step)) + [end]
            tags += [f'{start}-{end}'] * (len(bords) - 1)
            starts += bords[:-1]
            ends += bords[1:]
        return tags, starts, ends
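The chunking loop at the end of break_to_chunks does not depend on any genomics code. A minimal sketch of the same range-splitting logic, under a hypothetical split_to_chunks name:

def split_to_chunks(regions, step):
    # regions: iterable of (start, end) pairs; returns parallel lists of
    # tags, chunk starts and chunk ends, where each chunk spans at most `step` sites
    tags, starts, ends = [], [], []
    for start, end in regions:
        bords = list(range(start, end, step)) + [end]
        tags += [f'{start}-{end}'] * (len(bords) - 1)
        starts += bords[:-1]
        ends += bords[1:]
    return tags, starts, ends

print(split_to_chunks([(0, 25)], 10))
# (['0-25', '0-25', '0-25'], [0, 10, 20], [10, 20, 25])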
Example #5
def beta2table_generator(betas,
                         blocks,
                         groups_file,
                         min_cov,
                         threads,
                         chunk_size=None,
                         verbose=False):
    validate_single_file(blocks)
    gf = groups_load_wrap(groups_file, betas)
    blocks_df = load_blocks_file(blocks)
    if chunk_size is None:
        chunk_size = blocks_df.shape[0]
    for start in range(0, blocks_df.shape[0], chunk_size):
        subset_blocks = blocks_df.iloc[start:start + chunk_size].copy()
        yield get_table(subset_blocks, gf, min_cov, threads, verbose)
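The generator yields one table per chunk of blocks, so a caller usually concatenates the chunks back together. A usage sketch with placeholder file paths, assuming each yielded table is a pandas DataFrame:

import pandas as pd

tables = beta2table_generator(betas=['s1.beta', 's2.beta'],
                              blocks='blocks.bed.gz',
                              groups_file='groups.csv',
                              min_cov=5,
                              threads=4,
                              chunk_size=50_000)
full_table = pd.concat(tables, ignore_index=True)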
Example #6
    def __init__(self, args):
        eprint('mixing...')
        self.args = args
        self.gr = GenomicRegion(args)
        self.pats = args.pat_files
        self.dest_cov = args.cov
        self.bed = load_blocks_file(args.bed_file) if args.bed_file else None
        self.stats = pd.DataFrame(
            index=[splitextgz(op.basename(f))[0] for f in self.pats])
        self.nr_pats = len(self.pats)
        self.labels = self.validate_labels(args.labels)

        self.dest_rates = self.validate_rates(args.rates)
        self.covs = self.read_covs()
        self.adj_rates = self.adjust_rates()

        self.prefix = self.generate_prefix(args.out_dir, args.prefix)
Example #7
    def read_covs(self):
        covs = []
        for pat in self.pats:
            suff = '.lbeta' if self.args.lbeta else '.beta'
            beta = pat.replace('.pat.gz', suff)
            if not op.isfile(beta):
                eprint('No {} file compatible with {} was found. Generating it...'.
                       format(suff, pat))
                pat2beta(pat, op.dirname(pat), args=self.args, force=True)
            if self.bed is not None:
                cov = beta_cov_by_bed(beta, self.bed)
            elif self.args.bed_cov:  # TODO: messy; better to read coverage from the pat file
                cov = beta_cov_by_bed(beta,
                                      load_blocks_file(self.args.bed_cov))
            else:
                cov = beta_cov(beta, self.gr.sites, print_res=True)
            covs.append(cov)
        self.add_stats_col('OrigCov', covs)
        return covs
Example #8
File: cview.py  Project: nloyfer/wgbs_tools
def view_bed(pat, args):
    # assume columns 4-5 of args.bed_file are startCpG, endCpG:
    bpath = args.bed_file

    # validate blocks file. If it's long, and starts with "chr1", use gunzip instead of tabix.
    df = load_blocks_file(bpath, nrows=1e6)
    if df.shape[0] == 1e6 and df.iloc[0, 0] in ('1', 'chr1'):
        tabix_cmd = f'gunzip -c {pat} '
    else:
        # extended blocks:
        tabix_cmd = 'gunzip -c' if bpath.endswith('.gz') else 'cat'
        tabix_cmd += f' {bpath} | {cview_extend_blocks_script} | tabix -R - {pat} '

    view_flags = set_view_flags(args)
    cmd = tabix_cmd + f' | {cview_tool} {view_flags} --blocks_path {bpath}'
    if args.sub_sample is not None:  # sub-sample reads
        validate_local_exe(pat_sampler)
        cmd += f' | {pat_sampler} {args.sub_sample} '
    cmd += f' | sort -k2,2n -k3,3 | {collapse_pat_script} - '
    if args.out_path is not None:
        cmd += f' > {args.out_path}'
    subprocess_wrap_sigpipe(cmd)
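view_bed only assembles a shell pipeline string and hands it off for execution. A rough sketch of running such a pipeline with a plain subprocess call, used here as a stand-in for subprocess_wrap_sigpipe and with a placeholder pat file:

import subprocess

def run_pipeline(cmd):
    # run a shell pipeline; check=True raises CalledProcessError if the last stage fails
    subprocess.run(cmd, shell=True, executable='/bin/bash', check=True)

run_pipeline('gunzip -c sample.pat.gz | sort -k2,2n -k3,3 | head')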
Example #9
def betas2table(betas, blocks, groups_file, min_cov, threads=8, verbose=False):
    validate_single_file(blocks)
    gf = groups_load_wrap(groups_file, betas)
    blocks_df = load_blocks_file(blocks)
    return get_table(blocks_df, gf, min_cov, threads, verbose)