def pat2beta(pat_path, out_dir, args, force=True):
    validate_single_file(pat_path)

    if pat_path.endswith('.pat.gz'):
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')

    suff = '.lbeta' if args.lbeta else '.beta'
    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + suff)
    if not delete_or_skip(out_beta, force):
        return

    if args.threads > 1 and pat_path.endswith('.pat.gz') and op.isfile(pat_path + '.csi'):
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        cmd += f' {pat_path} | {pat2beta_tool} {1} {nr_sites + 1}'
        x = subprocess.check_output(cmd, shell=True).decode()
        arr = np.fromstring(x, dtype=int, sep=' ').reshape((-1, 2))

    trim_to_uint8(arr, args.lbeta).tofile(out_beta)
    return out_beta
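# Every function in this section gates its output through delete_or_skip,
# whose implementation is not shown here. A minimal sketch of the assumed
# contract (name suffixed and messages illustrative, not the actual helper):
# return True to proceed (removing a stale output when force is set),
# False to skip this file.

import os
import os.path as op

def delete_or_skip_sketch(out_path, force):
    # nothing to guard if the output does not exist yet (or is stdout)
    if not (isinstance(out_path, str) and op.isfile(out_path)):
        return True
    if force:
        os.remove(out_path)  # force: overwrite the existing output
        return True
    eprint(f'File {out_path} already exists. Skipping it. Use -f to overwrite.')
    return False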
def convert_bed_file(args):
    """
    bed file should be of the format (tab-separated):
    Input:
    chr    start    end    [...]
    Output:
    chr    start    end    startCpG    endCpG    [...]
    """
    out_path = sys.stdout if args.out_path is None else args.out_path
    if not delete_or_skip(out_path, args.force):
        return
    # add CpG columns
    bed_file = args.bed_file
    # TODO: support stdin for -L in all wgbstools features, and add it to the help message
    if bed_file == '-':
        bed_file = sys.stdin
    add_anno = (not args.parsable) and (not args.no_anno)

    if not check_executable('bedtools', verbose=False):
        # eprint('continue with a slower implementation')
        r = add_cpgs_to_bed(bed_file=bed_file,
                            genome=args.genome,
                            drop_empty=args.drop_empty,
                            threads=args.threads,
                            add_anno=add_anno)
    else:
        r = bedtools_conversion(bed_file, args.genome,
                                args.drop_empty, add_anno, args.debug)
    r.to_csv(out_path, sep='\t', header=None, index=None, na_rep='NA')
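# convert_bed_file falls back to a slower pure-Python path when bedtools is
# missing. A plausible sketch of check_executable, assuming it is a thin
# wrapper around shutil.which (the real helper may differ):

import shutil

def check_executable_sketch(cmd, verbose=True):
    if shutil.which(cmd) is None:
        if verbose:
            eprint(f'executable "{cmd}" not found in PATH')
        return False
    return True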
def main():
    """
    Merge files.
    Accumulate all reads / observations from multiple (>=2) input files,
    and output a single file of the same format.
    Supported formats: pat.gz, beta
    """
    args = parse_args()

    # validate input files
    input_files = args.input_files

    # construct output path
    out_path = args.prefix + splitextgz(args.input_files[0])[1]
    if op.realpath(out_path) in [op.realpath(p) for p in args.input_files]:
        eprint('[wt merge] Error: output path is identical '
               f'to one of the input files: {out_path}')
        return
    if not delete_or_skip(out_path, args.force):
        return

    files_type = splitextgz(input_files[0])[1][1:]

    if files_type in ('beta', 'bin'):
        merge_betas(input_files, out_path)
    elif files_type == 'pat.gz':
        MergePats(input_files, args.prefix + '.pat.gz', args.labels, args).merge_pats()
    else:
        print('Unknown input format:', input_files[0])
        return
def bed2betas(args):
    # merge with the reference CpG bed file,
    # so the #lines in file will include all 28217448 sites (with NaN as 0)
    region = 'chr1:10469-876225' if args.debug else None
    nrows = 10000 if args.debug else None
    try:
        rf = None  # Reference dictionary
        for bed in args.bed_paths:
            eprint(f'[wt bed] Converting {op.basename(bed)}...')
            # Check if bed should be skipped
            outpath = op.join(args.outdir, splitextgz(op.basename(bed))[0] + '.beta')
            if not delete_or_skip(outpath, args.force):
                continue
            # Load dict (at most once) and bed
            if rf is None:
                rf = load_dict_section(region, args.genome)
            df = load_bed(bed, nrows, args.add_one)
            # todo: implement in C++.
            # merge dict with bed, then dump
            res = rf.merge(df, how='left', on=['chr', 'start']).fillna(0)
            trim_to_uint8(np.array(res[['meth', 'total']])).tofile(outpath)
    except pd.errors.ParserError as e:
        eprint(f'[wt bed] Invalid input file.\n{e}')
        return
def main():
    """
    Merge files.
    Accumulate all reads / observations from multiple (>=2) input files,
    and output a single file of the same format.
    Supported formats: pat.gz, beta
    """
    args = parse_args()

    # validate input files
    input_files = args.input_files
    validate_files_list(input_files, min_len=2)

    # construct output path
    out_path = args.prefix + splitextgz(args.input_files[0])[1]
    if not delete_or_skip(out_path, args.force):
        return

    files_type = splitextgz(input_files[0])[1][1:]

    if files_type in ('beta', 'bin'):
        merge_betas(input_files, out_path)
    elif files_type == 'pat.gz':
        MergePats(input_files, args.prefix + '.pat', args.labels, args).merge_pats()
    elif files_type == 'unq.gz':
        merge_unqs()
    else:
        print('Unknown input format:', input_files[0])
        return
def bed2betas(args):
    # merge with the reference CpG bed file,
    # so the #lines in file will include all 28217448 sites (with NaN as 0)
    nrows = 100000 if args.debug else None
    try:
        rf = None  # Reference dictionary
        for bed in args.bed_paths:
            eprint('Converting {}...'.format(op.basename(bed)))
            # Check if bed should be skipped:
            outpath = op.join(args.outdir, splitextgz(op.basename(bed))[0]) + '.beta'
            if not delete_or_skip(outpath, args.force):
                continue
            # Load dict (at most once) and bed
            if rf is None:
                rf = load_dict(nrows=nrows, genome_name=args.genome)
            df = load_bed(bed, nrows, args.genome == 'mm9')
            # merge dict with bed, then dump
            res = rf.merge(df, how='left', on=['chr', 'start']).fillna(0)
            trim_to_uint8(np.array(res[['meth', 'total']])).tofile(outpath)
    except pd.errors.ParserError as e:
        eprint('Invalid input file.\n{}'.format(e))
        return
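# Both bed2betas variants finish with trim_to_uint8, which packs the
# (meth, total) count pairs into the fixed-width binary beta format. A rough
# sketch of the idea, assuming 8-bit storage with ratio-preserving saturation
# (the actual helper also supports a 16-bit .lbeta flavor):

import numpy as np

def trim_to_uint8_sketch(data):
    max_val = 255  # uint8 ceiling
    big = data[:, 1] > max_val
    # rescale saturated rows so meth/total is preserved at total == 255
    data[big, 0] = np.round(data[big, 0] / data[big, 1] * max_val)
    data[big, 1] = max_val
    return data.astype(np.uint8)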
def beta_to_bed(beta_path, gr, bed_file, min_cov, mean, keep_na, force, opath):
    if not delete_or_skip(opath, force):
        return
    cmd = beta2bed_build_cmd(beta_path, gr, bed_file, min_cov, mean, keep_na)
    if opath is not None:
        if opath.endswith('.gz'):
            cmd += ' | gzip -c '
        cmd += f' > {opath}'
    subprocess_wrap_sigpipe(cmd)
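# beta_to_bed shells out through subprocess_wrap_sigpipe, not shown here.
# A hypothetical sketch of such a wrapper: run a shell pipeline but treat
# SIGPIPE (raised, e.g., when a downstream consumer exits early) as success
# rather than failure.

import signal
import subprocess

def subprocess_wrap_sigpipe_sketch(cmd):
    p = subprocess.run(cmd, shell=True)
    # -SIGPIPE: killed by the signal; 128 + SIGPIPE: the shell reporting it
    if p.returncode not in (0, -signal.SIGPIPE, 128 + signal.SIGPIPE):
        raise subprocess.CalledProcessError(p.returncode, cmd)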
def convert_site_file(args):
    """
    site file should be of the format (tab-separated):
    Input:
    startCpG    [endCpG]
    Output:
    chr    start    end    startCpG    [endCpG]
    """
    out_path = sys.stdout if args.out_path is None else args.out_path
    if not delete_or_skip(out_path, args.force):
        return
    # add loci columns
    add_bed_to_cpgs(args.site_file, args.genome, args.out_path)
def run(self):
    # if index already exists delete it or skip it
    if not delete_or_skip(self.in_file + self.ftype.ind_suff, self.force):
        return

    # if file is gzipped instead of bgzipped, uncompress it
    if self.is_file_gzipped():
        sp.check_call(['gunzip', self.in_file])
        self.in_file = self.in_file[:-3]

    if not self.in_file.endswith('.gz'):
        self.bgzip()

    self.index_bgzipped_file()
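# The indexer has to distinguish plain gzip from bgzip (BGZF) output, since
# only the latter is indexable by tabix/samtools. A sketch of one way to
# check, assuming the BGZF convention: BGZF sets the FEXTRA flag (0x04) in
# the gzip header and carries a 'BC' extra subfield, while plain gzip
# normally leaves FEXTRA unset. Name and details are illustrative.

def is_plain_gzip_sketch(path):
    with open(path, 'rb') as f:
        header = f.read(4)
    is_gzip = header[:2] == b'\x1f\x8b'
    has_extra = len(header) > 3 and bool(header[3] & 0x04)
    return is_gzip and not has_extra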
def __init__(self, args):
    self.args = args
    self.out_path = args.out_path
    self.debug = args.debug

    if not delete_or_skip(self.out_path, self.args.force):
        return

    # load bed file:
    self.df = load_bed(args.bed_path, 100000 if self.debug else None)
    self.genome = GenomeRefPaths(args.genome)
    # load chromosomes sizes (in CpGs):
    self.cf = self.genome.get_chrom_cpg_size_table()
    self.cf['size'] = np.cumsum(self.cf['size'])
    self.proc_bed()
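# The cumulative sum over the chromosome CpG-size table turns per-chromosome
# counts into global CpG boundaries, so a genome-wide site index can be
# mapped back to its chromosome with a binary search. An illustrative use
# (the 'chr'/'size' column names are assumed from the snippet above):

import numpy as np

def site_to_chrom_sketch(site, cf):
    # cf['size'] holds cumulative CpG counts; site indices are 1-based
    idx = int(np.searchsorted(cf['size'].values, site))
    return cf['chr'].iloc[idx]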
def __init__(self, args):
    self.args = args
    self.ref_path = args.genome_ref
    self.force = args.force
    self.name = args.name
    self.out_dir = self.make_output_dir()

    # validate input files
    validate_single_file(self.ref_path, '.fa')

    # abort if file exists and --force was not specified
    eprint('Setting up genome reference files in {}'.format(self.out_dir))
    if not delete_or_skip(op.join(self.out_dir, 'CpG.bed.gz'), self.force):
        return

    self.fai_df = self.load_fai()
def run_beta_to_bed(self, beta_path):
    eprint('{}'.format(op.basename(beta_path)))
    prefix = self.set_prefix(beta_path)
    out_bed = prefix + '.bed'

    if not delete_or_skip(out_bed, self.args.force):
        return

    barr = self.load_beta(beta_path)

    # paste dict with beta, then dump
    self.ref_dict['meth'] = barr[:, 0]
    self.ref_dict['total'] = barr[:, 1]
    self.ref_dict[self.ref_dict['total'] > 0].to_csv(out_bed, sep='\t',
                                                     header=None, index=None)
    del self.ref_dict['meth'], self.ref_dict['total']
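# load_beta is not shown in this section; beta files are flat binary arrays
# of unsigned (meth, total) pairs, one pair per CpG site. A minimal sketch,
# assuming the 8-bit .beta flavor:

import numpy as np

def load_beta_sketch(beta_path):
    return np.fromfile(beta_path, dtype=np.uint8).reshape((-1, 2))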
def run(self):
    # if index already exists delete it or skip it
    if not delete_or_skip(self.in_file + self.ftype.ind_suff, self.args.force):
        return

    if self.in_file.endswith('.gz'):
        self.in_file = op.splitext(self.in_file)[0]

        # try indexing it:
        if not self.index_bgzipped_file():
            return  # success
        # couldn't index because the file is gzipped instead of bgzipped
        subprocess.check_call(['gunzip', self.in_file + '.gz'])

    self.bgzip()
    self.index_bgzipped_file()
def single_mix(self, rep):
    prefix_i = self.prefix + '_{}.pat'.format(rep + 1)
    if not delete_or_skip(prefix_i + '.gz', self.args.force):
        return

    view_flags = []
    for i in range(self.nr_pats):
        v = ' --awk '
        if self.args.strict:
            v += ' --strict'
        if self.args.bed_file is not None:
            v += ' -L {}'.format(self.args.bed_file)
        elif self.gr.sites is not None:
            v += ' -s {}-{}'.format(*self.gr.sites)
        v += ' --sub_sample {}'.format(self.adj_rates[i])
        view_flags.append(v)
    print('prefix:', prefix_i)

    m = MergePats(self.pats, prefix_i, self.labels, args=self.args)
    m.fast_merge_pats(view_flags=view_flags)
def main():
    """
    Run the WGBS pipeline to generate pat & beta files out of an input bam file
    """
    parser = add_args_snp_splitt()
    args = parse_args_snp_split(parser)

    # validate output dir:
    if not op.isdir(args.out_dir):
        raise IllegalArgumentError(f'Invalid output dir: {args.out_dir}')

    validate_local_exe(allele_split_tool)
    for bam in [args.bam]:
        if not validate_bam(bam):
            eprint(f'[wt bam2pat] Skipping {bam}')
            continue

        pat = op.join(args.out_dir, op.basename(bam)[:-4] + PAT_SUFF)
        if not delete_or_skip(pat, args.force):
            continue

        SNPSplit(args, bam)
def pat2beta(pat_path, out_dir, args, force=True):
    validate_single_file(pat_path)

    if pat_path.endswith('.pat.gz'):
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError('Invalid pat suffix: {}'.format(pat_path))

    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + '.beta')
    if not delete_or_skip(out_beta, force):
        return

    nr_sites = GenomeRefPaths(args.genome).nr_sites

    if args.threads > 1 and pat_path.endswith('.pat.gz') and op.isfile(pat_path + '.csi'):
        return mult_pat2beta(pat_path, out_beta, nr_sites, args)

    cmd += ' {} | {} {} {}'.format(pat_path, PAT2BETA_TOOL, out_beta, nr_sites)
    subprocess.check_call(cmd, shell=True)
    return out_beta
def main():
    """
    Run the WGBS pipeline to generate pat & beta files out of an input bam file
    """
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser = add_args(parser)
    args = parse_args(parser)

    # validate output dir:
    if not op.isdir(args.out_dir):
        raise IllegalArgumentError(f'Invalid output dir: {args.out_dir}')

    validate_local_exe(match_maker_tool)
    validate_local_exe(patter_tool)
    for bam in args.bam:
        if not validate_bam(bam):
            eprint(f'[wt bam2pat] Skipping {bam}')
            continue

        pat = op.join(args.out_dir, op.basename(bam)[:-4] + PAT_SUFF)
        if not delete_or_skip(pat, args.force):
            continue

        Bam2Pat(args, bam)
def single_mix(self, rep):
    mix_i = self.prefix + f'_{rep + 1}.pat.gz'
    if not delete_or_skip(mix_i, self.args.force):
        return

    view_flags = []
    for i in range(self.nr_pats):
        v = ' '
        if self.args.strict:
            v += ' --strict'
        if self.args.strip:
            v += ' --strip'
        if self.args.min_len:
            v += f' --min_len {self.args.min_len}'
        if self.args.bed_file is not None:
            v += ' -L {}'.format(self.args.bed_file)
        elif not self.gr.is_whole():
            v += ' -s {}-{}'.format(*self.gr.sites)
        v += ' --sub_sample {}'.format(self.adj_rates[i])
        view_flags.append(v)
    eprint('mix:', mix_i)

    m = MergePats(self.pats, mix_i, self.labels, args=self.args)
    m.fast_merge_pats(view_flags=view_flags)
def run_beta_to_bw(self, beta_path):
    eprint('{}'.format(op.basename(beta_path)))
    prefix = self.set_prefix(beta_path)
    out_bigwig = prefix + BW_EXT
    out_bed_graph = prefix + BG_EXT
    cov_bigwig = prefix + COV_BW_EXT
    cov_bed_graph = prefix + COV_BG_EXT

    # Check if the current file should be skipped:
    if not delete_or_skip(out_bigwig, self.args.force):
        return

    # load beta file
    barr = self.load_beta(beta_path)

    # dump coverage:
    if self.args.dump_cov:
        eprint('Dumping cov...')
        self.ref_dict['cov'] = barr[:, 1]
        sort_and_dump_df(self.ref_dict[self.ref_dict['cov'] >= self.args.min_cov],
                         cov_bed_graph)
        del self.ref_dict['cov']
        # convert bedGraph to bigWig:
        self.bed_graph_to_bigwig(cov_bed_graph, cov_bigwig)

    # dump beta values to bedGraph
    eprint('Dumping beta vals...')
    self.ref_dict['beta'] = np.round(beta2vec(barr, na=-1), 3)
    if self.args.remove_nan:
        self.ref_dict = self.ref_dict[self.ref_dict['beta'] != -1]
    sort_and_dump_df(self.ref_dict, out_bed_graph)
    del self.ref_dict['beta']

    # convert bedGraphs to bigWigs:
    self.bed_graph_to_bigwig(out_bed_graph, out_bigwig)
def run_beta_to_bw(self, beta_path):
    self.name = op.basename(beta_path)
    prefix = op.join(self.outdir, op.splitext(self.name)[0])
    out_bigwig = prefix + BW_EXT
    out_bed_graph = prefix + BG_EXT

    # Check if the current file should be skipped:
    if not delete_or_skip(out_bigwig, self.args.force):
        return

    # convert beta to bed:
    b2bw_log(f'[{self.name}] Dumping bed...')
    beta_to_bed(beta_path=beta_path,
                gr=self.gr,
                bed_file=self.args.bed_file,
                min_cov=self.args.min_cov,
                mean=True,
                keep_na=self.args.keep_na,
                force=True,
                opath=out_bed_graph)

    # convert bedGraphs to bigWigs:
    self.bed_graph_to_bigwig(out_bed_graph, out_bigwig)
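# bed_graph_to_bigwig is shared by both run_beta_to_bw variants above. A
# sketch assuming it delegates to the UCSC bedGraphToBigWig binary with a
# chrom.sizes file from the genome reference (name, argument, and cleanup
# are illustrative, not the actual method):

import os
import subprocess

def bed_graph_to_bigwig_sketch(bed_graph, bigwig, chrom_sizes):
    # usage: bedGraphToBigWig in.bedGraph chrom.sizes out.bw
    subprocess.check_call(['bedGraphToBigWig', bed_graph, chrom_sizes, bigwig])
    os.remove(bed_graph)  # the bedGraph is only an intermediate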