def main(): """ Visualize wgbs files Possible inputs: - pat.gz file[s] - beta files[s] """ parser = parse_args() args = parser.parse_args() if args.uxm and not (0.5 <= args.uxm <= 1): parser.error("uxm value must be between 0.5 and 1") if args.sub_sample is not None and not 1 >= args.sub_sample >= 0: parser.error('[wt vis] sub-sampling rate must be within [0.0, 1.0]') # print title if args.title: print(args.title) first_file = args.input_files[0] if first_file.endswith(('.beta', '.bin')): beta_vis_main(args) elif first_file.endswith('.pat.gz'): pat_vis_main(args) else: eprint('[wt vis] Unsupported file type:', first_file)
def load_blocks(self):
    # load blocks file and filter it by CpG and bp length
    df = load_blocks_file(self.args.blocks_path)
    orig_nr_blocks = df.shape[0]

    # filter by lenCpG
    df['lenCpG'] = df['endCpG'] - df['startCpG']
    df = df[df['lenCpG'] >= self.args.min_cpg]
    df = df[df['lenCpG'] <= self.args.max_cpg]

    # filter by length in bp
    df['len'] = df['end'] - df['start']
    df = df[df['len'] >= self.args.min_bp]
    df = df[df['len'] <= self.args.max_bp]

    df.reset_index(drop=True, inplace=True)

    # print stats
    if self.verbose:
        eprint(f'loaded {orig_nr_blocks:,} blocks')
        if df.shape[0] != orig_nr_blocks:
            eprint(f'dropped to {df.shape[0]:,}')
    return df

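# A minimal sketch of the length-based filtering used in load_blocks above,
# on a made-up blocks DataFrame. Column names follow the function; the
# thresholds are hypothetical.
import pandas as pd

blocks = pd.DataFrame({'start': [100, 500, 900],
                       'end': [220, 520, 1800],
                       'startCpG': [1, 10, 40],
                       'endCpG': [8, 12, 90]})
min_cpg, max_cpg = 3, 50
lencpg = blocks['endCpG'] - blocks['startCpG']
filtered = blocks[(lencpg >= min_cpg) & (lencpg <= max_cpg)].reset_index(drop=True)
print(filtered)  # keeps the two blocks spanning 3-50 CpGs
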
def mbias_merge(self, name, pat_parts):
    if not self.args.mbias:
        return
    try:
        mdir = op.join(self.out_dir, name) + '.mbias'
        if not op.isdir(mdir):
            os.mkdir(mdir)
        tpaths = []
        for x in ['OB', 'OT']:
            mbias_parts = [p.replace('.pat.gz', f'.mb.{x}.txt')
                           for p in pat_parts if p]
            mbias_parts = [pd.read_csv(m, sep='\t') for m in mbias_parts]
            df = mbias_parts[0]
            for m in mbias_parts[1:]:
                df += m
            cpath = op.join(mdir, name) + f'.mbias.{x}.txt'
            df.to_csv(cpath, sep='\t', index=None)
            tpaths.append(cpath)
        from mbias_plot import plot_mbias
        plot_mbias(tpaths, mdir)
    except Exception as e:
        eprint('[wt bam2pat] failed in mbias')
        eprint(e)

def dump_result(self, df):
    if df.empty:
        eprint('Empty blocks array')
        return

    # sort by startCpG and filter by CpGs
    nr_blocks = df.shape[0]
    df.sort_values(by=['startCpG'], inplace=True)
    df = df[df.endCpG - df.startCpG > self.args.min_cpg - 1].reset_index(drop=True)

    # verbose
    nr_blocks_filt = df.shape[0]
    nr_dropped = nr_blocks - nr_blocks_filt
    eprint(f'[wt segment] found {nr_blocks_filt:,} blocks\n'
           f' (dropped {nr_dropped:,} short blocks)')

    # add genomic loci and dump/print
    temp_path = next(tempfile._get_candidate_names())
    try:
        df.to_csv(temp_path, sep='\t', header=None, index=None)
        add_bed_to_cpgs(temp_path, self.genome.genome, self.args.out_path)
    finally:
        if op.isfile(temp_path):
            os.remove(temp_path)

def compare_all_paires(args):
    betas = args.betas
    sites = GenomicRegion(args).sites
    tables = [load_beta_data(b, sites) for b in betas]
    names = [op.splitext(op.basename(b))[0] for b in betas]

    # break long names into multiple lines
    nnames = []
    k = 20
    for n in names:
        lst = [n[i:i + k] for i in range(0, len(n), k)]
        nnames.append('\n'.join(lst))

    N = len(tables)
    fig, axs = plt.subplots(N, N)
    for i in range(N):
        for j in range(i + 1):
            comp2(tables[i], tables[j], args.min_cov, axs[i, j])
        axs[i, 0].set_ylabel(nnames[i], fontsize=8)
    for j in range(N):
        axs[0, j].set_title(nnames[j], fontsize=8)
    for ax in axs.flat:
        ax.label_outer()
    fig.tight_layout()

    if args.outpath is not None:
        plt.savefig(args.outpath)
        eprint(f'[wt cmp] dumped figure to {args.outpath}')
    if args.show or args.outpath is None:
        plt.show()

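# A tiny demonstration of the name-wrapping trick in compare_all_paires:
# break a long sample name into fixed-width chunks so it fits as a subplot
# label. The name below is made up.
name = 'Lung_Alveolar_Epithelial_Cells_rep1'
k = 20
print('\n'.join(name[i:i + k] for i in range(0, len(name), k)))
# Lung_Alveolar_Epithe
# lial_Cells_rep1
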
def read_blocks_and_test(tabixed_bed_file, cur_region, pat_file, is_strict,
                         min_len, verbose=False):
    tabix_cmd = f'tabix {tabixed_bed_file} {cur_region}'
    cur_blocks_lines = subprocess.check_output(
        tabix_cmd, shell=True).decode().split('\n')
    p_val_list = []
    for line in cur_blocks_lines:
        if not line.strip():
            continue
        tokens = line.split('\t')
        sites = (int(tokens[3]), int(tokens[4]))
        p_val = test_single_region(pat_file, tokens[0], sites, is_strict,
                                   min_len, should_print=False)
        p_val = p_val.astype(np.float32)
        p_val_list.append((line, p_val))
    if verbose:
        eprint(f'[wt bimodal] finished processing {cur_region}')
    return p_val_list

def apply_filter_wrapper(args, blocks_bins, finds, beta_path, df):
    try:
        # load beta file
        data = load_beta_data(beta_path)

        # reduce to blocks
        blocks_bins[-1] -= 1
        reduced_data = np.add.reduceat(data, blocks_bins)[finds][:-1]

        # dump to file
        out_name = splitext(splitext(basename(args.blocks_file))[0])[0]
        out_name = splitext(basename(beta_path))[0] + '_' + out_name + '.bin'
        out_name = out_name.replace('_genome', '')
        out_name = op.join(args.out_dir, out_name)
        trim_to_uint8(reduced_data).tofile(out_name)
        print(out_name)

        if args.bedGraph:
            with np.errstate(divide='ignore', invalid='ignore'):
                beta_vals = reduced_data[:, 0] / reduced_data[:, 1]
            eprint(beta_vals.shape, df.shape)
            # beta_vals[reduced_data[:, 1] == 0] = np.nan
            df['beta'] = beta_vals
            df.to_csv(out_name.replace('.bin', '.bedGraph'), sep='\t',
                      index=None, header=None, na_rep=-1, float_format='%.2f')
    except Exception as e:
        print('Failed with beta', beta_path)
        print('Exception:', e)

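# A minimal sketch (with made-up numbers) of the np.add.reduceat call in
# apply_filter_wrapper: collapse per-CpG (meth, total) rows into per-block
# sums, where blocks_bins holds the start index of each block.
import numpy as np

data = np.array([[1, 2], [0, 3], [2, 2], [1, 1], [0, 4]])  # 5 CpGs x (meth, total)
blocks_bins = np.array([0, 2, 4])  # blocks cover CpGs [0,2), [2,4), [4,5)
print(np.add.reduceat(data, blocks_bins))
# [[1 5]
#  [3 3]
#  [0 4]]
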
def bed2betas(args):
    # merge with the reference CpG bed file, so the number of lines
    # in the file will include all 28217448 sites (with NaN as 0)
    nrows = 100000 if args.debug else None
    try:
        rf = None  # reference dictionary
        for bed in args.bed_paths:
            eprint('Converting {}...'.format(op.basename(bed)))

            # check if bed should be skipped
            outpath = op.join(args.outdir, splitextgz(op.basename(bed))[0]) + '.beta'
            if not delete_or_skip(outpath, args.force):
                continue

            # load dict (at most once) and bed
            if rf is None:
                rf = load_dict(nrows=nrows, genome_name=args.genome)
            df = load_bed(bed, nrows, args.genome == 'mm9')

            # merge dict with bed, then dump
            res = rf.merge(df, how='left', on=['chr', 'start']).fillna(0)
            trim_to_uint8(np.array(res[['meth', 'total']])).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint('Invalid input file.\n{}'.format(e))
        return

def load_bins(self):
    if self.verbose:
        eprint('loading bins...')
    nr_cols = 3 if self.args.uxm else 2
    binsize = self.gf['binsize'][0] / self.orig_nr_blocks
    binsize /= nr_cols
    if binsize != int(binsize):
        raise IllegalArgumentError('Error: bin file size does not match blocks number')
    dtype = np.uint8 if binsize == 1 else np.uint16
    dfU = pd.DataFrame()
    dfM = pd.DataFrame()
    if self.hypo:
        dfU = np.zeros((self.nr_blocks, self.gf_nodup.shape[0]), dtype=float)
    if self.hyper:
        dfM = np.zeros((self.nr_blocks, self.gf_nodup.shape[0]), dtype=float)
    from tqdm import tqdm  # todo: only if installed
    for ind, row in tqdm(self.gf_nodup.iterrows(), total=self.gf_nodup.shape[0]):
        data = np.fromfile(row['full_path'], dtype).reshape((-1, nr_cols))[self.keepinds, :]
        if self.hypo:
            dfU[:, ind] = table2vec(data, 'U', self.args.min_cov)
        if self.hyper:
            dfM[:, ind] = table2vec(data, 'M', self.args.min_cov)
    return self.array2df(dfU), self.array2df(dfM)

def __init__(self, args):
    self.args = args
    self.dfU = pd.DataFrame()
    self.dfM = pd.DataFrame()
    self.blocks = pd.DataFrame()
    self.nr_blocks = 0
    self.orig_nr_blocks = 0
    self.keepinds = None
    self.groups = None
    self.verbose = args.verbose
    self.hyper, self.hypo = self.set_hypo_hyper(args.hyper, args.hypo)
    self.validate_args()

    # validate output dir
    if not op.isdir(args.out_dir):
        os.mkdir(args.out_dir)

    # load groups
    self.gf = load_groups_file(args.groups_file, args.input_dir, args.verbose)
    self.gf_nodup = self.gf.drop_duplicates(subset='fname').reset_index(drop=True)

    # validate that the target is in the groups file
    target = self.args.target
    if target and target not in self.gf['group'].values:
        eprint(f'target {target} not in groups file {self.args.groups_file}')
        eprint('Possible targets:', sorted(self.gf['group'].unique()))
        raise IllegalArgumentError()

def bed2betas(args):
    # merge with the reference CpG bed file, so the number of lines
    # in the file will include all 28217448 sites (with NaN as 0)
    region = 'chr1:10469-876225' if args.debug else None
    nrows = 10000 if args.debug else None
    try:
        rf = None  # reference dictionary
        for bed in args.bed_paths:
            eprint(f'[wt bed] Converting {op.basename(bed)}...')

            # check if bed should be skipped
            outpath = op.join(args.outdir, splitextgz(op.basename(bed))[0] + '.beta')
            if not delete_or_skip(outpath, args.force):
                continue

            # load dict (at most once) and bed
            if rf is None:
                rf = load_dict_section(region, args.genome)
            df = load_bed(bed, nrows, args.add_one)  # todo: implement in C++

            # merge dict with bed, then dump
            res = rf.merge(df, how='left', on=['chr', 'start']).fillna(0)
            trim_to_uint8(np.array(res[['meth', 'total']])).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint(f'[wt bed] Invalid input file.\n{e}')
        return

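# A toy illustration of the dict/bed merge in bed2betas: a left-join on the
# reference CpG dictionary keeps every CpG site, filling sites absent from
# the bed with 0. All values below are made up.
import pandas as pd

rf = pd.DataFrame({'chr': ['chr1'] * 4, 'start': [100, 130, 155, 180]})
df = pd.DataFrame({'chr': ['chr1', 'chr1'], 'start': [130, 180],
                   'meth': [3, 1], 'total': [5, 4]})
res = rf.merge(df, how='left', on=['chr', 'start']).fillna(0)
print(res)
#     chr  start  meth  total
# 0  chr1    100   0.0    0.0
# 1  chr1    130   3.0    5.0
# 2  chr1    155   0.0    0.0
# 3  chr1    180   1.0    4.0
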
def main(): """ Merge files. Accumulate all reads / observations from multiple (>=2) input files, and output a single file of the same format. Supported formats: pat.gz, beta """ args = parse_args() # validate input files input_files = args.input_files # construct output path out_path = args.prefix + splitextgz(args.input_files[0])[1] if op.realpath(out_path) in [op.realpath(p) for p in args.input_files]: eprint('[wt merge] Error output path is identical ' \ 'to one of the input files {out_path}') return if not delete_or_skip(out_path, args.force): return files_type = splitextgz(input_files[0])[1][1:] if files_type in ('beta', 'bin'): merge_betas(input_files, out_path) elif files_type == 'pat.gz': MergePats(input_files, args.prefix + '.pat.gz', args.labels, args).merge_pats() else: print('Unknown input format:', input_files[0]) return
def bgzip_tabix_dict(self, dict_path):
    eprint('[wt init] bgzip and index...')
    subprocess.check_call(f'bgzip -@ {self.args.threads} -f {dict_path}', shell=True)
    subprocess.check_call(f'tabix -Cf -b 2 -e 2 {dict_path}.gz', shell=True)
    return dict_path + '.gz'

def load_seq_by_chrom(chrom, ref_path, fai_df, debug):
    eprint(chrom)

    # get the chromosome's location in the fasta
    chrom, size, offset, width = fai_df[fai_df['chr'] == chrom].values[0]

    # load the chromosome's subsequence from the fasta
    with open(ref_path, 'r') as f:
        f.seek(offset)
        # number of lines to read for the current chromosome
        nr_lines = size // (width - 1) + 1
        to_read = nr_lines * width
        if debug:
            to_read = min(to_read, 100 * width)
        txt = f.read(to_read)
        seq = ''.join(s.strip() for s in txt.split('\n')).upper()

    # remove possible trailing characters (belonging to the next chromosome)
    end_pos = seq.rfind('>')
    if end_pos != -1:
        seq = seq[:end_pos]

    # validate sequence length
    if len(seq) != size and not debug:
        raise IllegalArgumentError(f'Error while loading {chrom} from fasta: '
                                   f'read {len(seq)} bases instead of {size}')

    # find CpG site loci
    tf = pd.DataFrame([m.start() + 1 for m in re.finditer('CG', seq)], columns=['loc'])
    tf['chr'] = chrom
    return tf[['chr', 'loc']]

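# A self-contained check of the CpG-locating step in load_seq_by_chrom:
# re.finditer('CG', seq) yields 0-based match positions, and +1 converts
# them to the 1-based loci used downstream. The sequence is made up.
import re

seq = 'TTCGGACGCG'
print([m.start() + 1 for m in re.finditer('CG', seq)])  # [3, 7, 9]
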
def validate_nr_sites(self, nr_sites):
    if self.args.debug:
        return
    d = {'mm9': 13120864, 'hg19': 28217448}
    if self.name in d and nr_sites != d[self.name]:
        msg = '[wt init] WARNING: number of sites of the reference genome '
        msg += f'{self.name} is usually {d[self.name]}, but you got {nr_sites}'
        eprint(msg)

def is_region_empty(view_cmd, region, verbose):
    # check if there are reads in the bam file for the requested region
    view_cmd += ' | head -1'
    if not subprocess.check_output(view_cmd, shell=True,
                                   stderr=subprocess.PIPE).decode().strip():
        eprint(f'[wt bam2pat] Skipping region {region}, no reads found')
        if verbose:
            eprint('[wt bam2pat] ' + view_cmd)
        return True
    return False

def main(): """ Find markers (blocks) to differentiate between two or more groups of samples (collapsed beta files or homog binary files). """ args = parse_args() if not args.uxm: eprint('Only --uxm mode is currently supported') raise NotImplementedError #todo: implement MarkersFinder(args).run()
def print_help(short=False):
    msg = 'Usage: wgbs_tools.py COMMAND [OPTIONS]\n\nOptional commands:\n'
    for key in sorted(callbacks.keys()):
        docs = callbacks[key].__doc__
        msg += '\n- ' + key
        if docs and not short:
            msg += docs
    if short:
        msg += '\nUse [-h] or COMMAND -h flag for additional information'
    eprint(msg)

def main(): """ Change the default genome reference. """ args = parse_args() if args.name: set_def_ref(args.name) eprint(f'[wt def] changed default genome to {args.name}') if args.ls: print_genomes() elif not args.name: eprint('[wt def] you must specify either --name or -ls')
def load_bed(bed_path, nrows=None):
    try:
        # TODO: handle a bed with a header line? But support stdin as input...
        df = pd.read_csv(bed_path, sep='\t', header=None, nrows=nrows, comment='#')
        df.columns = COORDS_COLS3 + list(df.columns)[3:]
        return df
    except pd.errors.EmptyDataError:
        eprint('[wt convert] ERROR: empty bed file')
        raise IllegalArgumentError('Invalid bed file')

def dump_params(self):
    """ Dump a parameter file """
    outpath = op.join(self.args.out_dir, 'params.txt')
    with open(outpath, 'w') as f:
        for key in vars(self.args):
            val = getattr(self.args, key)
            if key == 'beta_list_file':
                val = None
            if key == 'betas':
                val = ' '.join(val)
            f.write(f'{key}:{val}\n')
    eprint(f'dumped parameter file to {outpath}')

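# A tiny illustration of the vars()-based dump in dump_params: the
# attributes of an argparse.Namespace iterate like a dict. The values
# below are made up.
import argparse

args = argparse.Namespace(min_cov=5, betas=['a.beta', 'b.beta'])
for key in vars(args):
    print(f'{key}:{getattr(args, key)}')
# min_cov:5
# betas:['a.beta', 'b.beta']
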
def load_data_chunk(self, blocks_df):
    # load methylation data from beta files, collapsed to the blocks in blocks_df
    if self.verbose:
        self.chunk_count += 1
        nr_samples = len(self.gf['fname'].unique())
        eprint(f'{self.chunk_count}/{self.nr_chunks}) '
               f'loading data for {blocks_df.shape[0]:,} blocks over'
               f' {nr_samples} samples...')
    return get_table(blocks_df=blocks_df.copy(),
                     gf=self.gf,
                     min_cov=self.args.min_cov,
                     threads=self.args.threads,
                     verbose=False,
                     group=False)

def get_fasta(self):
    # download the fasta from UCSC, unless a fasta file is provided
    if self.ref_path is not None:
        validate_single_file(self.ref_path)
        return

    # no FASTA path provided. Attempt to download one
    ref_path = op.join(self.out_dir, f'{self.name}.fa.gz')
    url = f'https://hgdownload.soe.ucsc.edu/goldenPath/{self.name}/bigZips/{self.name}.fa.gz'
    cmd = f'curl {url} -o {ref_path}'
    eprint(f'[wt init] No reference FASTA provided. Attempting to download from\n\t{url}')
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode:
        eprint(f'[wt init] Failed downloading reference for genome {self.name}: '
               f'{p.returncode}\n{output.decode()}\n{error.decode()}')
        eprint('[wt init] Try downloading it yourself and use the --fasta_name flag, '
               'or check the "name" parameter')
        raise IllegalArgumentError('[wt init] No reference FASTA found')
    eprint('[wt init] successfully downloaded FASTA. Now gunzip and bgzip it...')
    cmd = f'gunzip {ref_path} && bgzip -@ {self.args.threads} {ref_path[:-3]}'
    subprocess.check_call(cmd, shell=True)
    self.ref_path = ref_path

def run_single_pat(pat, args):
    eprint(pat)
    fl = FragLen(pat, args)
    if args.region or args.sites:
        x = fl.run_small_region()
    elif args.bed_file:
        x = fl.run_bed()
    else:
        x = fl.run_whole_genome()
    if not x.sum():
        eprint(f'[wt frag] Empty list of lengths for {pat}')
        return

    # print values to stdout
    if args.verbose:
        np.savetxt(sys.stdout, x.reshape((1, -1)), fmt='%s', delimiter=' ')

    # plot
    if args.outdir or args.display:
        if args.verbose:
            eprint('[wt frag] plotting...')
        plot_hist(x.flatten(), args.max_frag_size, pat)

        # dump figure
        if args.outdir:
            fpath = compose_fig_path(pat, args.outdir)
            if args.verbose:
                eprint(f'[wt frag] dumping {fpath}...')
            plt.savefig(fpath)

def run(self):
    # load all data
    self.blocks = self.load_blocks_file()
    self.dfU, self.dfM = self.load_bins()
    for group in sorted(self.gf['group'].unique()):
        if self.args.target and group != self.args.target:
            continue
        eprint(group)
        self.group = group
        tfU = self.find_markers_group(self.dfU, 'U')
        tfM = self.find_markers_group(self.dfM, 'M')
        tf = pd.concat([tfU, tfM])
        self.dump_results(tf)

def set_regions(self):
    if self.gr.region_str:
        return [self.gr.region_str]

    cmd = f'samtools idxstats {self.bam_path} | cut -f1'
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode or not output:
        eprint(f'[wt bam2pat] Failed with samtools idxstats: {p.returncode}\n'
               f'{output.decode()}\n{error.decode()}')
        eprint(cmd)
        eprint('[wt bam2pat] failed to find chromosomes')
        return []
    nofilt_chroms = output.decode()[:-1].split('\n')
    filt_chroms = [c for c in nofilt_chroms if 'chr' in c]
    if filt_chroms:
        filt_chroms = [c for c in filt_chroms if re.match(r'^chr([\d]+|[XYM])$', c)]
    else:
        filt_chroms = [c for c in nofilt_chroms if c in CHROMS]
    chroms = list(sorted(filt_chroms, key=chromosome_order))
    if not chroms:
        eprint('[wt bam2pat] Failed retrieving valid chromosome names')
        raise IllegalArgumentError('Failed')
    return chroms

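# A quick check of the chromosome-filtering regex in set_regions: it keeps
# canonical names (chr1..chr22, chrX, chrY, chrM) and drops alternates and
# scaffolds. The names below are typical examples.
import re

names = ['chr1', 'chr22', 'chrX', 'chrM', 'chr1_KI270706v1_random', 'chrUn_GL000195v1']
print([c for c in names if re.match(r'^chr([\d]+|[XYM])$', c)])
# ['chr1', 'chr22', 'chrX', 'chrM']
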
def add_anno(self):
    if self.args is None or self.is_whole() or 'no_anno' not in self.args:
        return
    elif self.args.no_anno:
        return
    anno_path = self.genome.annotations
    if anno_path is None:
        return
    try:
        cmd = f'tabix {anno_path} {self.region_str} | cut -f4- | uniq'
        return subprocess.check_output(cmd, shell=True).decode().strip()
    except subprocess.CalledProcessError:
        eprint(f'Failed to retrieve annotation for region {self.region_str}')

def run_command():
    try:
        command = sys.argv[1]
        if command not in callbacks.keys():
            eprint('Invalid command:', command)
            print_help(short=True)
            return 1
        with patch.object(sys, 'argv', sys.argv[1:]):
            callbacks[command]()
    except IllegalArgumentError as e:
        eprint(f'Invalid input argument\n{e}')
        return 1