def view_gr(pat, args, get_cmd=False):
    """
    Compose, and unless asked otherwise run, the shell pipeline that views
    a pat file restricted to the genomic region described by `args`.

    :param pat: path to the input file; validated to be a '.pat.gz' file
    :param args: parsed command-line arguments. Used to build the
                 GenomicRegion and the view flags; `sub_sample` (optional
                 attribute) and `out_path` are read directly
    :param get_cmd: when True, return the composed command without the
                    output redirection and without running it
    :return: the command string if get_cmd is True, otherwise None
    """
    validate_single_file(pat, '.pat.gz')
    region = GenomicRegion(args)

    # first pipeline stage: stream the relevant part of the pat file
    if not region.is_whole():
        s, e = region.sites
        # NOTE(review): the tabix query is widened by MAX_PAT_LEN sites
        # upstream, presumably to catch reads that start before the region
        # but overlap it - confirm
        first_site = max(1, s - MAX_PAT_LEN)
        cmd = f'tabix {pat} {region.chrom}:{first_site}-{e - 1} '
    else:
        s = 1
        e = region.genome.get_nr_sites() + 1
        cmd = f'gunzip -c {pat} '

    flags = set_view_flags(args)
    cmd = cmd + f' | {cview_tool} --sites "{s}\t{e}" ' + flags

    # sub-sample reads
    if getattr(args, 'sub_sample', None) is not None:
        validate_local_exe(pat_sampler)
        cmd += f' | {pat_sampler} {args.sub_sample} '

    if not region.is_whole():
        cmd += ' | sort -k2,2n -k3,3 '
    cmd += f' | {collapse_pat_script} - '

    if get_cmd:
        return cmd

    if args.out_path is not None:
        cmd += f' > {args.out_path}'
    subprocess_wrap_sigpipe(cmd)
def break_to_chunks(self):
    """
    Break range of sites to chunks of size 'step' (self.args.chunk_size),
    while keeping chromosomes / bed regions separated.

    The regions to chunk come from self.args.bed_file when it is set;
    otherwise from the GenomicRegion built from self.args (every
    chromosome for a whole-genome run, a single region otherwise).

    :return: three parallel lists (tags, starts, ends). Each chunk spans
             [starts[i], ends[i]) and is tagged with the '<start>-<end>'
             string of the region it was cut from, so chunks of the same
             region can be merged later on.
    :raises IllegalArgumentError: if the bed file fails is_block_file_nice
    """
    # print a warning in case chunk size is too small
    step = self.args.chunk_size
    if step < self.args.max_cpg:
        msg = '[wt segment] WARNING: chunk_size is small compared to max_cpg and/or max_bp.\n' \
              ' It may cause wt segment to fail. It\'s best setting\n' \
              ' chunk_size > min{max_cpg, max_bp/2}'
        eprint(msg)

    if self.args.bed_file:
        df = load_blocks_file(self.args.bed_file)[['startCpG', 'endCpG']].dropna()
        # make sure bed file has no overlaps or duplicated regions
        is_nice, msg = is_block_file_nice(df)
        if not is_nice:
            msg = '[wt segment] ERROR: invalid bed file.\n' \
                  f' {msg}\n' \
                  f' Try: sort -k1,1 -k2,2n {self.args.bed_file} | ' \
                  'bedtools merge -i - | wgbstools convert --drop_empty -p -L -'
            eprint(msg)
            raise IllegalArgumentError('Invalid bed file')
        if df.shape[0] > 2 * 1e4:
            msg = '[wt segment] WARNING: bed file contains many regions.\n' \
                  ' Segmentation will take a long time.\n' \
                  f' Consider running w/o -L flag and intersect the results\n'
            eprint(msg)

    else:  # No bed file provided
        gr = GenomicRegion(self.args)

        # whole genome - make a dummy "bed file" of the full chromosomes
        if gr.is_whole():
            cf = self.genome.get_chrom_cpg_size_table()
            cf['endCpG'] = np.cumsum(cf['size']) + 1
            cf['startCpG'] = cf['endCpG'] - cf['size']
            df = cf[['startCpG', 'endCpG']]

        # one region
        else:
            df = pd.DataFrame(columns=['startCpG', 'endCpG'], data=[gr.sites])

    # build the chunks, with a "tag"/label per chunk,
    # so we know which chunks to merge later on.
    tags = []
    starts = []
    ends = []
    for _, row in df.iterrows():
        start, end = row
        # chunk borders: start, start + step, ..., and always the region end
        bords = list(range(start, end, step)) + [end]
        tags += [f'{start}-{end}'] * (len(bords) - 1)
        starts += bords[:-1]
        ends += bords[1:]
    return tags, starts, ends
def __init__(self, args):
    """
    Keep the run configuration, verify the output directory and load the
    reference data.

    :param args: parsed command-line arguments; `debug`, `outdir` and
                 `genome` are read here
    :raises IllegalArgumentError: if args.outdir is not an existing directory
    """
    self.args = args
    self.gr = GenomicRegion(args)
    self.debug = args.debug

    out_dir = args.outdir
    self.outdir = out_dir
    if not op.isdir(out_dir):
        raise IllegalArgumentError('Invalid output directory: ' + out_dir)

    ref_paths = GenomeRefPaths(args.genome)
    self.chrom_sizes = ref_paths.chrom_sizes
    self.ref_dict = self.load_dict()
def insert_borders(self, markers):
    """
    Insert the BORDER character into the text table and into the markers
    line, at the column positions returned by load_borders.

    :param markers: the markers line (a string) that accompanies the table
    :return: (text, markers) - the table rendered as text and the markers
             line, both with borders inserted. When no borders are found,
             the untouched self.fullres['text'] and `markers` are returned.
    """
    table = self.fullres['table']
    first_site = self.fullres['start']

    # load borders from file
    # build gr to span the whole table
    site_range = f'{first_site}-{first_site + table.shape[1]}'
    span_gr = GenomicRegion(sites=site_range, genome_name=self.gr.genome_name)
    borders = load_borders(self.blocks_path, span_gr, self.args.genome)
    if not borders.size:
        return self.fullres['text'], markers

    # pad right columns with space, if there are missing sites before the last border/s
    pad_width = borders[-1] - table.shape[1]
    if pad_width > 0:
        padding = np.chararray((table.shape[0], pad_width))
        padding[:] = ' '
        table = np.concatenate([table, padding], axis=1)

    # insert the borders:
    bordered_table = np.insert(table, borders, BORDER, axis=1)
    txt = table2text(bordered_table)

    # insert the borders to the markers line:
    padded_markers = markers.ljust(table.shape[1])
    markers_col = np.array(list(padded_markers))[:, None]
    rmark = ''.join(np.insert(markers_col, borders, BORDER))
    return txt, rmark
def __init__(self, args):
    """
    Copy the relevant command-line arguments onto the instance, build the
    genomic region and validate the input.

    :param args: parsed command-line arguments; `out_dir`, `bam_path` and
                 `debug` are read here
    """
    self.args = args

    # plain copies of command-line values
    self.out_dir = args.out_dir
    self.bam_path = args.bam_path
    self.debug = args.debug

    self.gr = GenomicRegion(args)

    self.validate_input()
def convert_single_region(args):
    """
    Print the genomic region described by `args`.

    With args.parsable set, only the converted coordinate is printed: the
    region string when the input was given as sites (args.sites), otherwise
    the sites range formatted as '<start>-<end>'. Without it, the
    GenomicRegion object itself is printed.

    :param args: parsed command-line arguments
    """
    region = GenomicRegion(args)

    if not args.parsable:
        print(region)
        return

    if args.sites:
        print(region.region_str)
    else:
        print('{}-{}'.format(*region.sites))
def __repr__(self):
    """
    Build the textual representation: an optional base-contig line (only
    when generate_type is 'backward'), followed by `repeat_time` identical
    lines, each made of `margin` copies of an empty GenomicRegion + SVLink
    pair and then the repeat contig.
    """
    if self.generate_type == 'backward':
        prefix = repr(self.base_contig) + '\n'
    else:
        prefix = ''

    spacer = repr(GenomicRegion()) + repr(SVLink())
    unit = spacer * self.margin + repr(self.repeat_contig)
    return prefix + '\n'.join([unit] * self.repeat_time)
def compare_all_paires(args):
    """
    Plot a lower-triangular grid of pairwise comparisons between beta files.

    Panel (i, j), for j <= i, is drawn by comp2 from the i'th and j'th
    tables. File names (without directory and extension) label the first
    column and the first row.

    :param args: parsed command-line arguments. `betas` (paths), `min_cov`,
                 `outpath` and `show` are read here; the sites range comes
                 from GenomicRegion(args).
    """
    betas = args.betas
    sites = GenomicRegion(args).sites
    tables = [load_beta_data(b, sites) for b in betas]
    names = [op.splitext(op.basename(b))[0] for b in betas]

    # break names to lines of at most k characters, so long names fit the labels
    k = 20
    nnames = ['\n'.join(n[i:i + k] for i in range(0, len(n), k)) for n in names]

    N = len(tables)
    # squeeze=False keeps axs a 2D array even for N == 1; by default
    # plt.subplots(1, 1) returns a bare Axes and the indexing below fails.
    fig, axs = plt.subplots(N, N, squeeze=False)
    for i in range(N):
        for j in range(i + 1):
            comp2(tables[i], tables[j], args.min_cov, axs[i, j])
        axs[i, 0].set_ylabel(nnames[i], fontsize=8)
    for j in range(N):
        axs[0, j].set_title(nnames[j], fontsize=8)

    for ax in axs.flat:
        ax.label_outer()
    fig.tight_layout()

    if args.outpath is not None:
        plt.savefig(args.outpath)
        eprint(f'[wt cmp] dumped figure to {args.outpath}')

    if args.show or args.outpath is None:
        plt.show()
def main():
    """
    View the content of input file (pat/beta) as plain text.
    Possible filter by genomic region or sites range
    Output to stdout as default

    The viewer is chosen by the input file's extension:
    '.beta' -> view_beta, '.lbeta' / '.bin' -> view_other_bin,
    '.pat.gz' -> cview.

    :raises IllegalArgumentError: for any other input extension
    """
    parser = parse_args()
    args = parser.parse_args()

    rate = args.sub_sample
    if rate is not None and not 0 <= rate <= 1:
        parser.error('[wt view] sub-sampling rate must be within [0.0, 1.0]')

    # validate input file
    input_file = args.input_file
    validate_single_file(input_file)

    try:
        if input_file.endswith('.beta'):
            gr = GenomicRegion(args)
            view_beta(input_file, gr, args.out_path, args.bed_file)
        elif op.splitext(input_file)[1] in ('.lbeta', '.bin'):
            view_other_bin(input_file, args)
        elif input_file.endswith('.pat.gz'):
            cview(input_file, args)
        else:
            # a single formatted message: passing the path as a second
            # positional argument makes the error print as a tuple
            raise IllegalArgumentError(f'Unknown input format: {input_file}')

    except BrokenPipeError:
        catch_BrokenPipeError()
def __init__(self, args):
    """
    Load everything needed for the visualization, then print it (and plot
    it, when args.plot is set).

    :param args: parsed command-line arguments; `dists`, `input_files`,
                 `blocks_path`, `color_scheme` and `plot` are read here
    """
    self.gr = GenomicRegion(args)
    self.start, self.end = self.gr.sites
    self.nr_sites = self.end - self.start
    self.args = args

    # load distances
    if args.dists:
        self.distances = self.load_pairwise_dists()
    else:
        self.distances = None

    # drop duplicated files, while keeping original order
    self.files = list(dict.fromkeys(args.input_files))

    # load raw data:
    self.dsets = self.load_data()

    # load borders:
    if args.blocks_path:
        self.borders = load_borders(args.blocks_path, self.gr)
    else:
        self.borders = None

    # Generate colors dictionary
    self.num2color_dict = generate_colors_dict(args.color_scheme)

    self.print_all()
    if self.args.plot:
        self.plot_all()
def main():
    """
    Calculate the average coverage of one or more beta files.
    Print the results, one '<name>\\t<coverage>' line per beta file, and
    plot a histogram of them when args.plot is set.
    """
    args = parse_args()

    sites = GenomicRegion(args).sites
    blocks_df = load_blocks_file(args.bed_file) if args.bed_file else None

    params = [(beta, sites, blocks_df, False) for beta in args.betas]

    # Always close and join the pool, even when a worker raises,
    # so the worker processes are not left behind.
    pool = Pool(args.threads)
    try:
        covs = pool.starmap(beta_cov, params)
    finally:
        pool.close()
        pool.join()

    for cov, beta_path in zip(covs, args.betas):
        print('{}\t{:.2f}'.format(pretty_name(beta_path), cov))

    if args.plot:
        plot_hist([pretty_name(b) for b in args.betas], covs)
def __init__(self, unq, args, gr=None):
    """
    Store the inputs and prepare the two awk snippets that accumulate and
    print the per-length array.

    :param unq: stored as self.unq
    :param args: parsed command-line arguments; `max_frag_size` is read here
    :param gr: genomic region to use; when falsy, one is built from `args`
    """
    self.args = args
    self.unq = unq
    if gr:
        self.gr = gr
    else:
        self.gr = GenomicRegion(args)

    cap = args.max_frag_size
    # values of column 3 above the cap are clipped to the cap before accumulating
    self.fill_arr_cmd = ' {if ($3 > %s) {$3 = %s}; arr[$3] += $5}' % (cap, cap)
    # at the end, print arr[1..cap]; the trailing quote closes the awk program
    self.print_arr_cmd = " END {for (x=1; x <= %s; x++) print arr[x]}'" % cap
def main():
    """
    Convert beta file to bed file.

    Parses the command line, checks that the input is a single '.beta'
    file and hands the conversion over to beta_to_bed.
    """
    args = parse_args()
    beta_path = args.beta_path
    validate_single_file(beta_path, '.beta')

    region = GenomicRegion(args)
    beta_to_bed(beta_path,
                region,
                args.bed_file,
                args.min_cov,
                args.mean,
                args.keep_na,
                args.force,
                args.outpath)
def __init__(self, args):
    """
    Set up a mixing run: keep the inputs, then validate the labels and
    rates, read the coverages, adjust the rates and generate the output
    prefix.

    :param args: parsed command-line arguments; `pat_files`, `cov`,
                 `bed_file`, `labels`, `rates`, `out_dir` and `prefix`
                 are read here
    """
    eprint('mixing...')
    self.args = args
    self.gr = GenomicRegion(args)
    self.pats = args.pat_files
    self.dest_cov = args.cov

    if args.bed_file:
        self.bed = load_blocks_file(args.bed_file)
    else:
        self.bed = None

    # one stats row per pat file, indexed by its base name without extension
    sample_names = [splitextgz(op.basename(f))[0] for f in self.pats]
    self.stats = pd.DataFrame(index=sample_names)
    self.nr_pats = len(self.pats)

    self.labels = self.validate_labels(args.labels)
    self.dest_rates = self.validate_rates(args.rates)
    self.covs = self.read_covs()
    self.adj_rates = self.adjust_rates()
    self.prefix = self.generate_prefix(args.out_dir, args.prefix)
def main(args):
    """
    Print the genomic region, then for every input pat file print its name
    and its PatVis results.

    :param args: parsed command-line arguments; `input_files` must all be
                 '.pat.gz' files
    """
    validate_files_list(args.input_files, '.pat.gz')

    print(GenomicRegion(args))

    for path in args.input_files:
        sample_name = splitextgz(op.basename(path))[0]
        print(sample_name)  # print file name
        PatVis(args, path).print_results()
def __init__(self, args, bam):
    """
    Store the configuration for one bam file, then run the whole job:
    start the threads and clean up afterwards.

    :param args: parsed command-line arguments; `verbose` and `out_dir`
                 are read here
    :param bam: path of the bam file, stored as self.bam_path
    """
    self.args = args
    self.bam_path = bam
    self.tmp_dir = None

    # plain copies of command-line values
    self.verbose = args.verbose
    self.out_dir = args.out_dir

    self.gr = GenomicRegion(args)

    self.start_threads()
    self.cleanup()
def merge_pats(self):
    """
    Build the view flags for every pat file and pass them to
    fast_merge_pats.

    The flags depend only on self.args, so they (and the GenomicRegion
    behind the '-s' flag) are computed once and repeated for each pat
    file. With no pat files, fast_merge_pats receives None and no
    GenomicRegion is built.
    """
    nr_pats = len(self.pats)
    if not nr_pats:
        self.fast_merge_pats(None)
        return

    flags = ' '
    if self.args.strict:
        flags += ' --strict'
    if self.args.min_len:
        flags += ' --min_len {}'.format(self.args.min_len)
    if self.args.bed_file is not None:
        flags += ' -L {}'.format(self.args.bed_file)
    gr = GenomicRegion(self.args)
    if not gr.is_whole():
        flags += ' -s {}-{}'.format(*gr.sites)

    # one (identical) flags string per pat file
    self.fast_merge_pats([flags] * nr_pats)
def __init__(self, args):
    """
    Keep the run configuration, verify the output directory and load the
    chromosome sizes of the reference genome.

    :param args: parsed command-line arguments; `outdir` and `genome`
                 are read here
    :raises IllegalArgumentError: if args.outdir is not an existing directory
    """
    self.args = args
    self.gr = GenomicRegion(args)

    out_dir = args.outdir
    self.outdir = out_dir
    self.name = ''
    if not op.isdir(out_dir):
        raise IllegalArgumentError('Invalid output directory: ' + out_dir)

    ref_paths = GenomeRefPaths(args.genome)
    self.chrom_sizes = ref_paths.chrom_sizes
def main():
    """
    Compare between pairs of beta files, by plotting a 2d histogram for every pair.
    Drop sites with low coverage (< cov_thresh argument),
    for performance and robustness.

    At least two '.beta' files are required.
    """
    args = parse_args()
    validate_files_list(args.betas, '.beta', min_len=2)

    sites = GenomicRegion(args).sites
    compare_all_paires(args.betas, args.min_cov, sites)
def __init__(self, args, bam):
    """
    Store the configuration for one bam file and start the threads.

    :param args: parsed command-line arguments; `verbose`, `out_dir`,
                 `homog_prop` and `min_cpg` are read here
    :param bam: path of the bam file, stored as self.bam_path
    """
    self.args = args
    self.bam_path = bam
    self.tmp_dir = None

    # plain copies of command-line values
    self.verbose = args.verbose
    self.out_dir = args.out_dir
    self.homog_prop = args.homog_prop
    self.min_cpg = args.min_cpg

    self.gr = GenomicRegion(args)

    self.start_threads()
def __init__(self, args, pat_path):
    """
    Store the view parameters for one pat file and load its block.

    :param args: parsed command-line arguments; `max_reps`, `blocks_path`
                 and `uxm` are read here
    :param pat_path: path of the pat file, stored as self.pat_path
    """
    self.gr = GenomicRegion(args)
    self.args = args

    # a non-positive max_reps means "no limit"
    if args.max_reps > 0:
        self.max_reps = args.max_reps
    else:
        self.max_reps = sys.maxsize

    self.start, self.end = self.gr.sites
    self.pat_path = pat_path
    self.blocks_path = args.blocks_path
    self.uxm = args.uxm
    # counters for the three categories, all starting at zero
    self.uxm_counts = dict.fromkeys('UXM', 0)

    self.fullres = self.get_block()
def view_pat_mult_proc(input_file, strict, sub_sample, grs, i, step):
    """
    Run the pat-view awk command for one batch of regions and collect the
    raw outputs.

    :param input_file: pat file passed on to ViewPat
    :param strict: passed on to ViewPat
    :param sub_sample: passed on to ViewPat
    :param grs: indexable collection of region strings
    :param i: index of the first region of this batch
    :param step: batch size; the batch is clipped at the end of `grs`
    :return: list with the output (bytes, as returned by
             subprocess.check_output) of each region's command, in order
    """
    stop = min(len(grs), i + step)
    outputs = []
    for idx in range(i, stop):
        region = GenomicRegion(region=grs[idx])
        viewer = ViewPat(input_file, sys.stdout, region, strict, sub_sample)
        awk_cmd = viewer.compose_awk_cmd()
        outputs.append(subprocess.check_output(awk_cmd, shell=True))
    return outputs
def main(args):
    """
    Print the genomic region, then for every distinct input pat file print
    its name and its PatVis results.

    :param args: parsed command-line arguments; `input_files` must all be
                 '.pat.gz' files
    """
    validate_file_list(args.input_files, '.pat.gz')

    # drop duplicated files, while keeping original order
    unique_files = drop_dup_keep_order(args.input_files)

    print(GenomicRegion(args))

    for path in unique_files:
        sample_name = splitextgz(op.basename(path))[0]
        print(sample_name)  # print file name
        PatVis(args, path).print_results()
def __init__(self, args, file):
    """
    Store the view parameters for one file and load its block.

    :param args: parsed command-line arguments; `max_reps`, `strict`,
                 `min_len`, `no_color`, `blocks_path` and `no_dense`
                 are read here
    :param file: the input file, stored as self.file
    """
    self.gr = GenomicRegion(args)

    # a non-positive max_reps means "no limit"
    if args.max_reps > 0:
        self.max_reps = args.max_reps
    else:
        self.max_reps = sys.maxsize

    self.strict = args.strict
    self.min_len = args.min_len
    self.start, self.end = self.gr.sites
    self.file = file
    self.no_color = args.no_color

    # maximal width of the output (in characters)
    region_width = self.end - self.start
    self.max_width = region_width + 2 * MAX_PAT_LEN

    self.blocks_path = args.blocks_path
    self.no_dense = args.no_dense
    self.fullres = self.get_block()
def main():
    """
    Test whether region is bimodal

    Without a bed file, the single region described by the arguments is
    tested; with one, all of its regions are tested.
    """
    parser = add_args()
    args = parse_args(parser)

    if args.bed_file is None:
        region = GenomicRegion(args)
        test_single_region(args.pat, region.chrom, region.sites,
                           args.strict, args.min_len)
        return

    test_multiple_regions(args.bed_file,
                          args.pat,
                          args.threads,
                          args.out_file,
                          args.strict,
                          args.min_len,
                          args.verbose)
def slow_conversion(df, genome):
    """
    Convert genomic loci to CpG-index ranges, one row at a time.

    :param df: DataFrame whose first three columns are formatted into a
               '<col0>:<col1>-<col2>' region string; extra columns are dropped
    :param genome: genome name passed to GenomicRegion
    :return: a new DataFrame with the first three columns of `df` plus
             'startCpG' and 'endCpG' (nullable 'Int64' values). Rows whose
             region raises IllegalArgumentError get missing values.
             The input DataFrame is not modified.
    """
    # work on an explicit copy: adding columns to a slice of the caller's
    # frame is a chained assignment (SettingWithCopyWarning)
    df = df.iloc[:, :3].copy()
    startCpGs = []
    endCpGs = []
    for _, row in df.iterrows():
        try:
            sites = GenomicRegion(region='{}:{}-{}'.format(*row), genome_name=genome).sites
        except IllegalArgumentError:
            # invalid region: keep the row, with missing CpG indexes
            sites = (np.nan, np.nan)
        startCpGs.append(sites[0])
        endCpGs.append(sites[1])
    df['startCpG'] = pd.Series(startCpGs, dtype='Int64').values
    df['endCpG'] = pd.Series(endCpGs, dtype='Int64').values
    return df
def main():
    """
    Convert genomic region to CpG index range and vice versa

    With a bed file (args.bed_path) the whole file is converted, and
    combining it with a region or sites argument is rejected with a
    message. Otherwise the single region described by the arguments
    is printed.
    """
    args = parse_args()

    if not args.bed_path:
        print(GenomicRegion(args))
        return

    if args.region or args.sites:
        eprint('-L, -s and -r are mutually exclusive')
        return

    convert_bed_file(args)
def __init__(self, args):
    """
    Set up a mixing run: keep the inputs, then validate the labels and
    rates, read the coverages, adjust the rates and generate the output
    prefix.

    :param args: parsed command-line arguments; `pat_files`, `cov`,
                 `bed_file`, `labels`, `rates`, `out_dir` and `prefix`
                 are read here
    """
    print('in mixer')
    self.args = args
    self.gr = GenomicRegion(args)
    self.pats = args.pat_files
    self.dest_cov = args.cov

    if args.bed_file:
        self.bed = BedFileWrap(args.bed_file)
    else:
        self.bed = None

    # one stats row per pat file, indexed by its base name without extension
    sample_names = [splitextgz(op.basename(f))[0] for f in self.pats]
    self.stats = pd.DataFrame(index=sample_names)
    self.nr_pats = len(self.pats)

    self.labels = self.validate_labels(args.labels)
    self.dest_rates = self.validate_rates(args.rates)
    self.covs = self.read_covs()
    self.adj_rates = self.adjust_rates()
    self.prefix = self.generate_prefix(args.out_dir, args.prefix)
def multi_FragLen(args):
    """
    Run run_single_unq on every unq file, over the regions requested on the
    command line, and show the figures when args.display is set.

    The regions are: the single region given by -r / -s, or the regions of
    the bed file (-L), or an empty list when none was given. Combining a
    bed file with -r / -s is rejected with a message.

    :param args: parsed command-line arguments
    """
    single_range = args.region or args.sites

    if args.bed_file and single_range:
        eprint('-L, -s and -r are mutually exclusive')
        return

    if single_range:
        grs = [GenomicRegion(args)]
    elif args.bed_file:
        grs = BedFileWrap(args.bed_file).iter_grs()
    else:
        grs = []

    for unq_path in args.unq_paths:
        run_single_unq(unq_path, grs, args)

    if args.display:
        plt.show()
def main():
    """
    View the content of input file (pat/unq/beta) as plain text.
    Possible filter by genomic region or sites range
    Output to stdout as default

    The viewer is chosen by the input file's extension:
    '.beta' / '.bin' -> view_beta, '.pat.gz' -> ViewPat (or
    view_pat_bed_multiprocess when a bed file is given),
    '.unq.gz' -> ViewUnq for each region.

    :raises IllegalArgumentError: for any other input extension
    """
    args = parse_args()

    # validate input file
    input_file = args.input_file
    validate_single_file(input_file)

    if args.sub_sample is not None and not 1 > args.sub_sample > 0:
        eprint('sub-sampling rate must be within (0.0, 1.0)')
        return

    if args.bed_file and (args.region or args.sites):
        eprint('-L, -s and -r are mutually exclusive')
        return

    bed_wrapper = BedFileWrap(args.bed_file) if args.bed_file else None
    gr = GenomicRegion(args)

    try:
        if input_file.endswith(('.beta', '.bin')):
            view_beta(input_file, gr, args.out_path)
        elif input_file.endswith('.pat.gz'):
            if bed_wrapper:
                view_pat_bed_multiprocess(args, bed_wrapper)
            else:
                vp = ViewPat(input_file, args.out_path, gr, args.strict,
                             args.sub_sample, bed_wrapper, args.min_len)
                vp.view_pat(args.awk_engine)
        elif input_file.endswith('.unq.gz'):
            grs = bed_wrapper.iter_grs() if bed_wrapper else [gr]
            for cur_gr in grs:
                ViewUnq(input_file, args.out_path, cur_gr, args.inflate).view()
        else:
            # a single formatted message: passing the path as a second
            # positional argument makes the error print as a tuple
            raise IllegalArgumentError(f'Unknown input format: {input_file}')

    except BrokenPipeError:
        # Python flushes standard streams on exit; redirect remaining output
        # to devnull to avoid another BrokenPipeError at shutdown
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)  # Python exits with error code 1 on EPIPE