def main():
    """
    Collapse beta file[s] to blocks binary file[s], of the same beta format.

    Parses the command line, validates the input '.beta' files and the
    output directory, loads the blocks file and runs collapse_process on
    every input file using a pool of args.threads workers.
    Unless args.force is set, the input list is first passed through
    filter_existing_files.
    """
    args = parse_args()
    files = args.input_files
    validate_file_list(files, '.beta')
    validate_out_dir(args.out_dir)
    if not args.force:
        files = filter_existing_files(files, args.out_dir, args.lbeta)

    # load blocks:
    df = load_blocks_file(args.blocks_file)
    is_nice, msg = is_block_file_nice(df)
    if not is_nice:
        # only reported here; the flag itself is forwarded to every worker
        b2b_log(msg)

    params = [(b, df, is_nice, args.lbeta, args.out_dir, args.bedGraph)
              for b in files]
    # The context manager terminates the pool if starmap raises, so the
    # worker processes are not leaked on failure.
    with Pool(args.threads) as p:
        p.starmap(collapse_process, params)
        p.close()
        p.join()
def main():
    """
    Generate homog files.
    Given a blocks file and pat[s], count the number of U,X,M reads
    for each block for each file

    :raises IllegalArgumentError: if nr_bits is not 8 or 16, if rlen < 3,
            if thresholds is not two comma separated numbers satisfying
            0 < low < high < 1, or if the blocks file is invalid
    """
    args = parse_args()
    if args.nr_bits not in (8, 16):
        raise IllegalArgumentError('nr_bits must be in {8, 16}')
    if args.rlen < 3:
        raise IllegalArgumentError('rlen must be >= 3')
    if args.thresholds is not None:
        th = args.thresholds.split(',')
        if len(th) != 2:
            raise IllegalArgumentError('Invalid thresholds')
        # non-numeric values are reported like any other invalid threshold,
        # instead of escaping as a bare ValueError from float()
        try:
            low, high = float(th[0]), float(th[1])
        except ValueError as err:
            raise IllegalArgumentError('Invalid thresholds') from err
        if not (1 > high > low > 0):
            raise IllegalArgumentError('Invalid thresholds')

    # make sure homog tool is valid:
    validate_local_exe(homog_tool)

    pats = args.input_files
    validate_file_list(pats, '.pat.gz')

    outdir, prefix = parse_outdir_prefix(args)

    # load blocks:
    blocks_df = load_blocks_file(args.blocks_file)
    is_nice, msg = is_block_file_nice(blocks_df)
    if not is_nice:
        homog_log(msg)
        raise IllegalArgumentError(f'Invalid blocks file: {args.blocks_file}')

    # process the pat files one by one, in sorted order
    for pat in sorted(pats):
        homog_process(pat, blocks_df, args, outdir, prefix)
def main():
    """
    Plot histogram of reads lengths (in sites) of pat file[s].
    Output to stdout the histogram values if requested.

    The parsed arguments are handed to multi_FragLen once the input
    paths were validated as '.pat.gz' files.
    """
    cli = parse_args()
    pat_paths = cli.pat_paths
    validate_file_list(pat_paths, '.pat.gz')
    multi_FragLen(cli)
def main():
    """
    Convert bed[.gz] file[s] to beta file[s].

    Expected bed format (tab-separated):
        chr    start    end    #meth    #total

    The input paths are validated (no suffix is forced) and the parsed
    arguments are passed on to bed2betas.
    """
    cli = parse_args()
    bed_paths = cli.bed_paths
    validate_file_list(bed_paths)
    bed2betas(cli)
def main():
    """
    Mix samples from K different pat files.
    Output a single mixed pat.gz[.csi] file - sorted, bgzipped and indexed -
    with an informative name.
    """
    args = parse_args()
    # Force the full '.pat.gz' suffix (with the leading dot), so that names
    # such as 'foopat.gz' are not accepted as pat files.
    # The third argument (2) is presumably the minimal number of input
    # files - confirm against validate_file_list.
    validate_file_list(args.pat_files, '.pat.gz', 2)
    mult_mix(args)
def main():
    """
    Compare between pairs of beta files, by plotting a 2d histogram
    for every pair.
    Sites with low coverage (< cov_thresh argument) are dropped,
    for performance and robustness.

    At least two '.beta' files must be given.
    """
    cli = parse_args()
    beta_paths = cli.betas
    validate_file_list(beta_paths, '.beta', min_len=2)
    compare_all_paires(cli)
def main(args):
    """
    Print the genomic region, and then for every (unique) input pat file
    print its name followed by the PatVis results.

    :param args: parsed arguments; args.input_files is the list of
                 '.pat.gz' files
    """
    validate_file_list(args.input_files, '.pat.gz')

    # duplicated paths are visited once, in their original order
    unique_pats = drop_dup_keep_order(args.input_files)

    region = GenomicRegion(args)
    print(region)

    for path in unique_pats:
        # header line: file name without directory and extension
        base = op.basename(path)
        print(splitextgz(base)[0])
        vis = PatVis(args, path)
        vis.print_results()
def main():
    """
    Convert beta file[s] to Illumina-450K format.

    Output: a csv file with ~480K rows, for the ~480K Illumina sites,
            and with columns corresponding to the beta files.
            All values are in range [0, 1], or NA.
    Only works for hg19.
    """
    cli = parse_args()
    beta_paths = cli.input_files
    validate_file_list(beta_paths, '.beta')
    betas2csv(cli)
def validate_args(self):
    """
    Validate the parameters stored on this object.

    Checks the integer parameters, the [0.0, 1.0] fractions, the
    only_hyper / only_hypo combination and the input files.
    Side effects: blocks_path and groups_file are replaced by their
    absolute paths, and if beta_list_file is given, self.betas is set
    to the (non empty) lines of that file.

    :raises IllegalArgumentError: on the first invalid parameter
    """
    # validate integers
    if self.min_cpg < 0:
        raise IllegalArgumentError('min_cpg must be non negative')
    if self.max_cpg < 1:
        raise IllegalArgumentError('max_cpg must larger than 0')
    if self.min_bp < 0:
        raise IllegalArgumentError('min_bp must be non negative')
    if self.max_bp < 2:
        raise IllegalArgumentError('max_bp must larger than 1')
    if self.chunk_size < 1:
        # message agrees with the check: 1 is a valid chunk size
        raise IllegalArgumentError('chunk_size must larger than 0')

    # validate the [0.0, 1.0] fractions
    low, high = 0.0, 1.0
    for key in ('na_rate_tg', 'na_rate_bg', 'delta', 'tg_quant',
                'bg_quant', 'unmeth_thresh', 'meth_thresh',
                'unmeth_mean_thresh', 'meth_mean_thresh'):
        # val, low and high must be bound before the message is formatted,
        # otherwise an invalid value ends in a NameError
        val = getattr(self, key)
        if not (high >= val >= low):
            eprint(f'Invalid value for {key} ({val}): '
                   f'must be in [{low}, {high}]')
            raise IllegalArgumentError()

    # validate hyper hypo:
    if self.only_hyper and self.only_hypo:
        eprint('at most one of (only_hyper, only_hypo) can be specified')
        raise IllegalArgumentError()

    # validate input files
    for key in ('blocks_path', 'groups_file'):
        val = getattr(self, key)
        if val is None:
            eprint(f'[wt fm] missing required parameter: {key}')
            raise IllegalArgumentError()
        validate_single_file(val)
        # change path to absolute path
        setattr(self, key, op.abspath(val))

    # validate betas: exactly one of (betas, beta_list_file) must be set
    if (self.betas is None) == (self.beta_list_file is None):
        eprint('[wt fm] Exactly one of the following must be specified: '
               'betas, beta_list_file')
        raise IllegalArgumentError()

    if self.beta_list_file:
        validate_single_file(self.beta_list_file)
        with open(self.beta_list_file, 'r') as f:
            # skip blank lines, so they are not treated as empty paths
            self.betas = [line.strip() for line in f if line.strip()]
    validate_file_list(self.betas)
def main():
    """
    Convert beta file[s] to bigwig file[s].
    Assuming bedGraphToBigWig is installed and in PATH.

    If check_executable does not find bedGraphToBigWig,
    nothing is converted.
    """
    cli = parse_args()
    validate_file_list(cli.beta_paths, '.beta')

    if check_executable('bedGraphToBigWig', verbose=True):
        converter = BetaToBigWig(cli)
        for beta_path in cli.beta_paths:
            converter.run_beta_to_bw(beta_path)
def groups_load_wrap(groups_file, betas):
    """
    Build the groups table for a list of beta files.

    :param groups_file: path of a groups file, or None
    :param betas: list of beta files
    :return: the groups table, with a 'full_path' column added by
             match_prefix_to_bin. When groups_file is None, a dummy table
             is generated, in which every file is its own group.
    """
    if groups_file is None:
        # no groups file: generate a dummy one for all the input files,
        # after dropping duplicated files (original order is kept)
        betas = drop_dup_keep_order(betas.copy())
        stems = [op.splitext(op.basename(path))[0] for path in betas]
        table = pd.DataFrame(columns=['fname'], data=stems)
        table['group'] = table['fname']
    else:
        validate_single_file(groups_file)
        validate_file_list(betas)
        table = load_gfile_helper(groups_file)

    full_paths = match_prefix_to_bin(table['fname'], betas, '.beta')
    table['full_path'] = full_paths
    return table
def merge_betas(betas, opath):
    """
    Merge all betas by summing their values element-wise,
    while keeping the dimensions

    :param betas: list of beta files
    :param opath: merged beta file
    :return: the merged data, as returned by trim_to_uint8
    """
    validate_file_list(betas, force_suff='.beta')

    # Accumulate in a wide integer type. The np.int alias was removed in
    # NumPy 1.24; the builtin int selects the same dtype.
    data = load_beta_data(betas[0]).astype(int)
    for b in betas[1:]:
        data += load_beta_data(b)

    # Trim / normalize to range [0, 256)
    data = trim_to_uint8(data)

    # Dump
    data.tofile(opath)
    return data
def parse_betas_input(args):
    """
    Parse user input to get the list of beta files to segment.
    Either args.betas is a list of beta files, or args.beta_file is
    a text file in which each line is a beta file. Blank lines and
    lines starting with '#' in that file are ignored.

    :param args: parsed arguments, with the betas and beta_file attributes
    :return: list of beta files
    :raises IllegalArgumentError: if neither input is given, or if
            args.beta_file lists no files
    """
    if args.betas:
        betas = args.betas
    elif args.beta_file:
        validate_single_file(args.beta_file)
        with open(args.beta_file, 'r') as f:
            # strip before testing for '#', so indented comment lines
            # are skipped as well
            stripped = (line.strip() for line in f)
            betas = [b for b in stripped if b and not b.startswith('#')]
        if not betas:
            raise IllegalArgumentError(
                f'no beta files found in file {args.beta_file}')
    else:
        # without this branch, betas is unbound below (UnboundLocalError)
        raise IllegalArgumentError(
            'no beta files were specified: use betas or beta_file')
    validate_file_list(betas)
    return betas
def __init__(self, pats, outpath, labels, args):
    """
    Store the inputs on the instance, and validate the pat files.

    :param pats: list of pat files; all must end with '.pat.gz'
    :param outpath: stored as self.outpath
    :param labels: stored as self.labels
    :param args: stored as self.args
    """
    self.args = args
    self.pats = pats
    # fail early on missing / wrongly named pat files
    validate_file_list(self.pats, force_suff='.pat.gz')
    self.outpath, self.labels = outpath, labels
def main(args):
    """
    Validate the input files and run BetaVis on the parsed arguments.
    A BrokenPipeError raised by BetaVis is passed on to
    catch_BrokenPipeError.

    :param args: parsed arguments; args.input_files is the list of inputs
    """
    # note: no suffix is forced on the input files
    input_files = args.input_files
    validate_file_list(input_files)

    try:
        BetaVis(args)
    except BrokenPipeError:
        catch_BrokenPipeError()
def load_group_file(groups_file, betas):
    """
    Load a groups file, and attach the beta file of each row.

    :param groups_file: path of the groups file
    :param betas: list of beta files
    :return: the table loaded by load_gfile_helper, with a 'full_path'
             column added by match_prefix_to_bin
    """
    validate_single_file(groups_file)
    validate_file_list(betas)

    groups = load_gfile_helper(groups_file)
    full_paths = match_prefix_to_bin(groups['fname'], betas, '.beta')
    groups['full_path'] = full_paths
    return groups