def split_bed_to_chrom_bed_parallel(bed_files, out_dir, parallel=12):
    """split a list of bed files into chromosome bed files, in parallel"""
    # queue up one chromosome-split task per input bed file
    work_queue = setup_multiprocessing_queue()
    for bed_file in bed_files:
        basename = os.path.basename(bed_file)
        # strip either extension to get the output prefix
        file_prefix = basename.split(".narrowPeak")[0].split(".bed")[0]
        work_queue.put([split_bed_to_chrom_bed, [out_dir, bed_file, file_prefix]])

    # drain the queue with the requested number of worker processes
    run_in_parallel(work_queue, parallel=parallel, wait=True)

    return None
def run_crawl(cr_job):
    """Run a crawl job by fanning its URL visits out to parallel workers.

    Copies only the needed crawl-agent attributes into a plain dict
    (worker processes require picklable arguments), builds a worker
    callable bound to that config, runs the URLs in parallel, then
    closes the job's index HTML log.

    Args:
        cr_job: crawl job carrying `crawl_agent`, `url_tuples`,
            `max_parallel_procs`, and `index_html_log`.
    """
    cr_agent = cr_job.crawl_agent
    url_tuples = cr_job.url_tuples
    # only copy the variables that'll be used by the agent;
    # parallelization requires picklable variables
    wanted_keys = (
        'fc_fontdebug', 'post_visit_func', 'timeout', 'binary_path',
        'use_mitm_proxy', 'mitm_proxy_logs', 'cmd_line_options', 'main_js',
        'casper_client_js', 'screenshot', 'job_dir', 'index_html_log',
        'type', 'crawl_id')
    # dict comprehension instead of dict([(k, v) for ...]) (ruff C404)
    cfg_dict = {key: cr_agent.__dict__[key]
                for key in wanted_keys if key in cr_agent.__dict__}
    worker = partial(crawl_worker, cfg_dict)
    parallelize.run_in_parallel(url_tuples, worker, cr_job.max_parallel_procs)
    lp.close_index_html(cr_job.index_html_log)
def bin_regions_parallel(bed_files, out_dir, chromsizes, bin_size=200, stride=50, final_length=1000, parallel=12):
    """bin in parallel"""
    # one sharded-binning task per input bed file
    work_queue = setup_multiprocessing_queue()
    for bed_file in bed_files:
        basename = os.path.basename(bed_file)
        file_prefix = basename.split(".narrowPeak")[0].split(".bed")[0]
        out_prefix = "{}/{}".format(out_dir, file_prefix)
        task_args = [
            bed_file,
            out_prefix,
            bin_size,
            stride,
            final_length,
            chromsizes,
            "naive"]
        work_queue.put([bin_regions_sharded, task_args])

    # run all binning tasks with the requested parallelism
    run_in_parallel(work_queue, parallel=parallel, wait=True)

    return None
def parse_crawl_logs(path, no_of_procs=16):
    """Parse every *.txt crawl log under `path` using parallel workers."""
    txt_logs = fu.gen_find_files("*.txt", path)
    # bind the dump function once; each worker parses one log file
    worker = partial(parse_crawl_log, dump_fun=dump_json_and_html)
    parallelize.run_in_parallel(txt_logs, worker, no_of_procs)
    wl_log.info("Worker processes are finished, will generate index")
def generate_h5_datasets(
        positives_bed_file,
        ref_fasta,
        chromsizes,
        label_files,
        signal_files,
        prefix,
        work_dir,
        bin_size=200,
        stride=50,
        final_length=1000,
        superset_bed_file=None,
        reverse_complemented=False,
        genome_wide=False,
        parallel=24,
        tmp_dir=".",
        normalize_signals=False):
    """Generate a full h5 dataset.

    Pipeline: select negatives, split all bed files by chromosome, bin the
    chromosome files into fixed-size regions, build one h5 file per binned
    bed file (in parallel), then tag each h5 with its chromosome and
    example type.

    Args:
        positives_bed_file: bed file of positive regions.
        ref_fasta: reference fasta (forwarded to setup_h5_dataset).
        chromsizes: chromosome sizes file.
        label_files: label inputs forwarded to setup_h5_dataset.
        signal_files: signal inputs forwarded to setup_h5_dataset.
        prefix: kept for interface compatibility (not used here).
        work_dir: output directory; h5 files land in {work_dir}/h5.
        bin_size, stride, final_length: binning parameters.
        superset_bed_file: optional superset for negative selection.
        reverse_complemented: forwarded to setup_h5_dataset.
        genome_wide: if True, also include genome-wide negatives.
        parallel: number of worker processes.
        tmp_dir: scratch directory for intermediate files.
        normalize_signals: kept for interface compatibility (not used here).

    Returns:
        None
    """
    # first select negatives
    # (removed a no-op `if True:` wrapper around this body)
    training_negatives_bed_file, genomewide_negatives_bed_file = setup_negatives(
        positives_bed_file,
        superset_bed_file,
        chromsizes,
        bin_size=bin_size,
        stride=stride,
        genome_wide=genome_wide,
        tmp_dir=tmp_dir)

    # collect the bed files (genome-wide negatives only when requested)
    if genome_wide:
        all_bed_files = [
            positives_bed_file,
            training_negatives_bed_file,
            genomewide_negatives_bed_file]
    else:
        all_bed_files = [
            positives_bed_file,
            training_negatives_bed_file]

    # split to chromosomes
    chrom_dir = "{}/by_chrom".format(tmp_dir)
    # os.makedirs instead of shelling out to `mkdir -p`
    os.makedirs(chrom_dir, exist_ok=True)
    split_bed_to_chrom_bed_parallel(
        all_bed_files, chrom_dir, parallel=parallel)

    # split to equally sized bin groups
    chrom_files = glob.glob("{}/*.bed.gz".format(chrom_dir))
    bin_dir = "{}/bin-{}.stride-{}".format(tmp_dir, bin_size, stride)
    os.makedirs(bin_dir, exist_ok=True)
    bin_regions_parallel(
        chrom_files, bin_dir, chromsizes,
        bin_size=bin_size, stride=stride, parallel=parallel)

    # grab all of these and process in parallel
    h5_dir = "{}/h5".format(work_dir)
    os.makedirs(h5_dir, exist_ok=True)
    chrom_bed_files = glob.glob("{}/*.filt.bed.gz".format(bin_dir))
    # BUGFIX: log the count of files, not the full list
    logging.info("Found {} bed files".format(len(chrom_bed_files)))
    h5_queue = setup_multiprocessing_queue()
    for bed_file in chrom_bed_files:
        # renamed from `prefix` to avoid shadowing the function parameter
        file_prefix = os.path.basename(bed_file).split(
            ".bed")[0].split(".narrowPeak")[0]
        h5_file = "{}/{}.h5".format(h5_dir, file_prefix)
        if os.path.isfile(h5_file):
            # skip files that were already generated
            continue
        parallel_tmp_dir = "{}/{}_tmp".format(tmp_dir, file_prefix)
        process_args = [
            bed_file,
            ref_fasta,
            chromsizes,
            h5_file,
            label_files,
            signal_files,
            bin_size,
            stride,
            final_length,
            reverse_complemented,
            "features",
            parallel_tmp_dir]
        h5_queue.put([setup_h5_dataset, process_args])

    # run the queue
    run_in_parallel(h5_queue, parallel=parallel, wait=True)

    # also tag each file with the chromosome and positives, negatives, etc.
    # NOTE(review): assumes h5 filenames follow a dotted convention where
    # field -4 is the chromosome and field -5 the example type — confirm
    # against setup_h5_dataset's naming.
    h5_dir = "{}/h5".format(work_dir)
    h5_files = glob.glob("{}/*h5".format(h5_dir))
    for h5_file in h5_files:
        chrom = os.path.basename(h5_file).split(".")[-4]
        example_type = os.path.basename(h5_file).split(".")[-5]
        if example_type == "master":
            example_type = "positives"
        with h5py.File(h5_file, "a") as hf:
            hf["/"].attrs[_CHROM_TAG] = [chrom]
            hf["/"].attrs[_EXAMPLE_TYPE_TAG] = example_type

    return None