def file_logger(namespace, config_file, log_file, log_path_key=None):
    CONFIG = cl.load_config(config_file)
    if not log_path_key:
        log_path = CONFIG['log_dir'] + '/' + log_file
    else:
        log_path = CONFIG[log_path_key] + '/' + log_file
    logger = logging.getLogger(namespace)
    logger.setLevel(logging.DEBUG)
    # file handler:
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.INFO)
    # console handler
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    # formatter
    formatter = logging.Formatter("%(asctime)s (%(levelname)s) : %(message)s")
    fh.setFormatter(formatter)
    # add handlers to logger
    logger.addHandler(ch)
    logger.addHandler(fh)
    return logger
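# Usage sketch for file_logger above (the file names are illustrative, not
# from the original source; assumes cl.load_config reads a YAML file that
# contains a 'log_dir' key).
log = file_logger("scilifelab.demo", "config.yaml", "demo.log")
log.info("written to both the log file and the console")  # at or above the FileHandler's INFO threshold
log.debug("written to the console only")                   # below INFO, so the FileHandler drops it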
def _pair_samples_with_pipelines(run_info_yaml, config):
    """Map samples defined in input file to pipelines to run.
    """
    samples = config_utils.load_config(run_info_yaml)
    if isinstance(samples, dict):
        resources = samples.pop("resources")
        samples = samples["details"]
    else:
        resources = {}
    ready_samples = []
    for sample in samples:
        if "files" in sample:
            del sample["files"]
        # add any resources to this item to recalculate global configuration
        usample = copy.deepcopy(sample)
        usample.pop("algorithm", None)
        if "resources" not in usample:
            usample["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in usample["resources"]:
                usample["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    usample["resources"][prog][key] = val
        config = config_utils.update_w_custom(config, usample)
        sample["resources"] = {}
        ready_samples.append(sample)
    paired = [(x, _get_pipeline(x)) for x in ready_samples]
    d = defaultdict(list)
    for x in paired:
        d[x[1]].append([x[0]])
    return d, config
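# Illustrative input shape for _pair_samples_with_pipelines above -- the YAML
# layout is inferred from the code, not taken from bcbio documentation: an
# optional top-level "resources" block plus per-sample entries under "details".
#
#   resources:
#     gatk:
#       jvm_opts: ["-Xmx4g"]
#   details:
#     - description: S1
#       algorithm: {aligner: bwa}
#     - description: S2
#
# The function merges the global resources into each sample, recalculates the
# system config with config_utils.update_w_custom, and returns a defaultdict
# that groups single-sample lists by the pipeline from _get_pipeline.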
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None):
    config = load_config(config_file)
    galaxy_api = (GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
                  if "galaxy_api_key" in config else None)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    base_folder_name = "%s_%s" % (fc_date, fc_name)
    run_details = lims_run_details(run_info, base_folder_name)
    for (library_name, access_role, dbkey, lane, bc_id, name, desc,
         local_name, fname_out) in run_details:
        library_id = (get_galaxy_library(library_name, galaxy_api)
                      if library_name else None)
        upload_files = list(select_upload_files(local_name, bc_id, fc_dir,
                                                analysis_dir, config, fname_out))
        if len(upload_files) > 0:
            print lane, bc_id, name, desc, library_name
            print "Creating storage directory"
            if library_id:
                folder, cur_galaxy_files = get_galaxy_folder(library_id, base_folder_name,
                                                             name, desc, galaxy_api)
            else:
                cur_galaxy_files = []
            store_dir = move_to_storage(lane, bc_id, base_folder_name, upload_files,
                                        cur_galaxy_files, config, config_file, fname_out)
            if store_dir and library_id:
                print "Uploading directory of files to Galaxy"
                print galaxy_api.upload_directory(library_id, folder['id'],
                                                  store_dir, dbkey, access_role)
    if galaxy_api and not run_info_yaml:
        add_run_summary_metrics(analysis_dir, galaxy_api)
def _pair_samples_with_pipelines(run_info_yaml, config):
    """Map samples defined in input file to pipelines to run.
    """
    samples = config_utils.load_config(run_info_yaml)
    if isinstance(samples, dict):
        resources = samples.pop("resources")
        samples = samples["details"]
    else:
        resources = {}
    ready_samples = []
    for sample in samples:
        if "files" in sample:
            del sample["files"]
        # add any resources to this item to recalculate global configuration
        usample = copy.deepcopy(sample)
        usample.pop("algorithm", None)
        if "resources" not in usample:
            usample["resources"] = {}
        for prog, pkvs in resources.iteritems():
            if prog not in usample["resources"]:
                usample["resources"][prog] = {}
            for key, val in pkvs.iteritems():
                usample["resources"][prog][key] = val
        config = config_utils.update_w_custom(config, usample)
        sample["resources"] = {}
        ready_samples.append(sample)
    paired = [(x, _get_pipeline(x)) for x in ready_samples]
    d = defaultdict(list)
    for x in paired:
        d[x[1]].append([x[0]])
    return d, config
def main(bam_file, config_file=None, chrom='all', start=0, end=None,
         outfile=None, normalize=False, use_tempfile=False):
    if config_file:
        config = load_config(config_file)
    else:
        config = {"program": {"ucsc_bigwig": "wigToBigWig"}}
    if outfile is None:
        outfile = "%s.bigwig" % os.path.splitext(bam_file)[0]
    if start > 0:
        start = int(start) - 1
    if end is not None:
        end = int(end)
    regions = [(chrom, start, end)]
    if os.path.abspath(bam_file) == os.path.abspath(outfile):
        sys.stderr.write("Bad arguments, input and output files are the same.\n")
        sys.exit(1)
    if not (os.path.exists(outfile) and os.path.getsize(outfile) > 0):
        if use_tempfile:
            # Use a temp file to avoid any possibility of not having write permission
            out_handle = tempfile.NamedTemporaryFile(delete=False)
            wig_file = out_handle.name
        else:
            wig_file = "%s.wig" % os.path.splitext(outfile)[0]
            out_handle = open(wig_file, "w")
        with closing(out_handle):
            chr_sizes, wig_valid = write_bam_track(bam_file, regions, config,
                                                   out_handle, normalize)
        try:
            if wig_valid:
                convert_to_bigwig(wig_file, chr_sizes, config, outfile)
        finally:
            os.remove(wig_file)
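# A minimal sketch of the convert_to_bigwig step that main() above relies on.
# This is an assumption for illustration -- the real helper lives elsewhere in
# the script; only the UCSC command line (`wigToBigWig in.wig chrom.sizes
# out.bigwig`) and the config key used above are taken from the surrounding
# code. chr_sizes is assumed to be an iterable of (name, length) pairs.
import os
import subprocess

def convert_to_bigwig_sketch(wig_file, chr_sizes, config, bw_file):
    size_file = "%s-sizes.txt" % os.path.splitext(wig_file)[0]
    with open(size_file, "w") as out_handle:
        # wigToBigWig wants a two-column chromosome sizes file
        for chrom, size in chr_sizes:
            out_handle.write("%s\t%s\n" % (chrom, size))
    try:
        subprocess.check_call([config["program"]["ucsc_bigwig"],
                               wig_file, size_file, bw_file])
    finally:
        os.remove(size_file)
    return bw_file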
def test_programs(self):
    """Identify programs and versions used in analysis.
    """
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    print programs.get_versions(config)
def test_1_parallel_vcf_combine(self):
    """Parallel combination of VCF files, split by chromosome.
    """
    files = [os.path.join(self.var_dir, "S1-variants.vcf"),
             os.path.join(self.var_dir, "S2-variants.vcf")]
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.automated_dir, workdir))
        config["algorithm"] = {}
        region_dir = os.path.join(self.var_dir, "S1_S2-combined-regions")
        if os.path.exists(region_dir):
            shutil.rmtree(region_dir)
        if os.path.exists(self.combo_file):
            os.remove(self.combo_file)
        with prun.start({"type": "local", "cores": 1}, [[config]], config) as run_parallel:
            vcfutils.parallel_combine_variants(files, self.combo_file, ref_file,
                                               config, run_parallel)
        for fname in files:
            if os.path.exists(fname + ".gz"):
                subprocess.check_call(["gunzip", fname + ".gz"])
            if os.path.exists(fname + ".gz.tbi"):
                os.remove(fname + ".gz.tbi")
def test_1_parallel_vcf_combine(self):
    """Parallel combination of VCF files, split by chromosome.
    """
    var_dir = os.path.join(self.data_dir, "variants")
    files = [os.path.join(var_dir, "S1-variants.vcf"),
             os.path.join(var_dir, "S2-variants.vcf")]
    out_file = os.path.join(var_dir, "S1_S2-combined.vcf")
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    run_parallel = parallel_runner({"type": "local", "cores": 1}, {}, config)
    region_dir = os.path.join(var_dir, "S1_S2-combined-regions")
    if os.path.exists(region_dir):
        shutil.rmtree(region_dir)
    if os.path.exists(out_file):
        os.remove(out_file)
    vcfutils.parallel_combine_variants(files, out_file, ref_file, config,
                                       run_parallel)
def test_1_parallel_vcf_combine(self):
    """Parallel combination of VCF files, split by chromosome.
    """
    from bcbio.variation import vcfutils
    files = [os.path.join(self.var_dir, "S1-variants.vcf"),
             os.path.join(self.var_dir, "S2-variants.vcf")]
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.automated_dir, workdir))
        config["algorithm"] = {}
        region_dir = os.path.join(self.var_dir, "S1_S2-combined-regions")
        if os.path.exists(region_dir):
            shutil.rmtree(region_dir)
        if os.path.exists(self.combo_file):
            os.remove(self.combo_file)
        reqs = {"type": "local", "cores": 1}
        with prun.start(reqs, [[config]], config) as run_parallel:
            vcfutils.parallel_combine_variants(files, self.combo_file,
                                               self.ref_file, config, run_parallel)
        for fname in files:
            if os.path.exists(fname + ".gz"):
                subprocess.check_call(["gunzip", fname + ".gz"])
            if os.path.exists(fname + ".gz.tbi"):
                os.remove(fname + ".gz.tbi")
def test_programs(self):
    """Identify programs and versions used in analysis.
    """
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.automated_dir, workdir))
        print programs._get_versions(config)
def main(config_file, month, year):
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"])
    smonth, syear = (month - 1, year) if month > 1 else (12, year - 1)
    start_date = datetime(syear, smonth, 15, 0, 0, 0)
    # last day calculation useful if definition of month is
    # from first to last day instead of 15th-15th
    #(_, last_day) = calendar.monthrange(year, month)
    end_date = datetime(year, month, 14, 23, 59, 59)
    out_file = "%s_%s" % (start_date.strftime("%b"),
                          end_date.strftime("%b-%Y-sequencing.csv"))
    with open(out_file, "w") as out_handle:
        writer = csv.writer(out_handle)
        writer.writerow(["Date", "Product", "Payment", "Researcher", "Lab",
                         "Email", "Project", "Sample", "Description", "Genome",
                         "Flowcell", "Lane", "Received", "Notes"])
        for s in galaxy_api.sqn_report(start_date.isoformat(),
                                       end_date.isoformat()):
            f_parts = s["sqn_run"]["run_folder"].split("_")
            flowcell = "_".join([f_parts[0], f_parts[-1]])
            writer.writerow([s["sqn_run"]["date"], s["sqn_type"],
                             s["project"]["payment_(fund_number)"],
                             s["project"]["researcher"],
                             s["project"]["lab_association"],
                             s["project"]["email"],
                             s["project"]["project_name"],
                             s["name"], s["description"], s["genome_build"],
                             flowcell, s["sqn_run"]["lane"],
                             _received_date(s["events"]),
                             s["sqn_run"]["results_notes"]])
def main(config_file, fc_dir=None, run_info_yaml=None, numcores=None,
         paralleltype=None, queue=None, scheduler=None, upgrade=None,
         profile=None, workflow=None, inputs=None):
    work_dir = os.getcwd()
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    paralleltype, numcores = _get_cores_and_type(config, fc_dir, run_info_yaml,
                                                 numcores, paralleltype)
    parallel = {"type": paralleltype, "cores": numcores,
                "scheduler": scheduler, "queue": queue,
                "profile": profile, "module": "bcbio.distributed"}
    if parallel["type"] in ["local", "messaging-main"]:
        if numcores is None:
            config["algorithm"]["num_cores"] = numcores
        run_main(config, config_file, work_dir, parallel, fc_dir, run_info_yaml)
    elif parallel["type"] == "messaging":
        parallel["task_module"] = "bcbio.distributed.tasks"
        args = [config_file, fc_dir]
        if run_info_yaml:
            args.append(run_info_yaml)
        messaging.run_and_monitor(config, config_file, args, parallel)
    elif parallel["type"] == "ipython":
        assert parallel["queue"] is not None, \
            "IPython parallel requires a specified queue (-q)"
        run_main(config, config_file, work_dir, parallel, fc_dir, run_info_yaml)
    else:
        raise ValueError("Unexpected type of parallel run: %s" % parallel["type"])
def test_1_parallel_vcf_combine(self, global_config):
    """Parallel combination of VCF files, split by chromosome.
    """
    from bcbio.variation import vcfutils
    files = [os.path.join(self.var_dir, "S1-variants.vcf"),
             os.path.join(self.var_dir, "S2-variants.vcf")]
    config = load_config(global_config)
    config["algorithm"] = {}
    region_dir = os.path.join(self.var_dir, "S1_S2-combined-regions")
    if os.path.exists(region_dir):
        shutil.rmtree(region_dir)
    if os.path.exists(self.combo_file):
        os.remove(self.combo_file)
    reqs = {"type": "local", "cores": 1}
    with prun.start(reqs, [[config]], config) as run_parallel:
        vcfutils.parallel_combine_variants(files, self.combo_file,
                                           self.ref_file, config, run_parallel)
    for fname in files:
        if os.path.exists(fname + ".gz"):
            subprocess.check_call(["gunzip", fname + ".gz"])
        if os.path.exists(fname + ".gz.tbi"):
            os.remove(fname + ".gz.tbi")
def test_programs(self, data_dir):
    """Identify programs and versions used in analysis.
    """
    from bcbio.provenance import programs
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(data_dir, workdir))
        print programs._get_versions(config)
def main(config_file, fc_dir=None, run_info_yaml=None, numcores=None,
         paralleltype=None, profile="default"):
    work_dir = os.getcwd()
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    paralleltype, numcores = _get_cores_and_type(config, fc_dir, run_info_yaml,
                                                 numcores, paralleltype)
    parallel = {"type": paralleltype, "cores": numcores,
                "profile": profile, "module": "bcbio.distributed"}
    if parallel["type"] in ["local", "messaging-main"]:
        if numcores is None:
            config["algorithm"]["num_cores"] = numcores
        run_main(config, config_file, work_dir, parallel, fc_dir, run_info_yaml)
    elif parallel["type"] == "messaging":
        parallel["task_module"] = "bcbio.distributed.tasks"
        args = [config_file, fc_dir]
        if run_info_yaml:
            args.append(run_info_yaml)
        messaging.run_and_monitor(config, config_file, args, parallel)
    elif parallel["type"] == "ipython":
        run_main(config, config_file, work_dir, parallel, fc_dir, run_info_yaml)
    else:
        raise ValueError("Unexpected type of parallel run: %s" % parallel["type"])
def coverage(align_bam):
    config_file = '/home/kwoklab-user/nextgen-python2.7/bcbio-nextgen/bcbio_system.yaml'
    ref_file = '/media/KwokRaid01/biodata/genomes/Hsapiens/GRCh37/seq/GRCh37.fa'
    bed_file = '/media/KwokRaid02/nina/ISMB2013/bed_files/capture_regions/130214_HG19_Cardiac_RD_EZ.GRCh37.target.bed'
    config = load_config(config_file)
    broad_runner = broad.runner_from_config(config)
    # broad_runner.run_fn("picard_index_ref", ref_file)
    broad_runner.run_fn("picard_index", align_bam)
    base, _ = os.path.splitext(os.path.basename(align_bam))
    work_dir = os.path.dirname(align_bam)
    out_file = os.path.join(work_dir, base)
    params = ["-R", ref_file]
    # with file_transaction(out_file) as tx_out_file:
    params += ["-T", "DepthOfCoverage", "-o", out_file, "-I", align_bam,
               "-L", bed_file]
    broad_runner.run_gatk(params)
    return out_file
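# The GATK invocation that coverage() above assembles corresponds roughly to
# this command line (illustrative paths):
#
#   java -jar GenomeAnalysisTK.jar -T DepthOfCoverage \
#        -R GRCh37.fa -I sample.bam -L targets.bed -o sample_prefix
#
# DepthOfCoverage treats the -o argument as an output prefix and writes several
# summary files alongside it, which is why out_file above has no extension.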
def main(local_config, post_config_file=None, process_msg=True, store_msg=True,
         qseq=True, fastq=True):
    config = load_config(local_config)
    log_handler = create_log_handler(config)
    with log_handler.applicationbound():
        search_for_new(config, local_config, post_config_file,
                       process_msg, store_msg, qseq, fastq)
def load_couch_server(config_file):
    """loads couch server with settings specified in 'config_file'"""
    try:
        db_conf = cl.load_config(config_file)["statusdb"]
        url = db_conf["username"] + ":" + db_conf["password"] + "@" + \
              db_conf["url"] + ":" + str(db_conf["port"])
        couch = couchdb.Server("http://" + url)
        return couch
    except KeyError:
        raise RuntimeError('"statusdb" section missing from configuration file.')
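# Usage sketch for load_couch_server above. The YAML layout is inferred from
# the keys the code reads; the file and database names are hypothetical.
#
#   statusdb:
#     username: user
#     password: secret
#     url: localhost
#     port: 5984
#
# couch = load_couch_server("settings.yaml")
# db = couch["samples"]    # python-couchdb: index the Server by database name
# for doc_id in db:        # iterating a Database yields document ids
#     print doc_id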
def test_2_vcf_exclusion(self):
    """Exclude samples from VCF files.
    """
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    out_file = utils.append_stem(self.combo_file, "-exclude")
    to_exclude = ["S1"]
    if os.path.exists(out_file):
        os.remove(out_file)
    vcfutils.exclude_samples(self.combo_file, out_file, to_exclude,
                             ref_file, config)
def test_2_vcf_exclusion(self, global_config):
    """Exclude samples from VCF files.
    """
    from bcbio.variation import vcfutils
    config = load_config(global_config)
    config["algorithm"] = {}
    out_file = utils.append_stem(self.combo_file, "-exclude")
    to_exclude = ["S1"]
    if os.path.exists(out_file):
        os.remove(out_file)
    vcfutils.exclude_samples(self.combo_file, out_file, to_exclude,
                             self.ref_file, config)
def test_2_vcf_exclusion(self):
    """Exclude samples from VCF files.
    """
    fname = os.path.join(self.data_dir, "variants", "S1_S2-combined.vcf")
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    out_file = "%s-exclude%s" % os.path.splitext(fname)
    to_exclude = ["S1"]
    if os.path.exists(out_file):
        os.remove(out_file)
    vcfutils.exclude_samples(fname, out_file, to_exclude, ref_file, config)
def test_2_vcf_exclusion(self):
    """Exclude samples from VCF files.
    """
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.automated_dir, workdir))
        config["algorithm"] = {}
        out_file = utils.append_stem(self.combo_file, "-exclude")
        to_exclude = ["S1"]
        if os.path.exists(out_file):
            os.remove(out_file)
        vcfutils.exclude_samples(self.combo_file, out_file, to_exclude,
                                 ref_file, config)
def test_2_vcf_exclusion(self):
    """Exclude samples from VCF files.
    """
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.data_dir, workdir))
        config["algorithm"] = {}
        out_file = utils.append_stem(self.combo_file, "-exclude")
        to_exclude = ["S1"]
        if os.path.exists(out_file):
            os.remove(out_file)
        vcfutils.exclude_samples(self.combo_file, out_file, to_exclude,
                                 ref_file, config)
def test_2_vcf_exclusion(self):
    """Exclude samples from VCF files.
    """
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.automated_dir, workdir))
        config["algorithm"] = {}
        out_file = utils.append_stem(self.combo_file, "-exclude")
        to_exclude = ["S1"]
        if os.path.exists(out_file):
            os.remove(out_file)
        vcfutils.exclude_samples(self.combo_file, out_file, to_exclude,
                                 self.ref_file, config)
def analyze_locally(dname, post_config_file, fastq_dir):
    """Run analysis directly on the local machine.
    """
    assert fastq_dir is not None
    post_config = load_config(post_config_file)
    analysis_dir = os.path.join(fastq_dir, os.pardir, "analysis")
    utils.safe_makedir(analysis_dir)
    with utils.chdir(analysis_dir):
        prog = "bcbio_nextgen.py"
        cl = [prog, post_config_file, dname]
        run_yaml = os.path.join(dname, "run_info.yaml")
        if os.path.exists(run_yaml):
            cl.append(run_yaml)
        subprocess.check_call(cl)
def test_2_vcf_exclusion(self):
    """Exclude samples from VCF files.
    """
    from bcbio.variation import vcfutils
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.automated_dir, workdir))
        config["algorithm"] = {}
        out_file = utils.append_stem(self.combo_file, "-exclude")
        to_exclude = ["S1"]
        if os.path.exists(out_file):
            os.remove(out_file)
        vcfutils.exclude_samples(self.combo_file, out_file, to_exclude,
                                 self.ref_file, config)
def test_3_vcf_split_combine(self, global_config):
    """Split a VCF file into SNPs and indels, then combine back together.
    """
    from bcbio.variation import vcfutils
    config = load_config(global_config)
    config["algorithm"] = {}
    fname = os.path.join(self.var_dir, "S1-variants.vcf")
    snp_file, indel_file = vcfutils.split_snps_indels(fname, self.ref_file, config)
    merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
    vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                   self.ref_file, config)
    for f in [snp_file, indel_file, merge_file]:
        self._remove_vcf(f)
def test_2_vcf_exclusion(self):
    """Exclude samples from VCF files.
    """
    # Stay backwards compatible with 0.7.6 -- remove after the 0.7.7 release
    if prun is None:
        return
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    out_file = utils.append_stem(self.combo_file, "-exclude")
    to_exclude = ["S1"]
    if os.path.exists(out_file):
        os.remove(out_file)
    vcfutils.exclude_samples(self.combo_file, out_file, to_exclude,
                             ref_file, config)
def test_3_vcf_split_combine(self):
    """Split a VCF file into SNPs and indels, then combine back together.
    """
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.automated_dir, workdir))
        config["algorithm"] = {}
        ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
        fname = os.path.join(self.var_dir, "S1-variants.vcf")
        snp_file, indel_file = vcfutils.split_snps_indels(fname, ref_file, config)
        merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
        vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                       ref_file, config)
        for f in [snp_file, indel_file, merge_file]:
            self._remove_vcf(f)
def test_3_vcf_split_combine(self):
    """Split a VCF file into SNPs and indels, then combine back together.
    """
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.data_dir, workdir))
        config["algorithm"] = {}
        ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
        fname = os.path.join(self.var_dir, "S1-variants.vcf")
        snp_file, indel_file = vcfutils.split_snps_indels(fname, ref_file, config)
        merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
        vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                       ref_file, config)
        for f in [snp_file, indel_file, merge_file]:
            self._remove_vcf(f)
def test_2_vcf_exclusion(self):
    """Exclude samples from VCF files.
    """
    # Stay backwards compatible with 0.7.6 -- remove after the 0.7.7 release
    if prun is None:
        return
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.data_dir, workdir))
        config["algorithm"] = {}
        out_file = utils.append_stem(self.combo_file, "-exclude")
        to_exclude = ["S1"]
        if os.path.exists(out_file):
            os.remove(out_file)
        vcfutils.exclude_samples(self.combo_file, out_file, to_exclude,
                                 ref_file, config)
def test_3_vcf_split_combine(self):
    """Split a VCF file into SNPs and indels, then combine back together.
    """
    from bcbio.variation import vcfutils
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.automated_dir, workdir))
        config["algorithm"] = {}
        fname = os.path.join(self.var_dir, "S1-variants.vcf")
        snp_file, indel_file = vcfutils.split_snps_indels(fname, self.ref_file, config)
        merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
        vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                       self.ref_file, config)
        for f in [snp_file, indel_file, merge_file]:
            self._remove_vcf(f)
def main(config_file, queues=None, task_module=None, base_dir=None):
    if base_dir is None:
        base_dir = os.getcwd()
    if task_module is None:
        task_module = "bcbio.distributed.tasks"
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(base_dir, "log")
    signals.setup_logging.connect(celery_logger(config))
    setup_logging(config)
    logger.info("Starting distributed worker process: {0}".format(
        queues if queues else ""))
    with utils.chdir(base_dir):
        with utils.curdir_tmpdir() as work_dir:
            dirs = {"work": work_dir, "config": os.path.dirname(config_file)}
            with create_celeryconfig(task_module, dirs, config,
                                     os.path.abspath(config_file)):
                run_celeryd(work_dir, queues)
def main(config_file, fc_dir=None, run_info_yaml=None, numcores=None,
         paralleltype=None, queue=None, scheduler=None, upgrade=None,
         profile=None, workflow=None, inputs=None):
    work_dir = os.getcwd()
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    paralleltype, numcores = _get_cores_and_type(config, fc_dir, run_info_yaml,
                                                 numcores, paralleltype)
    parallel = {"type": paralleltype, "cores": numcores,
                "scheduler": scheduler, "queue": queue,
                "profile": profile, "module": "bcbio.distributed"}
    if parallel["type"] in ["local", "messaging-main"]:
        if numcores is None:
            config["algorithm"]["num_cores"] = numcores
        run_main(config, config_file, work_dir, parallel, fc_dir, run_info_yaml)
    elif parallel["type"] == "messaging":
        parallel["task_module"] = "bcbio.distributed.tasks"
        args = [config_file, fc_dir]
        if run_info_yaml:
            args.append(run_info_yaml)
        messaging.run_and_monitor(config, config_file, args, parallel)
    elif parallel["type"] == "ipython":
        assert parallel["queue"] is not None, \
            "IPython parallel requires a specified queue (-q)"
        assert parallel["scheduler"] is not None, \
            "IPython parallel requires a specified scheduler (-s)"
        run_main(config, config_file, work_dir, parallel, fc_dir, run_info_yaml)
    else:
        raise ValueError("Unexpected type of parallel run: %s" % parallel["type"])
def main(config_file, fc_dir):
    work_dir = os.getcwd()
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"])
    fc_name, fc_date = flowcell.parse_dirname(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = flowcell.get_fastq_dir(fc_dir)
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper,
                     ((i, fastq_dir, fc_name, fc_date, config, config_file)
                      for i in run_info["details"]))
        except:
            pool.terminate()
            raise
    else:
        map(_process_wrapper,
            ((i, fastq_dir, fc_name, fc_date, config, config_file)
             for i in run_info["details"]))
def test_1_parallel_vcf_combine(self):
    """Parallel combination of VCF files, split by chromosome.
    """
    files = [os.path.join(self.var_dir, "S1-variants.vcf"),
             os.path.join(self.var_dir, "S2-variants.vcf")]
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    run_parallel = parallel_runner({"type": "local", "cores": 1}, {}, config)
    region_dir = os.path.join(self.var_dir, "S1_S2-combined-regions")
    if os.path.exists(region_dir):
        shutil.rmtree(region_dir)
    if os.path.exists(self.combo_file):
        os.remove(self.combo_file)
    vcfutils.parallel_combine_variants(files, self.combo_file, ref_file,
                                       config, run_parallel)
    for fname in files:
        if os.path.exists(fname + ".gz"):
            subprocess.check_call(["gunzip", fname + ".gz"])
        if os.path.exists(fname + ".gz.tbi"):
            os.remove(fname + ".gz.tbi")
def main(config_file, fc_dir):
    work_dir = os.getcwd()
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper,
                     ((i, fastq_dir, fc_name, fc_date, config, config_file)
                      for i in run_info["details"]))
        except:
            pool.terminate()
            raise
    else:
        map(_process_wrapper,
            ((i, fastq_dir, fc_name, fc_date, config, config_file)
             for i in run_info["details"]))
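# _process_wrapper is not shown in these snippets. A plausible shape
# (hypothetical, for illustration only) is a tuple-unpacking shim, since
# multiprocessing.Pool.map passes exactly one positional argument to its
# callable -- which is why the two main() variants above bundle each lane's
# arguments into a tuple:
def _process_wrapper_sketch(args):
    lane_info, fastq_dir, fc_name, fc_date, config, config_file = args
    # ... delegate to the real per-lane processing function here ...
    return lane_info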
def count(bam_file):
    bed_file = '/media/KwokRaid02/nina/ISMB2013/bed_files/auto_exon.bed'
    ref_file = '/media/KwokRaid01/biodata/genomes/Hsapiens/GRCh37/seq/GRCh37.fa'
    config_file = '/home/kwoklab-user/nextgen-python2.7/bcbio-nextgen/bcbio_system.yaml'
    gene_file = '/home/kwoklab-user/nina/ISMB2013/bed_files/geneTrack.ensembl_mergedSorted_noChrMnumericCHR.txt'
    work_dir = '/media/KwokData02/ISMB2013-analysis'
    config = load_config(config_file)
    broad_runner = broad.runner_from_config(config)
    base, _ = os.path.splitext(os.path.basename(bam_file))
    # gtf_file = '/media/KwokRaid01/biodata/genomes/Hsapiens/GRCh37/ref-transcripts.gtf'
    # out_file = os.path.join(work_dir, base + ".counts")
    # htseq = 'htseq-count'
    # in_file = sam_to_querysort_sam(bam_file, config)
    # with file_transaction(out_file) as tmp_out_file:
    #     htseq_cmd = ("{htseq} --mode=union --stranded=no --type=exon "
    #                  "--idattr=gene_id {in_file} {gtf_file} > {tmp_out_file}")
    #     cmd = htseq_cmd.format(**locals())
    #     print cmd
    #     subprocess.check_call(cmd, shell=True)
    depth_of_coverage_file = os.path.join(work_dir, base + '.doc')
    params = ["-R", ref_file]
    params += ["-T", "DepthOfCoverage", "-o", depth_of_coverage_file,
               "-I", bam_file, "-L", bed_file, "-geneList", gene_file]
    broad_runner.run_gatk(params)
    # diagnose_file = os.path.join(work_dir, base + '.DiagnoseTargets.vcf')
    # params = ["-R", ref_file]
    # params += ["-T", "DiagnoseTargets", "-o", diagnose_file, "-I", bam_file,
    #            "-L", bed_file]
    # broad_runner.run_gatk(params)
    callable_file = os.path.join(work_dir, base + '.callable.bed')
    params = ["-R", ref_file]
    params += ["-T", "CallableLoci", "-o", callable_file, "-I", bam_file,
               "-L", bed_file]
    broad_runner.run_gatk(params)
    GCcontent_file = os.path.join(work_dir, base + '.GCcontent.bed')
    params = ["-R", ref_file]
    params += ["-T", "GCContentByInterval", "-o", GCcontent_file, "-L", bed_file]
    broad_runner.run_gatk(params)
    return True
def test_1_parallel_vcf_combine(self):
    """Parallel combination of VCF files, split by chromosome.
    """
    # Stay backwards compatible with 0.7.6 -- remove after the 0.7.7 release
    if prun is None:
        return
    files = [os.path.join(self.var_dir, "S1-variants.vcf"),
             os.path.join(self.var_dir, "S2-variants.vcf")]
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    region_dir = os.path.join(self.var_dir, "S1_S2-combined-regions")
    if os.path.exists(region_dir):
        shutil.rmtree(region_dir)
    if os.path.exists(self.combo_file):
        os.remove(self.combo_file)
    with prun.start({"type": "local", "cores": 1}, [[config]], config) as run_parallel:
        vcfutils.parallel_combine_variants(files, self.combo_file, ref_file,
                                           config, run_parallel)
    for fname in files:
        if os.path.exists(fname + ".gz"):
            subprocess.check_call(["gunzip", fname + ".gz"])
        if os.path.exists(fname + ".gz.tbi"):
            os.remove(fname + ".gz.tbi")
_base_template = r"""
\documentclass{article}
\usepackage{fullpage}
\usepackage{graphicx}
\usepackage{placeins}

\begin{document}
% for part in parts:
${part}
% endfor
\end{document}
"""

if __name__ == "__main__":
    # Handle arguments
    parser = argparse.ArgumentParser(
        description="Generate the summary pdf (as in earlier versions of the pipeline).")
    parser.add_argument("bam_file", type=str,
                        help="The analysis ready bam file.")
    parser.add_argument("fasta_ref", type=str,
                        help="The reference fasta file.")
    parser.add_argument("bait_file", type=str,
                        help="The bed file detailing the bait/target region.")
    parser.add_argument("sample_name", type=str,
                        help="The name of the sample to appear in the summary.")
    parser.add_argument("config_file", type=str,
                        help="The system configuration file used in the bcbio pipeline.")
    parser.add_argument("output_dir", type=str,
                        help="The directory where the output will be written.")
    args = parser.parse_args()
    config = config_utils.load_config(args.config_file)
    variant_align_summary(args.bam_file, args.fasta_ref, args.bait_file,
                          args.sample_name, config, args.output_dir)
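# Rendering sketch for _base_template above (an assumption -- the "% for"
# control lines and ${part} substitutions imply the mako templating package).
from mako.template import Template

latex_doc = Template(_base_template).render(parts=[r"\section*{Sample summary}"])
print latex_doc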
def test_programs(self, global_config):
    """Identify programs and versions used in analysis.
    """
    from bcbio.provenance import programs
    print(programs._get_versions(load_config(global_config)))
def test_programs(self):
    """Identify programs and versions used in analysis.
    """
    with make_workdir() as workdir:
        config = load_config(get_post_process_yaml(self.data_dir, workdir))
        print programs._get_versions(config)