def run_analysis(work_dir, post_process, fc_dir, run_info):
    """Changes into the supplied work_dir directory and submits the job
    using the supplied arguments and with slurm parameters obtained from
    the post_process.yaml configuration
    """
    # Move to the working directory
    start_dir = os.getcwd()
    os.chdir(work_dir)
    config = load_config(post_process)
    if str(config["algorithm"]["num_cores"]) == "messaging":
        analysis_script = DISTRIBUTED_ANALYSIS_SCRIPT
    else:
        analysis_script = PARALLELL_ANALYSIS_SCRIPT
    # Launches the pipeline using PM module
    project_to_run, sample_to_run, flowcell_to_run = fc_dir.split('/')[-3:]
    cmd = ["pm", "production", "run", project_to_run,
           "--sample", sample_to_run,
           "--flowcell", flowcell_to_run,
           "--drmaa", "--force"]
    subprocess.check_call(cmd)
    # Change back to the starting directory
    os.chdir(start_dir)

def main(project_id, sample_names, single_end, config_file, Map_Stat, Read_Dist,
         FPKM, rRNA_table, GBC, stranded, strandness_table, complexity):
    if not sample_names:
        sample_names = commands.getoutput(
            "ls -d tophat_out_*|sed 's/tophat_out_//g'").split('\n')
    else:
        sample_names = sample_names.split(',')
    TEMPLATE = make_template(Map_Stat, FPKM, GBC, Read_Dist, rRNA_table,
                             strandness_table, complexity)
    if config_file:
        config = load_config(config_file)
    else:
        config = {}
    projectfile = "%s.mako" % (project_id)
    fp = open(projectfile, "w")
    fp.write(TEMPLATE)
    fp.close()
    tmpl = Template(filename=projectfile)
    proj_conf = {'id': project_id, 'config': config, 'samples': sample_names}
    d = generate_report(proj_conf, single_end, stranded)
    rstfile = "%s_analysis_report.rst" % (project_id)
    fp = open(rstfile, "w")
    fp.write(tmpl.render(**d))
    fp.close()
    os.system('rst2pdf ' + rstfile)
    print """

def main(config_file, fc_dir, analysis_dir, run_info_yaml=None):
    config = load_config(config_file)
    galaxy_api = (GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
                  if config.has_key("galaxy_api_key") else None)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    base_folder_name = "%s_%s" % (fc_date, fc_name)
    run_details = lims_run_details(run_info, base_folder_name)
    for (library_name, access_role, dbkey, lane, bc_id, name, desc,
         local_name, fname_out) in run_details:
        library_id = (get_galaxy_library(library_name, galaxy_api)
                      if library_name else None)
        upload_files = list(select_upload_files(local_name, bc_id, fc_dir,
                                                analysis_dir, config, fname_out))
        if len(upload_files) > 0:
            print lane, bc_id, name, desc, library_name
            print "Creating storage directory"
            if library_id:
                folder, cur_galaxy_files = get_galaxy_folder(
                    library_id, base_folder_name, name, desc, galaxy_api)
            else:
                cur_galaxy_files = []
            store_dir = move_to_storage(lane, bc_id, base_folder_name,
                                        upload_files, cur_galaxy_files,
                                        config, config_file, fname_out)
            if store_dir and library_id:
                print "Uploading directory of files to Galaxy"
                print galaxy_api.upload_directory(library_id, folder['id'],
                                                  store_dir, dbkey, access_role)
    if galaxy_api and not run_info_yaml:
        add_run_summary_metrics(analysis_dir, galaxy_api)

def run_analysis(work_dir, post_process, fc_dir, run_info):
    """Changes into the supplied work_dir directory and submits the job
    using the supplied arguments and with slurm parameters obtained from
    the post_process.yaml configuration
    """
    # Move to the working directory
    start_dir = os.getcwd()
    os.chdir(work_dir)
    config = load_config(post_process)
    if str(config["algorithm"]["num_cores"]) == "messaging":
        analysis_script = DISTRIBUTED_ANALYSIS_SCRIPT
    else:
        analysis_script = PARALLELL_ANALYSIS_SCRIPT
    job_cl = [analysis_script, post_process, fc_dir, run_info]
    cp = config["distributed"]["cluster_platform"]
    cluster = __import__("bcbio.distributed.{0}".format(cp), fromlist=[cp])
    platform_args = config["distributed"]["platform_args"].split()
    print "Submitting job"
    jobid = cluster.submit_job(platform_args, job_cl)
    print 'Your job has been submitted with id ' + jobid
    # Change back to the starting directory
    os.chdir(start_dir)

def main(config_file, month, year):
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"])
    smonth, syear = (month - 1, year) if month > 1 else (12, year - 1)
    start_date = datetime(syear, smonth, 15, 0, 0, 0)
    # last day calculation useful if definition of month is
    # from first to last day instead of 15th-15th
    #(_, last_day) = calendar.monthrange(year, month)
    end_date = datetime(year, month, 14, 23, 59, 59)
    out_file = "%s_%s" % (start_date.strftime("%b"),
                          end_date.strftime("%b-%Y-sequencing.csv"))
    with open(out_file, "w") as out_handle:
        writer = csv.writer(out_handle)
        writer.writerow(["Date", "Product", "Payment", "Researcher", "Lab",
                         "Email", "Project", "Sample", "Description", "Genome",
                         "Flowcell", "Lane", "Notes"])
        for s in galaxy_api.sqn_report(start_date.isoformat(),
                                       end_date.isoformat()):
            f_parts = s["sqn_run"]["run_folder"].split("_")
            flowcell = "_".join([f_parts[0], f_parts[-1]])
            writer.writerow([s["sqn_run"]["date"],
                             s["sqn_type"],
                             s["project"]["payment_(fund_number)"],
                             s["project"]["researcher"],
                             s["project"]["lab_association"],
                             s["project"]["email"],
                             s["project"]["project_name"],
                             s["name"],
                             s["description"],
                             s["genome_build"],
                             flowcell,
                             s["sqn_run"]["lane"],
                             s["sqn_run"]["results_notes"]])

def analyze_and_upload(remote_info, config_file):
    """Main entry point for analysis and upload to Galaxy.
    """
    config = load_config(config_file)
    fc_dir = _copy_from_sequencer(remote_info, config)
    analysis_dir = _run_analysis(fc_dir, remote_info, config, config_file)
    _upload_to_galaxy(fc_dir, analysis_dir, remote_info, config, config_file)

def backup_data(remote_info, config_file):
    """Main entry point for fetching data from sequencer or pre-processing machine.
    """
    config = load_config(config_file)
    logger.info("Backing up run data over to remote storage: %s"
                % config["store_host"])
    _copy_from_sequencer(remote_info, config)

def main(config_file, fc_dir, project_dir, run_info_yaml=None, fc_alias=None,
         project_desc=None, lanes=None):
    if project_desc is None and lanes is None:
        log.error("No project description or lanes provided: cannot deliver files without this information")
        sys.exit()
    config = load_config(config_file)
    ## Set log file in project output directory
    config.update(log_dir=os.path.join(project_dir, "log"))
    log_handler = create_log_handler(config, log.name)
    fc_dir = os.path.normpath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    with log_handler.applicationbound():
        run_info = prune_run_info_by_description(run_info['details'],
                                                 project_desc, lanes)
        if len(run_info) == 0:
            log.error("No lanes found with matching description %s: please check your flowcell run information" % project_desc)
            sys.exit()
    dirs = dict(fc_dir=fc_dir, project_dir=project_dir)
    fc_name, fc_date = get_flowcell_id(run_info, dirs['fc_dir'])
    config.update(fc_name=fc_name, fc_date=fc_date)
    config.update(fc_alias="%s_%s" % (fc_date, fc_name) if not fc_alias else fc_alias)
    dirs.update(fc_delivery_dir=os.path.join(dirs['project_dir'],
                                             options.data_prefix,
                                             config['fc_alias']))
    dirs.update(data_delivery_dir=os.path.join(dirs['project_dir'],
                                               options.data_prefix,
                                               "%s_%s" % (fc_date, fc_name)))
    with log_handler.applicationbound():
        config = _make_delivery_directory(dirs, config)
        _save_run_info(run_info, dirs['fc_delivery_dir'],
                       run_exit=options.only_run_info)
        run_main(run_info, config, dirs)

def main(config_file, fc_dir=None, run_info_yaml=None, numcores=None,
         paralleltype=None, profile="default"):
    work_dir = os.getcwd()
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    paralleltype, numcores = _get_cores_and_type(config, fc_dir, run_info_yaml,
                                                 numcores, paralleltype)
    parallel = {"type": paralleltype, "cores": numcores,
                "profile": profile, "module": "bcbio.distributed"}
    if parallel["type"] in ["local", "messaging-main"]:
        if numcores is None:
            config["algorithm"]["num_cores"] = numcores
        run_main(config, config_file, work_dir, parallel, fc_dir, run_info_yaml)
    elif parallel["type"] == "messaging":
        parallel["task_module"] = "bcbio.distributed.tasks"
        args = [config_file, fc_dir]
        if run_info_yaml:
            args.append(run_info_yaml)
        messaging.run_and_monitor(config, config_file, args, parallel)
    elif parallel["type"] == "ipython":
        run_main(config, config_file, work_dir, parallel, fc_dir, run_info_yaml)
    else:
        raise ValueError("Unexpected type of parallel run: %s" % parallel["type"])

def setUp(self):
    self.file_dir = os.path.join(os.path.dirname(__file__))
    self.fc_dir = os.path.join(self.file_dir, "110106_FC70BUKAAXX")
    self.proj_dir = os.path.join(self.file_dir, "projects", "j_doe_00_01")
    ##self.fcdir = os.path.join(os.path.dirname(__file__), "test_automated_output")
    self.run_info = os.path.join(self.fc_dir, "run_info.yaml")
    self.archive_base_dir = os.path.join(self.file_dir)
    self.analysis_base_dir = os.path.join(self.file_dir)

    # Remove fcdir if it exists and set up a new link
    init_flowcell_dir()
    if not os.path.exists(self.proj_dir):
        os.makedirs(self.proj_dir)
    if not os.path.exists(os.path.join(self.file_dir, "test_automated_output",
                                       "run_info.yaml")):
        os.symlink(os.path.join(self.file_dir, "data", "automated",
                                "run_info-project.yaml"),
                   os.path.join(self.file_dir, "test_automated_output",
                                "run_info.yaml"))
    if not os.path.exists(os.path.join(self.file_dir, "test_automated_output",
                                       "tool-data")):
        os.symlink(os.path.join(self.file_dir, "data", "automated", "tool-data"),
                   os.path.join(self.file_dir, "test_automated_output",
                                "tool-data"))

    # Post_process.yaml
    post_process = load_config(os.path.join(self.file_dir, "data", "automated",
                                            "post_process.yaml"))
    post_process["analysis"]["store_dir"] = os.path.join(self.archive_base_dir)
    post_process["analysis"]["base_dir"] = os.path.join(self.analysis_base_dir)
    post_process["algorithm"]["snpcall"] = "true"
    post_process["algorithm"]["dbsnp"] = os.path.join("data", "genomes", "hg19",
                                                      "variation", "dbsnp_132.vcf")
    with open(os.path.join(self.fc_dir, "post_process.yaml"), "w") as fh:
        yaml.dump(post_process, stream=fh)

def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    setup_logging(config)
    run_main(config, config_file, fc_dir, work_dir, run_info_yaml)

def main(args, mail, conffile, analysis, stranded, single, genome):
    project = args[0]
    ord_num = args[1]
    runs = args[2:]
    conf = cl.load_config(conffile)
    port = conf['statusdb']['port']
    username = conf['statusdb']['username']
    password = conf['statusdb']['password']
    URL = username + ':' + password + '@' + conf['statusdb']['url']
    extra_arg = "#SBATCH " + conf['sbatch']['extra_arg']
    couch = couchdb.Server("http://" + URL + ':' + str(port))
    proj_db = couch['projects']
    key = find_proj_from_view(proj_db, project)
    info = proj_db[key]
    reference_genome = genome if genome else info['reference_genome']
    RNA_analysis_settings = conf['custom_algorithms']['RNA-seq analysis']
    refpath = RNA_analysis_settings[reference_genome]['genomepath']
    gtfpath = RNA_analysis_settings[reference_genome]['gtfpath']
    bedpath = RNA_analysis_settings[reference_genome]['bedpath']
    today = str(datetime.today().isoformat()).replace('-', '_').split('.')[0].replace(':', '_')
    command = [os.environ['HOME'] + '/opt/scilifelab/scripts/RNA_analysis/RNA_analysis.sh',
               '-p', project, '-o', ord_num, '-b', bedpath, '-g', gtfpath,
               '-m', mail, '-c', conffile, '-e', '"' + extra_arg + '"',
               '-a', str(analysis), '-s', str(stranded), '-d', today,
               '-f', str(single), '-G', reference_genome] + runs
    command = ' '.join(command)
    print command
    os.system(command)

def main(bam_file, config_file=None, chrom='all', start=0, end=None,
         outfile=None, normalize=False, use_tempfile=False):
    if config_file:
        config = load_config(config_file)
    else:
        config = {"program": {"ucsc_bigwig": "wigToBigWig"}}
    if outfile is None:
        outfile = "%s.bigwig" % os.path.splitext(bam_file)[0]
    if start > 0:
        start = int(start) - 1
    if end is not None:
        end = int(end)
    regions = [(chrom, start, end)]
    if os.path.abspath(bam_file) == os.path.abspath(outfile):
        sys.stderr.write("Bad arguments, input and output files are the same.\n")
        sys.exit(1)
    if not (os.path.exists(outfile) and os.path.getsize(outfile) > 0):
        if use_tempfile:
            # Use a temp file to avoid any possibility of not having write permission
            out_handle = tempfile.NamedTemporaryFile(delete=False)
            wig_file = out_handle.name
        else:
            wig_file = "%s.wig" % os.path.splitext(outfile)[0]
            out_handle = open(wig_file, "w")
        with closing(out_handle):
            chr_sizes, wig_valid = write_bam_track(bam_file, regions, config,
                                                   out_handle, normalize)
        try:
            if wig_valid:
                convert_to_bigwig(wig_file, chr_sizes, config, outfile)
        finally:
            os.remove(wig_file)

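# Illustrative usage of the BAM-to-BigWig entry point above (not part of the
# original script): the paths, chromosome name and coordinates are placeholders,
# and the UCSC wigToBigWig binary is assumed to be on the PATH via the default
# config when no config_file is given.
if __name__ == "__main__":
    main("/data/sample1.bam", chrom="chr1", start=1, end=2000000,
         outfile="/data/sample1.bigwig", normalize=True)
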
def main(local_config, post_config_file=None, process_msg=True, store_msg=True,
         qseq=True, fastq=True):
    config = load_config(local_config)
    log_handler = create_log_handler(config)
    with log_handler.applicationbound():
        search_for_new(config, local_config, post_config_file,
                       process_msg, store_msg, qseq, fastq)

def main(galaxy_config, processing_config):
    amqp_config = read_galaxy_amqp_config(galaxy_config)
    config = load_config(processing_config)
    store_tag = config["msg_store_tag"]
    log_handler = create_log_handler(config, LOG_NAME)
    handlers = [(store_tag, store_handler(config, store_tag))]
    with log_handler.applicationbound():
        message_reader(handlers, amqp_config)

def test_1_notify(self):
    config = load_config(self.config_file)
    if "email" not in config:
        print "No email configured, skipping test!"
        return
    log_handler = self._get_log_handler(config)
    result = self._log_messages(log_handler,
                                "Pipeline notification test email @ %s"
                                % time.strftime("%x - %X"))
    assert result is None, "%s" % result

def main(local_config, post_config_file=None, fetch_msg=True, process_msg=True,
         store_msg=True, backup_msg=False, qseq=True, fastq=True,
         remove_qseq=False, compress_fastq=False, casava=False):
    config = load_config(local_config)
    log_handler = create_log_handler(config, True)
    with log_handler.applicationbound():
        search_for_new(config, local_config, post_config_file, fetch_msg,
                       process_msg, store_msg, backup_msg, qseq, fastq,
                       remove_qseq, compress_fastq, casava)

def analyze(remote_info, config_file):
    """Starts analysis of data that has been pushed to the analysis location.
    """
    config = load_config(config_file)
    fc_dir = os.path.join(remote_info["store_dir"],
                          os.path.basename(remote_info["directory"]))
    analysis_dir = _run_analysis(fc_dir, remote_info, config, config_file)
    _upload_to_galaxy(fc_dir, analysis_dir, remote_info, config, config_file)

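# A minimal sketch (an assumption, not taken from the source) of the remote_info
# payload that analyze() reads: only "store_dir" and "directory" are accessed
# directly here; the real AMQP message from the sequencer carries more fields.
example_remote_info = {
    "store_dir": "/store/runs",                      # hypothetical storage root
    "directory": "/sequencer/110106_FC70BUKAAXX",    # hypothetical run folder
}
# analyze(example_remote_info, "post_process.yaml")  # hypothetical config path
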
def main(config_file, queues=None):
    task_module = "bcbio.distributed.tasks"
    config = load_config(config_file)
    with utils.curdir_tmpdir() as work_dir:
        dirs = {"work": work_dir, "config": os.path.dirname(config_file)}
        with create_celeryconfig(task_module, dirs, config,
                                 os.path.abspath(config_file)):
            run_celeryd(work_dir, queues)

def main(local_config, post_config_file=None, process_msg=True, store_msg=True,
         qseq=True, fastq=True):
    config = load_config(local_config)
    log_handler = create_log_handler(config, LOG_NAME)
    with log_handler.applicationbound():
        search_for_new(config, local_config, post_config_file,
                       process_msg, store_msg, qseq, fastq)

def long_term_storage(remote_info, config_file):
    config = load_config(config_file)
    log_handler = create_log_handler(config, log.name)
    with log_handler.applicationbound():
        log.info("Copying run data over to remote storage: %s"
                 % config["store_host"])
        log.debug("The contents from AMQP for this dataset are:\n %s"
                  % remote_info)
        _copy_for_storage(remote_info, config)

def load_couch_server(config_file):
    """loads couch server with settings specified in 'config_file'"""
    try:
        db_conf = cl.load_config(config_file)['statusdb']
        url = db_conf['username'] + ':' + db_conf['password'] + '@' + \
              db_conf['url'] + ':' + str(db_conf['port'])
        couch = couchdb.Server("http://" + url)
        return couch
    except:
        return None

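# Illustrative usage of load_couch_server (config path and project name are
# placeholders), mirroring how the RNA-seq scripts above look up a project
# document in the 'projects' database via find_proj_from_view.
couch = load_couch_server("post_process.yaml")
if couch is not None:
    proj_db = couch["projects"]
    key = find_proj_from_view(proj_db, "J.Doe_00_01")
    project_doc = proj_db[key]
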
def main(*args, **kwargs):
    local_config = args[0]
    post_process_config = args[1] if len(args) > 1 else None
    kwargs["post_process_config"] = post_process_config
    config = load_config(local_config)
    log_handler = create_log_handler(config, True)
    with log_handler.threadbound():
        search_for_new(config, local_config, **kwargs)

def main(config_file, fc_dir, run_info_yaml=None, num_cores=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    if num_cores:
        config["algorithm"]["num_cores"] = int(num_cores)
    setup_logging(config)
    run_main(config, config_file, fc_dir, work_dir, run_info_yaml)

def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    if config.get("qcdb", None) is None:
        sys.exit()
    else:
        qcdb_config = config.get("qcdb", {})
    analysis = config.get("analysis", {})
    setup_logging(config)
    qcdb_store_dir = qcdb_config.get("qcdb_store_dir", None)
    run_main(fc_dir, qcdb_store_dir)

def setUp(self):
    self.data_dir = os.path.join(os.path.dirname(__file__), "data", "automated")
    config_file = os.path.join(self.data_dir, "post_process-statusdb.yaml")
    config = load_config(config_file)
    setup_logging(config)
    fc_date = "110106"
    fc_name = "FC70BUKAAXX"
    run_info_yaml = os.path.join(self.data_dir, "run_info.yaml")
    workdir = os.path.join(os.path.dirname(__file__), "110106_FC70BUKAAXX")
    fc_dir = os.path.join(self.data_dir, os.pardir, "110106_FC70BUKAAXX")

def analyze_and_upload(remote_info, config_file):
    """Main entry point for analysis and upload to Galaxy.
    """
    config = load_config(config_file)
    log_handler = create_log_handler(config, log.name)
    with log_handler.applicationbound():
        fc_dir = _copy_from_sequencer(remote_info, config)
        analysis_dir = _run_analysis(fc_dir, remote_info, config, config_file)
        _upload_to_galaxy(fc_dir, analysis_dir, remote_info, config, config_file)

def main(args, phred64, fai, projtag, mail, hours, conffile, fpath, single, stranded):
    proj_ID = args[0]
    flow_cell = args[1]
    if phred64 == True:
        qscale = "--solexa1.3-quals"
    else:
        qscale = ""
    if not len(hours.split(":")) == 3:
        sys.exit("Please specify the time allocation string as hours:minutes:seconds or days-hours:minutes:seconds")
    conf = cl.load_config(conffile)
    port = conf["statusdb"]["port"]
    username = conf["statusdb"]["username"]
    password = conf["statusdb"]["password"]
    URL = username + ":" + password + "@" + conf["statusdb"]["url"]
    extra_arg = "#SBATCH " + conf["sbatch"]["extra_arg"]
    couch = couchdb.Server("http://" + URL + ":" + str(port))
    proj_db = couch["projects"]
    key = find_proj_from_view(proj_db, proj_ID)
    try:
        info = proj_db[key]
    except:
        sys.exit("project " + proj_ID + " not found in statusdb")
    reference_genome = info["reference_genome"]
    RNA_analysis_settings = conf["custom_algorithms"]["RNA-seq analysis"]
    refpath = RNA_analysis_settings[reference_genome]["genomepath"]
    aligner_version = RNA_analysis_settings["aligner_version"]
    if stranded is True:
        aligner_libtype = RNA_analysis_settings["aligner_libtype"]
    else:
        aligner_libtype = ""
    p = os.getcwd()
    if not fpath:
        fpath = p.split("intermediate")[0] + "data/" + flow_cell
    file_info = get_names_from_fastqfiles(fpath, flow_cell)
    for lane in file_info:
        an_path = prepare_lane_run_dir(p, lane)
        for samp in sorted(file_info[lane]):
            innerdist, innnerdistflagg, R1, R2 = frag_len_from_couch(
                fpath, file_info[lane][samp], single, samp, info)
            Generat_sbatch_file(an_path, hours, samp, mail, aligner_version,
                                innerdist, refpath, innnerdistflagg, R1, R2,
                                extra_arg, aligner_libtype, fai, qscale)

def main(config_file, in_file, space, start, end):
    config = load_config(config_file)
    runner = broad.runner_from_config(config)
    target_region = (space, int(start), int(end))
    for pair in [1, 2]:
        out_file = "%s_%s-%s.fastq" % (os.path.splitext(os.path.basename(in_file))[0],
                                       pair, target_region[0])
        with open(out_file, "w") as out_handle:
            for name, seq, qual in bam_to_fastq_pair(in_file, target_region, pair):
                out_handle.write("@%s/%s\n%s\n+\n%s\n" % (name, pair, seq, qual))
        sort_fastq(out_file, runner)

def test_2_report_notification(self):
    config = load_config(self.config_file)
    if "gdocs_upload" not in config or \
       "gdocs_email_notification" not in config["gdocs_upload"]:
        print "Google docs email reporting not configured, skipping test!"
        return
    config["email"] = config["gdocs_upload"]["gdocs_email_notification"]
    log_handler = self._get_log_handler(config)
    result = self._log_messages(log_handler,
                                "Google Docs report notification test email @ %s"
                                % time.strftime("%x - %X"))
    assert result is None, "%s" % result

def long_term_storage(remote_info, config_file):
    """Securely copy files from a remote directory to the storage server.

    This requires ssh public keys to be set up so that no password entry
    is necessary. Fabric is used to manage setting up copies on the remote
    storage server.
    """
    config = load_config(config_file)
    logger.info("Copying run data over to remote storage: %s"
                % config["store_host"])
    logger.debug("The contents from AMQP for this dataset are:\n %s"
                 % remote_info)
    _copy_for_storage(remote_info, config)

def main(args, phred64, fai, projtag, mail, hours, conffile, fpath, single,
         stranded, genome, inner, adapter):
    proj_ID = args[0]
    flow_cell = args[1]
    if phred64 == True:
        qscale = '--solexa1.3-quals'
    else:
        qscale = ''
    if not len(hours.split(':')) == 3:
        sys.exit("Please specify the time allocation string as hours:minutes:seconds or days-hours:minutes:seconds")
    conf = cl.load_config(conffile)
    port = conf['statusdb']['port']
    username = conf['statusdb']['username']
    password = conf['statusdb']['password']
    URL = username + ':' + password + '@' + conf['statusdb']['url']
    extra_arg = "#SBATCH " + conf['sbatch']['extra_arg']
    couch = couchdb.Server("http://" + URL + ':' + str(port))
    proj_db = couch['projects']
    key = find_proj_from_view(proj_db, proj_ID)
    try:
        info = proj_db[key]
    except:
        sys.exit("project " + proj_ID + " not found in statusdb")
    reference_genome = genome if genome else info['reference_genome']
    RNA_analysis_settings = conf['custom_algorithms']['RNA-seq analysis']
    refpath = RNA_analysis_settings[reference_genome]['genomepath']
    aligner_version = RNA_analysis_settings['aligner_version']
    if stranded is True:
        aligner_libtype = RNA_analysis_settings['aligner_libtype']
    else:
        aligner_libtype = ''
    p = os.getcwd()
    if not fpath:
        fpath = p.split('intermediate')[0] + 'data/' + flow_cell
    file_info = get_names_from_fastqfiles(fpath, flow_cell)
    for lane in file_info:
        an_path = prepare_lane_run_dir(p, lane)
        for samp in sorted(file_info[lane]):
            try:
                innerdist, innnerdistflagg, R1, R2 = frag_len_from_couch(
                    fpath, file_info[lane][samp], single, samp, info,
                    inner, adapter)
                Generat_sbatch_file(an_path, hours, samp, mail, aligner_version,
                                    innerdist, refpath, innnerdistflagg, R1, R2,
                                    extra_arg, aligner_libtype, fai, qscale)
            except:
                print "{}\n[Error Occurred] No sbatch script generated!".format("-" * 30)

def test_create_bc_report(self):
    """Create a demultiplex report and upload it to gdocs
    """
    # Parse the config
    config_file = os.path.join(self.data_dir, "post_process.yaml")
    self.config = load_config(config_file)
    # Loop over the runs
    for name in self.runname:
        print "\nProcessing %s" % name
        fc_name, fc_date = get_flowcell_info(name)
        analysisdir = os.path.join(self.workdir, name)
        assert create_report_on_gdocs(fc_date, fc_name, self.run_info_file,
                                      {"work": analysisdir,
                                       "flowcell": analysisdir},
                                      self.config), "Report creation failed"

def test_create_bc_report(self):
    """Create a demultiplex report and upload it to gdocs
    """
    # Parse the config
    config_file = os.path.join(self.data_dir, "post_process.yaml")
    self.config = load_config(config_file)
    # Loop over the runs
    for name in self.runname:
        print "\nProcessing %s" % name
        fc_name, fc_date = get_flowcell_info(name)
        analysisdir = os.path.join(self.workdir, name)
        create_bc_report_on_gdocs(fc_date, fc_name, analysisdir,
                                  {'details': self.run_info}, self.config)

def analyze_locally(dname, post_config_file, fastq_dir):
    """Run analysis directly on the local machine.
    """
    assert fastq_dir is not None
    post_config = load_config(post_config_file)
    analysis_dir = os.path.join(fastq_dir, os.pardir, "analysis")
    utils.safe_makedir(analysis_dir)
    with utils.chdir(analysis_dir):
        prog = "bcbio_nextgen.py"
        cl = [prog, post_config_file, dname]
        run_yaml = os.path.join(dname, "run_info.yaml")
        if os.path.exists(run_yaml):
            cl.append(run_yaml)
        subprocess.check_call(cl)

def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")

    def insert_command(record):
        record.extra["command"] = sys.argv
        record.extra["version"] = version.get_pipeline_version()

    setup_logging(config)
    handler = create_log_handler(config)
    with handler, logbook.Processor(insert_command):
        run_main(config, config_file, fc_dir, work_dir, run_info_yaml)

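# A minimal, self-contained sketch (illustrative only) of the logbook.Processor
# pattern used in main() above: the processor callback attaches extra context,
# such as the invoking command line, to every record logged inside the block.
import sys
import logbook

def _inject_command(record):
    # Make the command line available to log templates as record.extra["command"]
    record.extra["command"] = sys.argv

example_log = logbook.Logger("example")
example_handler = logbook.StderrHandler()
with example_handler, logbook.Processor(_inject_command):
    example_log.info("pipeline started")
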
def analyze_locally(dname, post_config_file, fastq_dir):
    """Run analysis directly on the local machine.
    """
    assert fastq_dir is not None
    post_config = load_config(post_config_file)
    analysis_dir = os.path.join(fastq_dir, os.pardir, "analysis")
    utils.safe_makedir(analysis_dir)
    with utils.chdir(analysis_dir):
        if post_config["algorithm"]["num_cores"] == "messaging":
            prog = post_config["analysis"]["distributed_process_program"]
        else:
            prog = post_config["analysis"]["process_program"]
        cl = [prog, post_config_file, dname]
        run_yaml = os.path.join(dname, "run_info.yaml")
        if os.path.exists(run_yaml):
            cl.append(run_yaml)
        subprocess.check_call(cl)

def main(config_file, queues=None, task_module=None, base_dir=None):
    if base_dir is None:
        base_dir = os.getcwd()
    if task_module is None:
        task_module = "bcbio.distributed.tasks"
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(base_dir, "log")
    signals.setup_logging.connect(celery_logger(config))
    setup_logging(config)
    logger.info("Starting distributed worker process: {0}".format(
        queues if queues else ""))
    with utils.chdir(base_dir):
        with utils.curdir_tmpdir() as work_dir:
            dirs = {"work": work_dir, "config": os.path.dirname(config_file)}
            with create_celeryconfig(task_module, dirs, config,
                                     os.path.abspath(config_file)):
                run_celeryd(work_dir, queues)

def test_variable_expansion():
    """Test expanding the environment variables in the test yaml.
    """
    config = load_config("data/loading_test/variables.yaml")
    try:
        for variable, value in config.items():
            assert os.environ[variable] == value, \
                "The strings %s and %s don't match (variable %s)" % (
                    os.environ[variable], value, variable)
    # When the key isn't in os.environ
    except KeyError as e:
        for variable, value in config[e.args[0]].items():
            assert os.environ[variable] == value, \
                "The strings %s and %s don't match (variable %s)" % (
                    os.environ[variable], value, variable)

def main(config_file, fc_dir, run_info_yaml=None, num_workers=None):
    config = load_config(config_file)
    assert config["algorithm"]["num_cores"] == "messaging", \
        "Use this script only with configured 'messaging' parallelization"
    if num_workers is None:
        if config["distributed"].get("num_workers", "") == "all":
            cp = config["distributed"]["cluster_platform"]
            cluster = __import__("bcbio.distributed.{0}".format(cp), fromlist=[cp])
            num_workers = cluster.available_nodes(
                config["distributed"]["platform_args"]) - 1
        if num_workers is None:
            num_workers = _needed_workers(
                get_run_info(fc_dir, config, run_info_yaml)[-1])
    task_module = "bcbio.distributed.tasks"
    args = [config_file, fc_dir]
    if run_info_yaml:
        args.append(run_info_yaml)
    run_and_monitor(config, config_file, args, num_workers, task_module)

def _generate_metrics(bam_fname, config_file, ref_file, bait_file, target_file):
    """Run Picard commands to generate metrics files when missing.
    """
    config = load_config(config_file)
    broad_runner = broad.runner_from_config(config)
    bam_fname = os.path.abspath(bam_fname)
    path = os.path.dirname(bam_fname)
    out_dir = os.path.join(path, "metrics")
    utils.safe_makedir(out_dir)
    with utils.chdir(out_dir):
        with utils.curdir_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, _bam_is_paired(bam_fname),
                               bait_file, target_file)
    return out_dir

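# Hypothetical invocation of _generate_metrics (all file paths are placeholders):
# Picard metrics are written to a "metrics" directory next to the input BAM and
# that directory path is returned.
metrics_dir = _generate_metrics("/data/sample1-sort.bam", "post_process.yaml",
                                "/genomes/hg19/hg19.fa",
                                "baits.interval_list", "targets.interval_list")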