def search_for_new(config, amqp_config, process_msg, store_msg, qseq, fastq):
    """Search for any new directories that have not been reported.
    """
    reported = _read_reported(config["msg_db"])
    for dname in _get_directories(config):
        if os.path.isdir(dname) and dname not in reported:
            if _is_finished_dumping(dname):
                log.info("The instrument has finished dumping on directory %s"
                         % dname)
                _update_reported(config["msg_db"], dname)
                ss_file = samplesheet.run_has_samplesheet(dname, config)
                if ss_file:
                    out_file = os.path.join(dname, "run_info.yaml")
                    log.info("CSV Samplesheet %s found, converting to %s"
                             % (ss_file, out_file))
                    samplesheet.csv2yaml(ss_file, out_file)
                if qseq:
                    log.info("Generating qseq files for %s" % dname)
                    _generate_qseq(get_qseq_dir(dname), config)
                if fastq:
                    log.info("Generating fastq files for %s" % dname)
                    _generate_fastq(dname, config)
                store_files, process_files = _files_to_copy(dname)
                if process_msg:
                    finished_message(config["msg_process_tag"], dname,
                                     process_files, amqp_config)
                if store_msg:
                    finished_message(config["msg_store_tag"], dname,
                                     store_files, amqp_config)

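# A sketch of how search_for_new might be driven from a polling loop.
# Everything here beyond the search_for_new signature is an assumption for
# illustration: the "post_process.yaml" file name, the AMQP settings dict,
# and the polling interval are all hypothetical. load_config is the
# configuration loader used by main() further below.
import time

def poll_for_runs():
    config = load_config("post_process.yaml")          # hypothetical config file
    amqp_config = {"host": "localhost", "port": 5672}  # hypothetical AMQP settings
    while True:
        search_for_new(config, amqp_config, process_msg=True, store_msg=True,
                       qseq=True, fastq=False)
        time.sleep(600)  # re-scan every 10 minutes
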
def _process_samplesheets(dname, config):
    """Process Illumina samplesheets into YAML files for post-processing.
    """
    ss_file = samplesheet.run_has_samplesheet(dname, config)
    if ss_file:
        out_file = os.path.join(dname, "run_info.yaml")
        log.info("CSV Samplesheet %s found, converting to %s"
                 % (ss_file, out_file))
        samplesheet.csv2yaml(ss_file, out_file)

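# A sketch of calling this helper directly for a single flowcell directory.
# The directory path and config file name are hypothetical; config is the
# same post-processing configuration dict used throughout these functions.
config = load_config("post_process.yaml")          # hypothetical file name
_process_samplesheets("/archive/110106_FC70BUKAAXX", config)
# If a samplesheet was found, run_info.yaml now sits in the flowcell directory.
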
def toyaml(self, ssheet):
    self.out_file = samplesheet.csv2yaml(ssheet)
    assert os.path.exists(self.out_file)
    # safe_load avoids PyYAML's unsafe default loader; the with block
    # closes the handle, so no explicit close() is needed.
    with open(self.out_file) as in_handle:
        info = yaml.safe_load(in_handle)
    return info

def test_toyaml(self):
    """Convert CSV Illumina SampleSheet to YAML.
    """
    out_file = samplesheet.csv2yaml(self.ss_file)
    assert os.path.exists(out_file)
    with open(out_file) as in_handle:
        info = yaml.safe_load(in_handle)
    assert info[0]['lane'] == '1'
    assert info[0]['multiplex'][0]['barcode_id'] == 5
    os.remove(out_file)

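# The assertions above imply that run_info.yaml parses to a list of per-lane
# entries of roughly this shape (reconstructed from the checks only; the
# full schema is not shown in the test):
#
#   - lane: '1'
#     multiplex:
#       - barcode_id: 5
#         ...
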
#!/usr/bin/env python
"""Convert Illumina SampleSheet CSV files to the run_info.yaml input file.

This allows running the analysis pipeline without Galaxy, using CSV input
files from Illumina SampleSheet or Genesifter.

Usage:
    convert_samplesheet_config.py <input csv>
"""
import sys

from bcbio.solexa import samplesheet

if __name__ == "__main__":
    samplesheet.csv2yaml(sys.argv[1])

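# Usage sketch (hypothetical file name):
#
#   python convert_samplesheet_config.py SampleSheet.csv
#
# With this single-argument form, csv2yaml chooses the output location
# itself and returns the written path (the toyaml/test_toyaml examples
# above capture that return value).
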
def main(run_id, config_file, run_info_file=None, dryrun=False):
    assert run_id, \
        "No run id was specified"
    assert os.path.exists(config_file), \
        "The configuration file, {}, could not be found".format(config_file)
    config = load_config(config_file)
    assert "gdocs_upload" in config, \
        "The configuration file, {}, has no section specifying the Google docs details".format(config_file)

    analysis_cfg = config.get("analysis", {})
    if "store_dir" in analysis_cfg:
        archive_dir = os.path.join(analysis_cfg["store_dir"], run_id)
    else:
        archive_dir = os.getcwd()

    analysis_dir = None
    if "base_dir" in analysis_cfg:
        analysis_dir = os.path.join(analysis_cfg["base_dir"], run_id)
    if analysis_dir is None or not os.path.exists(analysis_dir):
        analysis_dir = tempfile.mkdtemp()

    dirs = {"work": os.path.normpath(analysis_dir),
            "flowcell": os.path.normpath(archive_dir)}
    assert os.path.exists(dirs["flowcell"]), \
        "The flowcell directory, {}, could not be found".format(dirs["flowcell"])
    assert os.path.exists(dirs["work"]), \
        "The work directory, {}, could not be found".format(dirs["work"])

    if run_info_file is None:
        run_info_file = os.path.join(dirs["flowcell"], "run_info.yaml")
    if not os.path.exists(run_info_file):
        # Locate the samplesheet and convert to yaml
        samplesheet = _find_samplesheet(dirs["flowcell"])
        assert samplesheet, \
            "Could not locate samplesheet in {}, aborting..".format(dirs["flowcell"])
        fh, run_info_file = tempfile.mkstemp()
        os.close(fh)
        run_info_file = ssheet.csv2yaml(samplesheet, run_info_file)
    assert os.path.exists(run_info_file), \
        "The run info configuration file, {}, could not be found".format(run_info_file)

    fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])

    # If we have no bc_metrics files in the workdir, we may be looking at a
    # Casava run. In that case, attempt to parse the Demultiplex_Stats.htm
    # file and create bc_metrics files.
    metric_files = (glob.glob(os.path.join(dirs["work"], "*_barcode", "*bc[_.]metrics")) +
                    glob.glob(os.path.join(dirs["work"], "*bc[_.]metrics")))
    if len(metric_files) == 0:
        casava_report = _find_casava_report(dirs["flowcell"])
        assert len(casava_report) > 0, \
            "Could not locate CASAVA demultiplex report in {}, aborting..".format(dirs["flowcell"])
        metric_files = _casava_report_to_metrics(run_info_file, casava_report, dirs)
    assert len(metric_files) > 0, \
        "Could not locate or create required metric files, aborting.."

    print("A report will be created on Google Docs based on the "
          "demultiplexed data in {}".format(dirs["work"]))
    print("The configuration file is {0} and the run info file is {1}".format(
        config_file, run_info_file))
    print("The run was started on {0} and has flowcell id {1}".format(
        fc_date, fc_name))

    if not dryrun:
        create_report_on_gdocs(fc_date, fc_name, run_info_file, dirs, config)
    else:
        print("DRY-RUN: nothing uploaded")

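# A minimal configuration sketch satisfying the asserts in main() above.
# The section names ("gdocs_upload", "analysis", "store_dir", "base_dir")
# come from the code; the values and the contents of gdocs_upload are
# hypothetical (main() only checks that the section exists).
#
#   gdocs_upload:
#     gdocs_credentials: ...      # assumed key, for illustration only
#   analysis:
#     store_dir: /archive         # flowcell dir becomes /archive/<run_id>
#     base_dir: /analysis         # work dir becomes /analysis/<run_id>
#
# main("110106_FC70BUKAAXX", "post_process.yaml", dryrun=True) would then
# print the report details without uploading anything to Google Docs.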