def _generate_fastq_with_casava(fc_dir, config, r1=False):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowcell using CASAVA (>1.8).
    """
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    unaligned_dir = os.path.join(fc_dir, "Unaligned")
    samplesheet_file = samplesheet.run_has_samplesheet(fc_dir, config)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)
    im_stats = config["algorithm"].get("ignore-missing-stats", False)
    im_bcl = config["algorithm"].get("ignore-missing-bcl", False)
    im_control = config["algorithm"].get("ignore-missing-control", False)

    # Write to log files
    configure_out = os.path.join(fc_dir, "configureBclToFastq.out")
    configure_err = os.path.join(fc_dir, "configureBclToFastq.err")
    casava_out = os.path.join(fc_dir, "bclToFastq_R{:d}.out".format(2 - int(r1)))
    casava_err = os.path.join(fc_dir, "bclToFastq_R{:d}.err".format(2 - int(r1)))

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--mismatches", str(num_mismatches)])
    cl.extend(["--fastq-cluster-count", "0"])
    if samplesheet_file is not None:
        cl.extend(["--sample-sheet", samplesheet_file])
    if im_stats:
        cl.append("--ignore-missing-stats")
    if im_bcl:
        cl.append("--ignore-missing-bcl")
    if im_control:
        cl.append("--ignore-missing-control")

    bm = _get_bases_mask(fc_dir)
    if bm is not None:
        cl.extend(["--use-bases-mask", bm])

    if r1:
        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)
        co = open(configure_out, 'w')
        ce = open(configure_err, 'w')
        try:
            subprocess.check_call(cl, stdout=co, stderr=ce)
        except subprocess.CalledProcessError as e:
            logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED "
                          "(exit code {}), please check log files "
                          "{:s}, {:s}".format(fc_dir, str(e.returncode),
                                              configure_out, configure_err))
            raise e
        finally:
            # Close the log file handles whether or not the configuration succeeded
            co.close()
            ce.close()
def _log_messages(self, log_handler, subject="Test email"):
    try:
        with log_handler.applicationbound():
            with logbook.Processor(lambda record: record.extra.__setitem__('run', subject)):
                logger2.debug("DEBUG record test generated @ %s" % time.strftime("%x - %X"))
                logger2.info("INFO record test generated @ %s" % time.strftime("%x - %X"))
                logger2.notice("NOTICE record test generated @ %s" % time.strftime("%x - %X"))
                logger2.warning("WARNING record test generated @ %s" % time.strftime("%x - %X"))
                logger2.error("ERROR record test generated @ %s" % time.strftime("%x - %X"))
                logger2.critical("CRITICAL record test generated @ %s" % time.strftime("%x - %X"))
    except Exception as e:
        return e
    return None
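# A minimal sketch of how _log_messages might be exercised from a test,
# assuming logbook's TestHandler; the test name and assertions are
# illustrative, not from the source.
def test_log_messages(self):
    handler = logbook.TestHandler()
    err = self._log_messages(handler, subject="Test run")
    assert err is None, "logging raised: {}".format(err)
    # One record per level should have been captured by the handler
    assert len(handler.records) == 6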
def simple_upload(remote_info, data):
    """Upload generated files to the specified host using rsync.
    """
    include = ['--include=*/']
    for fcopy in data['to_copy']:
        # By including both these patterns we get the entire directory
        # if a directory is given, or a single file if a single file is given.
        include.extend(["--include", "{}**/*".format(fcopy)])
        include.append("--include={}".format(fcopy))

    cl = ["rsync",
          "--checksum",
          "--recursive",
          "--links",
          "-D",
          "--partial",
          "--progress",
          "--prune-empty-dirs"]
    # File / dir inclusion specification
    cl.extend(include)
    cl.append("--exclude=*")
    # Source and target
    cl.extend([data["directory"],
               "{store_user}@{store_host}:{store_dir}".format(**remote_info)])

    logdir = remote_info.get("log_dir", os.getcwd())
    rsync_out = os.path.join(logdir, "rsync_transfer.out")
    rsync_err = os.path.join(logdir, "rsync_transfer.err")
    with open(rsync_out, 'a') as ro, open(rsync_err, 'a') as re:
        try:
            # Record the full command line at the top of both log files
            ro.write("-----------\n{}\n".format(" ".join(cl)))
            re.write("-----------\n{}\n".format(" ".join(cl)))
            ro.flush()
            re.flush()
            subprocess.check_call(cl, stdout=ro, stderr=re)
        except subprocess.CalledProcessError as e:
            logger2.error("rsync transfer of {} FAILED (exit code {}). "
                          "Please check log files {:s} and {:s}".format(
                              data["directory"], str(e.returncode),
                              rsync_out, rsync_err))
            raise e
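# Illustrative call to simple_upload; the dictionary keys mirror the ones
# the function reads, but the host, user and paths are made-up placeholders.
remote_info = {"store_user": "transfer",
               "store_host": "storage.example.org",
               "store_dir": "/archive/runs",
               "log_dir": "/var/log/transfers"}
data = {"directory": "/data/120101_SN001_0001_AABC123CXX",
        "to_copy": ["Unaligned", "RunInfo.xml"]}
simple_upload(remote_info, data)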
def _write_to_worksheet(client, ssheet, wsheet_title, rows, header, append, keys=[]):
    """Generic method to write a set of rows to a worksheet on Google Docs.
    """
    # Convert the worksheet title to unicode
    wsheet_title = _to_unicode(wsheet_title)

    # Add a new worksheet, possibly appending or replacing a pre-existing
    # worksheet according to the append flag.
    wsheet = g_spreadsheet.add_worksheet(client,
                                         ssheet,
                                         wsheet_title,
                                         len(rows) + 1,
                                         len(header),
                                         append)
    if wsheet is None:
        logger2.error("ERROR: Could not add a worksheet {!r} to "
                      "spreadsheet {!r}".format(wsheet_title, ssheet.title.text))
        return False

    # If keys are specified (these correspond to indexes in the header),
    # delete pre-existing rows with matching keys
    if append and len(keys) > 0:
        wsheet_data = g_spreadsheet.get_cell_content(client, ssheet, wsheet, '2')
        wsheet_header = g_spreadsheet.get_header(client, ssheet, wsheet)
        try:
            wsheet_indexes = [wsheet_header.index(key) for key in keys]
            header_indexes = [header.index(key) for key in keys]
        except ValueError:
            logger2.warn("WARNING: Could not identify correct header for duplicate detection")
        else:
            for row in rows:
                try:
                    key = "#".join([row[i] for i in header_indexes])
                    for i, wrow in enumerate(wsheet_data):
                        wkey = "#".join([wrow[j] for j in wsheet_indexes])
                        if wkey == key:
                            g_spreadsheet.delete_row(client, ssheet, wsheet, i + 1)
                            wsheet_data.pop(i)
                            break
                except Exception:
                    logger2.warn("WARNING: Could not identify/replace duplicate rows")

    # Write the data to the worksheet
    success = g_spreadsheet.write_rows(client, ssheet, wsheet, header, rows)
    if success:
        logger2.info("Wrote data to the {!r}:{!r} "
                     "worksheet".format(ssheet.title.text, wsheet_title))
    else:
        logger2.error("ERROR: Could not write data to the {!r}:{!r} "
                      "worksheet".format(ssheet.title.text, wsheet_title))
    return success
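# Hypothetical invocation, assuming an authenticated gdata client and an
# existing spreadsheet object; the header, rows and keys are made-up examples.
header = ["sample", "lane", "read_count"]
rows = [["P001_101", "1", "12345678"],
        ["P001_102", "2", "23456789"]]
_write_to_worksheet(client, ssheet, "Run summary", rows, header,
                    append=True, keys=["sample", "lane"])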
def snpeff_effects(vcf_in, genome, config):
    """Prepare tab-delimited file for variant effects using snpEff.
    """
    interval_file = config["algorithm"].get("hybrid_target", None)
    if _vcf_has_items(vcf_in) and config["algorithm"].get("variation_effects", True):
        se_interval = (_convert_to_snpeff_interval(interval_file, vcf_in)
                       if interval_file else None)
        try:
            snpeff_data_dir = os.path.join(config["program"]["snpEff"], "data")
            snpeff_genome_remap = config.get("resources", {}).get(
                "snpEff", {}).get("genome_remap", SNPEFF_GENOME_REMAP)
            assert genome in snpeff_genome_remap, \
                "The genome {} is not present in the SnpEff genome dictionary.".format(genome)
            # Pick the first remapped genome that has a local snpEff data directory
            for snpeff_genome in snpeff_genome_remap[genome]:
                if os.path.exists(os.path.join(snpeff_data_dir, snpeff_genome)):
                    break
            vcf_file = _run_snpeff(vcf_in, snpeff_genome, se_interval, "vcf", config)
            effects_file = _run_snpeff(vcf_in, snpeff_genome, se_interval, "txt", config)
        finally:
            for fname in [se_interval]:
                if fname and os.path.exists(fname):
                    os.remove(fname)
        return vcf_file, effects_file
    else:
        return None, None
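# Sketch of the configuration snpeff_effects consults; the snpEff install
# path, input file and genome name are assumptions for illustration.
config = {"program": {"snpEff": "/opt/snpEff"},
          "algorithm": {"hybrid_target": None,
                        "variation_effects": True}}
vcf_file, effects_file = snpeff_effects("sample-variants.vcf", "hg19", config)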
def process_second_read(*args, **kwargs):
    """Processing to be performed after all reads have been sequenced.
    """
    dname, config = args[0:2]
    logger2.info("The instrument has finished dumping on directory %s" % dname)
    utils.touch_indicator_file(os.path.join(dname, "second_read_processing_started.txt"))
    _update_reported(config["msg_db"], dname)
    fastq_dir = None

    # Do bcl -> fastq conversion and demultiplexing using CASAVA 1.8+
    if kwargs.get("casava", False):
        if not kwargs.get("no_casava_processing", False):
            logger2.info("Generating fastq.gz files for {:s}".format(dname))
            _generate_fastq_with_casava(dname, config)
            # Merge demultiplexing results into a single Unaligned folder
            utils.merge_demux_results(dname)

        # Move the demultiplexing results
        if 'mfs_dir' in config:
            fc_id = os.path.basename(dname)
            cl = ["rsync",
                  "--checksum",
                  "--recursive",
                  "--links",
                  "-D",
                  "--partial",
                  "--progress",
                  "--prune-empty-dirs",
                  os.path.join(dname, 'Unaligned'),
                  os.path.join(config.get('mfs_dir'), fc_id)]
            logger2.info("Syncing Unaligned folder to MooseFS for run {}".format(fc_id))
            # Fall back to the current directory if no log_dir is configured
            logdir = config.get('log_dir', os.getcwd())
            rsync_out = os.path.join(logdir, "rsync_transfer.out")
            rsync_err = os.path.join(logdir, "rsync_transfer.err")
            with open(rsync_out, 'a') as ro:
                with open(rsync_err, 'a') as re:
                    try:
                        ro.write("-----------\n{}\n".format(" ".join(cl)))
                        re.write("-----------\n{}\n".format(" ".join(cl)))
                        subprocess.check_call(cl, stdout=ro, stderr=re)
                    except subprocess.CalledProcessError as e:
                        logger2.error("rsync transfer of Unaligned results FAILED")
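# Hypothetical call from the message-handling layer; the run directory is a
# placeholder and the flags mirror the kwargs the function inspects.
process_second_read("/data/120101_SN001_0001_AABC123CXX", config,
                    casava=True, no_casava_processing=False)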
def initial_processing(*args, **kwargs):
    """Initial processing to be performed after the first base report.
    """
    dname, config = args[0:2]
    # Touch the indicator flag that processing of read1 has been started
    utils.touch_indicator_file(os.path.join(dname, "initial_processing_started.txt"))

    # Copy the samplesheet to the run folder
    ss_file = samplesheet.run_has_samplesheet(dname, config)
    if ss_file:
        dst = os.path.join(dname, os.path.basename(ss_file))
        try:
            copyfile(ss_file, dst)
        except IOError as e:
            logger2.error("Error copying samplesheet {} from {} to {}: {}"
                          "".format(os.path.basename(ss_file),
                                    os.path.dirname(ss_file),
                                    os.path.dirname(dst), e))
    # At this point cl is expected to hold the make command that performs the
    # conversion; the "r1" target restricts processing to read 1
    if r1:
        cl.append("r1")
    logger2.info("Demultiplexing and converting bcl to fastq.gz")
    logger2.debug(cl)
    co = open(casava_out, 'w')
    ce = open(casava_err, 'w')
    try:
        subprocess.check_call(cl, stdout=co, stderr=ce)
    except subprocess.CalledProcessError as e:
        logger2.error("BCL to Fastq conversion for {:s} FAILED "
                      "(exit code {}), please check log files {:s}, "
                      "{:s}".format(fc_dir, str(e.returncode),
                                    casava_out, casava_err))
        raise e
    finally:
        # Close the log file handles whether or not the conversion succeeded
        co.close()
        ce.close()
    logger2.debug("Done")
    return unaligned_dir


def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
def _generate_fastq_with_casava_task(args):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowcell using CASAVA (>1.8).
    """
    bp = args.get('bp')
    samples_group = args.get('samples')
    base_mask = samples_group['base_mask']
    samples = samples_group['samples']
    fc_dir = args.get('fc_dir')
    config = args.get('config')
    r1 = args.get('r1', False)
    idx_only = args.get('idx_only', False)
    ss = 'SampleSheet_{bp}bp.csv'.format(bp=str(bp))
    unaligned_folder = 'Unaligned_{bp}bp'.format(bp=str(bp))
    out_file = 'configureBclToFastq_{bp}bp.out'.format(bp=str(bp))
    err_file = 'configureBclToFastq_{bp}bp.err'.format(bp=str(bp))

    # Prepare CL arguments and call configureBclToFastq
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    out_dir = config.get("out_directory", fc_dir)
    # Append the flowcell dir to the output directory if different from the run dir
    if out_dir != fc_dir:
        out_dir = os.path.join(out_dir, os.path.basename(fc_dir))
    unaligned_dir = os.path.join(out_dir, unaligned_folder)
    samplesheet_file = os.path.join(fc_dir, ss)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)
    im_stats = config["algorithm"].get("ignore-missing-stats", False)
    im_bcl = config["algorithm"].get("ignore-missing-bcl", False)
    im_control = config["algorithm"].get("ignore-missing-control", False)

    # Write to log files
    configure_out = os.path.join(fc_dir, out_file)
    configure_err = os.path.join(fc_dir, err_file)
    casava_out = os.path.join(fc_dir, "bclToFastq_R{:d}.out".format(2 - int(r1)))
    casava_err = os.path.join(fc_dir, "bclToFastq_R{:d}.err".format(2 - int(r1)))

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--mismatches", str(num_mismatches)])
    cl.extend(["--fastq-cluster-count", "0"])
    if samplesheet_file is not None:
        cl.extend(["--sample-sheet", samplesheet_file])
    if im_stats:
        cl.append("--ignore-missing-stats")
    if im_bcl:
        cl.append("--ignore-missing-bcl")
    if im_control:
        cl.append("--ignore-missing-control")
    if base_mask is not None:
        cl.extend(["--use-bases-mask", ','.join(base_mask)])
    if r1:
        cl.append("--force")

    if r1 or idx_only:
        # Create a separate samplesheet and folder for this read length.
        # The writer is named ss_writer to avoid shadowing the samplesheet module.
        with open(os.path.join(fc_dir, ss), 'w') as fh:
            ss_writer = csv.DictWriter(fh, fieldnames=samples['fieldnames'], dialect='excel')
            ss_writer.writeheader()
            ss_writer.writerows(samples['samples'])

        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)
        co = open(configure_out, 'w')
        ce = open(configure_err, 'w')
        try:
            co.write("{}\n".format(" ".join(cl)))
            ce.write("{}\n".format(" ".join(cl)))
            subprocess.check_call(cl, stdout=co, stderr=ce)
        except subprocess.CalledProcessError as e:
            logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED "
                          "(exit code {}), please check log files "
                          "{:s}, {:s}".format(fc_dir, str(e.returncode),
                                              configure_out, configure_err))
            raise e
        finally:
            # Close the log file handles whether or not the configuration succeeded
            co.close()
            ce.close()
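# Illustrative task-argument dictionary for _generate_fastq_with_casava_task;
# the keys follow what the task reads, the concrete values are made up.
task_args = {"bp": 101,
             "samples": {"base_mask": ["Y101", "I6n", "Y101"],
                         "samples": {"fieldnames": ["FCID", "Lane", "SampleID", "Index"],
                                     "samples": [{"FCID": "ABC123CXX", "Lane": "1",
                                                  "SampleID": "P001_101", "Index": "ACGTAC"}]}},
             "fc_dir": "/data/120101_SN001_0001_AABC123CXX",
             "config": config,
             "r1": True}
_generate_fastq_with_casava_task(task_args)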
    # If this is a MiSeq run and we have the scilifelab modules loaded,
    # convert the MiSeq samplesheet into a format compatible with CASAVA
    elif _is_miseq_run(dname):
        if 'scilifelab.illumina.miseq' in sys.modules:
            mrun = MiSeqRun(dname)
            hiseq_ssheet = os.path.join(dname, '{}.csv'.format(_get_flowcell_id(dname)))
            mrun.write_hiseq_samplesheet(hiseq_ssheet)
        # If the module wasn't loaded, there's nothing we can do, so warn
        else:
            logger2.error("The necessary dependencies for processing MiSeq "
                          "runs with CASAVA could not be loaded")

    # Upload the necessary files
    loc_args = args + (None, )
    _post_process_run(*loc_args,
                      **{"fetch_msg": kwargs.get("fetch_msg", False),
                         "process_msg": False,
                         "store_msg": kwargs.get("store_msg", False),
                         "backup_msg": kwargs.get("backup_msg", False),
                         "push_data": kwargs.get("push_data", False)})

    # Touch the indicator flag that processing of read1 has been completed
    utils.touch_indicator_file(os.path.join(dname, "initial_processing_completed.txt"))


def process_first_read(*args, **kwargs):
    """Processing to be performed after the first read and the index reads
    """