def get_bases_mask(run_info_xml, sample_sheet_file=None):
    """
    Work out the bases mask string for a sequencing run

    The starting point is the bases mask reported by
    ``IlluminaData.IlluminaRunInfo`` for the supplied RunInfo.xml
    file. If a sample sheet is also supplied then the index
    sequence of its first data line is used to adjust the mask
    (via ``IlluminaData.fix_bases_mask``), unless that sequence is
    recognised as a 10xGenomics sample set ID, in which case the
    mask is left as it is.

    Progress is reported on stdout.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: (optional) path to sample sheet file

    Returns:
      Bases mask string e.g. 'y101,I6'.
    """
    # Mask as implied by the run information alone
    mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    print("Bases mask: %s (from RunInfo.xml)" % mask)
    if sample_sheet_file is None:
        # Nothing to refine the mask with
        return mask
    # The first line of the sample sheet supplies the barcode
    # used as the example for adjusting the mask
    first_line = IlluminaData.SampleSheet(sample_sheet_file).data[0]
    barcode = IlluminaData.samplesheet_index_sequence(first_line)
    if barcode is None:
        barcode = ""
    if not barcode_is_10xgenomics(barcode):
        mask = IlluminaData.fix_bases_mask(mask, barcode)
    else:
        print("Bases mask: barcode is 10xGenomics sample set ID")
    print("Bases mask: %s (updated for barcode sequence '%s')" %
          (mask, barcode))
    return mask
def setup(self):
    """
    Build the 'analyse_barcodes.py' command for this task

    Removes any existing copies of the report outputs
    ('barcodes.report', 'barcodes.xls' and 'barcodes.html' under
    the barcode analysis directory), assembles the command line
    from the task arguments, registers the command via
    ``add_cmd`` and stores the output file paths in the task's
    output parameters.

    If no lanes were supplied explicitly but a sample sheet was,
    then the lanes are taken from the 'Lane' column of the sample
    sheet (if it has one).
    """
    # Output files all live in the barcode analysis directory
    out_dir = self.args.barcode_analysis_dir
    report_file, xls_file, html_file = [
        os.path.join(out_dir, name)
        for name in ('barcodes.report', 'barcodes.xls', 'barcodes.html')
    ]
    # Clear out outputs from any previous run
    for old_file in (report_file, xls_file, html_file):
        if os.path.exists(old_file):
            os.remove(old_file)
    # Core command
    analyse_cmd = PipelineCommandWrapper(
        "Run analyse_barcodes.py to report barcodes",
        'analyse_barcodes.py',
        '--report', report_file,
        '--xls', xls_file,
        '--html', html_file)
    if self.args.sample_sheet:
        analyse_cmd.add_args('--sample-sheet', self.args.sample_sheet)
    # Work out which lanes to restrict the analysis to
    lanes = None
    if self.args.lanes:
        lanes = self.args.lanes
    elif self.args.sample_sheet:
        # Implicitly get lanes from sample sheet
        try:
            sheet_data = IlluminaData.SampleSheet(
                self.args.sample_sheet).data
            lanes = sorted({entry['Lane'] for entry in sheet_data})
        except KeyError:
            # Sample sheet has no 'Lane' column
            lanes = None
    if lanes:
        analyse_cmd.add_args('--lanes',
                             ','.join([str(lane) for lane in lanes]))
    # Optional settings are only passed on when they are set
    for option, value in (('--cutoff', self.args.cutoff),
                          ('--mismatches', self.args.mismatches),
                          ('--title', self.args.title)):
        if value:
            analyse_cmd.add_args(option, value)
    # Counts files always come last
    analyse_cmd.add_args('-c')
    analyse_cmd.add_args(*self.args.counts_files)
    self.add_cmd(analyse_cmd)
    # Update the output parameters
    self.output.report_file.set(report_file)
    self.output.xls_file.set(xls_file)
    self.output.html_file.set(html_file)
def check_barcode_collisions(sample_sheet_file, nmismatches):
    """
    Check sample sheet for barcode collisions

    Check barcode index sequences within each lane (or across
    all samples, if no lane information is present) and find
    any which differ in fewer bases than a threshold number
    which is calculated as:

    less than 2 times the number of mismatches plus 1

    (as is stated in the output from bcl2fastq v2.)

    Pairs of barcodes which are too similar (i.e. which collide)
    are reported as a list of tuples, e.g.

    [('ATTCCT','ATTCCG'),...]

    Notes:

    - for dual-indexed samples the two index sequences are joined
      into a single sequence before comparison
    - sequences are compared position by position; when two
      sequences have different lengths only the leading bases
      they have in common are compared
    - samples which have no index sequence at all are left out of
      the comparison (there is nothing to compare them on)

    Arguments:
      sample_sheet_file (str): path to a SampleSheet.csv file
        to analyse for barcode collisions
      nmismatches (int): maximum number of mismatches to allow

    Returns:
      List: list of pairs of colliding barcodes (with each
        pair wrapped in a tuple), or an empty list if no
        collisions were detected.
    """
    # Load the sample sheet data
    sample_sheet = IlluminaData.SampleSheet(sample_sheet_file)
    has_lanes = sample_sheet.has_lanes
    # Index sequences (barcodes) grouped by lane
    barcodes = {}
    for line in sample_sheet:
        # Without lane information everything goes in one group
        lane = line['Lane'] if has_lanes else 1
        # Index sequence
        try:
            # Try dual-indexed IEM4 format
            indx = "%s%s" % (line['index'].strip(),
                             line['index2'].strip())
        except KeyError:
            # Try single indexed IEM4 (no index2)
            try:
                indx = line['index'].strip()
            except KeyError:
                # Try CASAVA format
                try:
                    indx = line['Index'].strip()
                except KeyError:
                    # No index columns
                    indx = ""
        # Explicitly record an empty index as None
        barcodes.setdefault(lane, []).append(indx if indx else None)
    # Mismatch threshold
    mismatch_threshold = 2 * nmismatches + 1
    # Check for collisions
    collisions = []
    for lane in barcodes:
        # Missing index sequences (None) can't be iterated over,
        # so drop them before comparing (previously these caused
        # a TypeError in 'zip')
        sequences = [seq for seq in barcodes[lane] if seq is not None]
        for i, seq1 in enumerate(sequences[:-1]):
            for seq2 in sequences[i + 1:]:
                ndiff = sum(1 for c1, c2 in zip(seq1, seq2) if c1 != c2)
                if ndiff < mismatch_threshold:
                    collisions.append((seq1, seq2))
    return collisions
def make_custom_sample_sheet(input_sample_sheet, output_sample_sheet=None,
                             lanes=None, fmt=None):
    """
    Make a cleaned-up copy of a sample sheet

    Loads the input sample sheet into a SampleSheet object and
    tidies it up: lines with no project name get the sample name
    as their project, and illegal and duplicated names are fixed
    using the SampleSheet object's own methods.

    Optionally the result can also be restricted to a subset of
    lanes, and written out to a new file (in a different format
    from the original, if required).

    Arguments:
      input_sample_sheet (str): name and path of the original
        sample sheet file
      output_sample_sheet (str): (optional) name and path to
        write updated sample sheet to, or `None`
      lanes (list): (optional) list of lane numbers to keep in
        the output sample sheet; if `None` then all lanes will
        be kept (the default), otherwise lanes will be dropped
        if they don't appear in the supplied list
      fmt (str): (optional) format for the output sample sheet,
        either 'CASAVA' or 'IEM'; if this is `None` then the
        format of the original file will be used

    Returns:
      SampleSheet object with the data for the corrected
        sample sheet.

    Raises:
      Exception: if the format of the input sample sheet is
        neither 'CASAVA' nor 'IEM'.
    """
    sheet = IlluminaData.SampleSheet(input_sample_sheet)
    # Names of the sample and project columns for each format
    columns_for_format = {
        'CASAVA': ('SampleID', 'SampleProject'),
        'IEM': ('Sample_ID', 'Sample_Project'),
    }
    columns = columns_for_format.get(sheet.format)
    if columns is None:
        raise Exception("Unknown sample sheet format: %s" %
                        sheet.format)
    sample_col, project_col = columns
    # Fall back to the sample name where the project is missing
    for entry in sheet:
        if not entry[project_col]:
            entry[project_col] = entry[sample_col]
    # Fix other problems
    sheet.fix_illegal_names()
    sheet.fix_duplicated_names()
    # Drop lines for lanes that weren't asked for
    if lanes is not None:
        logging.debug("Updating to include only specified lanes: %s" %
                      ','.join([str(lane) for lane in lanes]))
        position = 0
        while position < len(sheet):
            entry = sheet[position]
            if entry['Lane'] not in lanes:
                # Deleting shifts the next line into this position
                del sheet[position]
                continue
            logging.debug("Keeping %s" % entry)
            position += 1
    # Write out new sample sheet
    if output_sample_sheet is not None:
        sheet.write(output_sample_sheet, fmt=fmt)
    return sheet
"IEM sample sheet to older format)") p.add_argument('sample_sheet',metavar="SAMPLE_SHEET", help="input sample sheet file") # Process command line args = p.parse_args() if args.miseq: logging.warning("--miseq option no longer necessary; " "MiSEQ-style sample sheets are now converted " "automatically") # Get input sample sheet file samplesheet = args.sample_sheet if not os.path.isfile(samplesheet): logging.error("sample sheet '%s': not found" % samplesheet) sys.exit(1) # Read in the sample sheet data = IlluminaData.SampleSheet(samplesheet) if data.format is None: logging.error("Unable to determine samplesheet format") sys.exit(1) print "Sample sheet format: %s" % data.format # Remove lanes if args.lanes is not None: if not data.has_lanes: logging.error("sample sheet doesn't define any lanes") sys.exit(1) lanes = parse_lanes(args.lanes) print "Keeping lanes %s, removing the rest" % \ ','.join([str(x) for x in lanes]) i = 0 while i < len(data): line = data[i]
def check_barcode_collisions(sample_sheet_file, nmismatches,
                             use_index='all'):
    """
    Check sample sheet for barcode collisions

    Check barcode index sequences within each lane (or across
    all samples, if no lane information is present) and find
    any which differ in fewer bases than a threshold number
    which is calculated as:

    less than 2 times the number of mismatches plus 1

    (as is stated in the output from bcl2fastq v2.)

    Pairs of barcodes which are too similar (i.e. which collide)
    are reported as a list of tuples, e.g.

    [('ATTCCT','ATTCCG'),...]

    Notes:

    - sequences are compared position by position; when two
      sequences have different lengths only the leading bases
      they have in common are compared
    - samples which have no sequence for the selected index
      (e.g. 'use_index' is '2' but there is no 'index2' value)
      are left out of the comparison, as there is nothing to
      compare them on

    Arguments:
      sample_sheet_file (str): path to a SampleSheet.csv file
        to analyse for barcode collisions
      nmismatches (int): maximum number of mismatches to allow
      use_index (str): flag indicating how to treat index
        sequences: 'all' (the default) combines indexes into a
        single sequence before checking for collisions, '1'
        only checks index 1 (i7), and '2' only checks index 2
        (i5)

    Returns:
      List: list of pairs of colliding barcodes (with each
        pair wrapped in a tuple), or an empty list if no
        collisions were detected.

    Raises:
      Exception: if 'use_index' isn't one of the recognised
        values (only detected when the sample sheet has at
        least one data line).
    """
    # Load the sample sheet data
    sample_sheet = IlluminaData.SampleSheet(sample_sheet_file)
    has_lanes = sample_sheet.has_lanes
    # Convert index flag to string (so e.g. integer 1 is accepted)
    use_index = str(use_index)
    # Index sequences (barcodes) grouped by lane
    barcodes = {}
    for line in sample_sheet:
        # Without lane information everything goes in one group
        lane = line['Lane'] if has_lanes else 1
        # Extract i7 index sequence: 'index' is the IEM4 column
        # name, 'Index' is the CASAVA one
        indx_i7 = None
        for column in ('index', 'Index'):
            try:
                indx_i7 = line[column].strip()
                break
            except KeyError:
                continue
        # Extract i5 index sequence (IEM4 only, no i5 for CASAVA)
        try:
            indx_i5 = line['index2'].strip()
        except KeyError:
            indx_i5 = None
        # Assemble index sequence to check for mismatches
        if use_index == "all":
            # Combine i7 and i5 into a single sequence
            indx = "%s%s" % (indx_i7 if indx_i7 else '',
                             indx_i5 if indx_i5 else '')
        elif use_index == "1":
            # Only use i7
            indx = indx_i7
        elif use_index == "2":
            # Only use i5
            indx = indx_i5
        else:
            # Undefined index type
            raise Exception("Unrecognised index: '%s'" % use_index)
        # Explicitly record an empty index as None
        barcodes.setdefault(lane, []).append(indx if indx else None)
    # Mismatch threshold
    mismatch_threshold = 2 * nmismatches + 1
    # Check for collisions
    collisions = []
    for lane in barcodes:
        # Missing index sequences (None) can't be iterated over,
        # so drop them before comparing (previously these caused
        # a TypeError in 'zip')
        sequences = [seq for seq in barcodes[lane] if seq is not None]
        for i, seq1 in enumerate(sequences[:-1]):
            for seq2 in sequences[i + 1:]:
                ndiff = sum(1 for c1, c2 in zip(seq1, seq2) if c1 != c2)
                if ndiff < mismatch_threshold:
                    collisions.append((seq1, seq2))
    return collisions
def make_fastqs(ap, protocol='standard', platform=None,
                unaligned_dir=None, sample_sheet=None, lanes=None,
                ignore_missing_bcl=False, ignore_missing_stats=False,
                skip_rsync=False, remove_primary_data=False,
                nprocessors=None, require_bcl2fastq_version=None,
                bases_mask=None, no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                generate_stats=True, stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True, barcode_analysis_dir=None,
                skip_fastq_generation=False,
                only_fetch_primary_data=False,
                create_empty_fastqs=None, runner=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False):
    """Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and analysis.
    The operations are typically:

    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics

    If the number of processors and the job runner are not explicitly
    specified then these are taken from the settings for the bcl2fastq
    and the statistics generation steps, which may differ from each
    other. However if either of these values are set explicitly then
    the same values will be used for both steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output directory
        for bcl-to-fastq conversion. Default is 'bcl2fastq' (unless
        an alternative is already specified in the config file)
      sample_sheet (str): if set then use this as the input samplesheet
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      nprocessors (int) : number of processors to run bclToFastq.py with
      ignore_missing_bcl (bool): if True then run bcl2fastq with
        --ignore-missing-bcl
      ignore_missing_stats (bool): if True then run bcl2fastq with
        --ignore-missing-stats
      skip_rsync (bool): if True then don't rsync primary data at the
        start of bcl2fastq conversion
      remove_primary_data (bool): if True then remove primary data at
        the end of bcl2fastq conversion (default is to keep it)
      generate_stats (bool): if True then (re)generate statistics file
        for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes for
        fastqs
      require_bcl2fastq_version (str): (optional) specify bcl2fastq
        version to use. Should be a string of the form '1.8.4' or
        '>2.0'. Set to None to automatically determine required
        bcl2fastq version.
      bases_mask (str): if set then use this as an alternative bases
        mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify minimum
        length for reads after adapter trimming (shorter reads will
        be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the minimum
        length of ACGT bases that must be present in a read after
        adapter trimming for it not to be masked completely
        with Ns.
      stats_file (str): if set then use this as the name of the output
        per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name of
        the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to the
        output directory for barcode analysis
      skip_fastq_generation (bool): if True then don't perform fastq
        generation
      only_fetch_primary_data (bool): if True then fetch primary data,
        don't do anything else
      create_empty_fastqs (bool): if True then create empty 'placeholder'
        fastq files for any missing fastqs after bcl2fastq
        (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job runner
        to use for fastq generation
      cellranger_jobmode (str): (optional) job mode to run cellranger in
        (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per core
        (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maximum number of concurrent
        jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
        submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of cores
        cellranger can request in jobmode 'local' (10xGenomics Chromium
        SC data only)
      cellranger_localmem (int): (optional) maximum memory cellranger
        can request in jobmode 'local' (10xGenomics Chromium SC data
        only)
      cellranger_ignore_dual_index (bool): (optional) on a dual-indexed
        flowcell where the second index was not used for the 10x
        sample, ignore it (10xGenomics Chromium SC data only).
        NOTE(review): this argument is accepted but is not currently
        passed on to any of the calls made by this function.

    Raises:
      Exception: if the protocol is not recognised, the sample sheet
        is missing or fails the checks, the primary data can't be
        acquired, or Fastq generation fails or doesn't produce the
        expected outputs.
    """
    # Report protocol
    print("Protocol : %s" % protocol)
    if protocol not in MAKE_FASTQS_PROTOCOLS:
        # Join the protocol names themselves (wrapping the whole
        # collection in a list made 'join' fail with a TypeError
        # instead of producing this message)
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol,
                                 ','.join([str(p) for p in
                                           MAKE_FASTQS_PROTOCOLS])))
    # Unaligned dir
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print("Output dir : %s" % ap.params.unaligned_dir)
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print("Source sample sheet : %s" % ap.params.sample_sheet)
    # Check requested lanes are actually present
    print("Lanes : %s" % ('all' if lanes is None
                          else ','.join([str(l) for l in lanes])))
    if lanes is not None:
        s = IlluminaData.SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = set([line['Lane'] for line in s])
        for lane in lanes:
            if lane not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % lane)
    # Make a temporary sample sheet (timestamped so that repeated
    # runs don't overwrite each other's copies)
    if lanes:
        lanes_id = ".L%s" % ''.join([str(l) for l in lanes])
    else:
        lanes_id = ""
    sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet%s.%s.csv" % (lanes_id,
                                  time.strftime("%Y%m%d%H%M%S")))
    make_custom_sample_sheet(ap.params.sample_sheet,
                             sample_sheet,
                             lanes=lanes)
    # Check the temporary sample sheet
    print("Checking temporary sample sheet")
    invalid_barcodes = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_barcodes()
    if invalid_barcodes:
        logger.error("Invalid barcodes detected")
        for line in invalid_barcodes:
            logger.critical("%s" % line)
    invalid_characters = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_characters()
    if invalid_characters:
        logger.critical("Invalid non-printing/non-ASCII characters "
                        "detected")
    if invalid_barcodes or invalid_characters:
        raise Exception("Errors detected in generated sample sheet")
    # Adjust verification settings for 10xGenomics Chromium SC
    # data if necessary
    verify_include_sample_dir = False
    if has_chromium_sc_indices(sample_sheet):
        if protocol in ('10x_chromium_sc', '10x_chromium_sc_atac',):
            # Force inclusion of sample-name subdirectories
            # when verifying Chromium SC data
            print("Sample sheet includes Chromium SC indices")
            verify_include_sample_dir = True
        else:
            # Chromium SC indices detected but not using
            # 10x_chromium_sc protocol
            raise Exception("Detected 10xGenomics Chromium SC indices "
                            "in generated sample sheet but protocol "
                            "'%s' has been specified; use an "
                            "appropriate '10x_...' protocol for these "
                            "indices" % protocol)
    # Check for pre-existing Fastq outputs
    if verify_fastq_generation(
            ap,
            unaligned_dir=ap.params.unaligned_dir,
            lanes=lanes,
            include_sample_dir=verify_include_sample_dir):
        print("Expected Fastq outputs already present")
        skip_rsync = True
        skip_fastq_generation = True
    # Check if there's anything to do
    if (skip_rsync and skip_fastq_generation) and \
       not (generate_stats or analyse_barcodes):
        print("Nothing to do")
        return
    # Log dir
    log_dir = 'make_fastqs'
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Fetch primary data
    if not skip_rsync and not ap.params.acquired_primary_data:
        if get_primary_data(ap) != 0:
            logger.error("Failed to acquire primary data")
            raise Exception("Failed to acquire primary data")
        else:
            ap.params['acquired_primary_data'] = True
    if only_fetch_primary_data:
        return
    # Deal with platform information
    if not platform:
        platform = ap.metadata.platform
    # Do fastq generation using the specified protocol
    if not skip_fastq_generation:
        # Set primary data location and report info
        primary_data_dir = os.path.join(
            ap.params.primary_data_dir,
            os.path.basename(ap.params.data_dir))
        print("Primary data dir : %s" % primary_data_dir)
        try:
            illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                                    platform=platform)
        except IlluminaData.IlluminaDataPlatformError as ex:
            logger.critical("Error loading primary data: %s" % ex)
            if platform is None:
                logger.critical("Try specifying platform using "
                                "--platform?")
            else:
                logger.critical("Check specified platform is valid (or "
                                "omit --platform)")
            raise Exception("Error determining sequencer platform")
        print("Platform : %s" % illumina_run.platform)
        print("Bcl format : %s" % illumina_run.bcl_extension)
        # Set platform in metadata
        ap.metadata['platform'] = illumina_run.platform
        # Bases mask
        if bases_mask is not None:
            ap.params['bases_mask'] = bases_mask
        bases_mask = ap.params.bases_mask
        print("Bases mask setting : %s" % bases_mask)
        if protocol not in ('10x_chromium_sc', '10x_chromium_sc_atac',):
            if bases_mask == "auto":
                print("Determining bases mask from RunInfo.xml")
                bases_mask = get_bases_mask(illumina_run.runinfo_xml,
                                            sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" %
                                bases_mask)
        # Do fastq generation according to protocol
        if protocol == 'icell8':
            # ICell8 data
            # Update bcl2fastq settings appropriately
            print("Updating read trimming and masking for ICell8")
            minimum_trimmed_read_length = 21
            mask_short_adapter_reads = 0
            # Reset the default bases mask
            bases_mask = IlluminaData.IlluminaRunInfo(
                illumina_run.runinfo_xml).bases_mask
            bases_mask = get_icell8_bases_mask(bases_mask,
                                               sample_sheet=sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" %
                                bases_mask)
            # Switch to standard protocol (ICell8 only differs in
            # the settings adjusted above)
            protocol = 'standard'
        if protocol == 'standard':
            # Standard protocol
            try:
                exit_code = bcl_to_fastq(
                    ap,
                    unaligned_dir=ap.params.unaligned_dir,
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    require_bcl2fastq=require_bcl2fastq_version,
                    bases_mask=bases_mask,
                    ignore_missing_bcl=ignore_missing_bcl,
                    ignore_missing_stats=ignore_missing_stats,
                    no_lane_splitting=no_lane_splitting,
                    minimum_trimmed_read_length=minimum_trimmed_read_length,
                    mask_short_adapter_reads=mask_short_adapter_reads,
                    nprocessors=nprocessors,
                    runner=runner)
            except Exception as ex:
                raise Exception("Bcl2fastq stage failed: '%s'" % ex)
        elif protocol == '10x_chromium_sc':
            # 10xGenomics Chromium SC
            if bases_mask == 'auto':
                bases_mask = None
            try:
                # Check we have cellranger
                cellranger = find_program('cellranger')
                if not cellranger:
                    raise Exception("No cellranger package found")
                cellranger_software_info = cellranger_info(cellranger)
                print("Using cellranger %s: %s" %
                      (cellranger_software_info[-1], cellranger))
                # Check we have bcl2fastq
                bcl2fastq = find_program('bcl2fastq')
                if not bcl2fastq:
                    raise Exception("No bcl2fastq package found")
                bcl2fastq = available_bcl2fastq_versions(
                    paths=(os.path.dirname(bcl2fastq),),
                    reqs='>=2.17')
                if not bcl2fastq:
                    raise Exception("No appropriate bcl2fastq software "
                                    "located")
                bcl2fastq = bcl2fastq[0]
                bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
                print("Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                                  bcl2fastq))
                # Store info on bcl2fastq package
                ap.metadata['bcl2fastq_software'] = bcl2fastq_info
                # Store info on cellranger package
                ap.metadata['cellranger_software'] = \
                    cellranger_software_info
                # Put a copy of sample sheet in the log directory
                shutil.copy(sample_sheet, ap.log_dir)
                # Determine output directory absolute path
                output_dir = ap.params.unaligned_dir
                if not os.path.isabs(output_dir):
                    output_dir = os.path.join(ap.analysis_dir,
                                              output_dir)
                # Run cellranger mkfastq
                exit_code = run_cellranger_mkfastq(
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    output_dir=output_dir,
                    lanes=(None if lanes is None
                           else ','.join([str(l) for l in lanes])),
                    bases_mask=bases_mask,
                    cellranger_exe=cellranger,
                    cellranger_jobmode=cellranger_jobmode,
                    cellranger_maxjobs=cellranger_maxjobs,
                    cellranger_mempercore=cellranger_mempercore,
                    cellranger_jobinterval=cellranger_jobinterval,
                    cellranger_localcores=cellranger_localcores,
                    cellranger_localmem=cellranger_localmem,
                    working_dir=ap.analysis_dir,
                    log_dir=ap.log_dir)
            except Exception as ex:
                raise Exception("'cellranger mkfastq' stage failed: "
                                "'%s'" % ex)
            # Turn off barcode analysis
            analyse_barcodes = False
        elif protocol == '10x_chromium_sc_atac':
            # 10xGenomics Chromium scATAC-seq
            exit_code = bcl_to_fastq_10x_chromium_sc_atac(
                ap,
                output_dir=ap.params.unaligned_dir,
                sample_sheet=sample_sheet,
                primary_data_dir=primary_data_dir,
                lanes=lanes,
                bases_mask=bases_mask,
                cellranger_jobmode=cellranger_jobmode,
                cellranger_maxjobs=cellranger_maxjobs,
                cellranger_mempercore=cellranger_mempercore,
                cellranger_jobinterval=cellranger_jobinterval,
                cellranger_localcores=cellranger_localcores,
                cellranger_localmem=cellranger_localmem,
                log_dir=ap.log_dir)
            # Turn off barcode analysis
            analyse_barcodes = False
        else:
            # Unknown protocol
            raise Exception("Unknown protocol '%s'" % protocol)
        # Check the outputs
        if exit_code != 0:
            raise Exception("Fastq generation finished with error: "
                            "exit code %d" % exit_code)
        if not verify_fastq_generation(
                ap,
                lanes=lanes,
                include_sample_dir=verify_include_sample_dir):
            # Check failed
            logger.error("Failed to verify output Fastqs against "
                         "sample sheet")
            # Try to load the data from unaligned dir
            try:
                illumina_data = IlluminaData.IlluminaData(
                    ap.analysis_dir,
                    unaligned_dir=ap.params.unaligned_dir)
            except IlluminaData.IlluminaDataError as ex:
                raise Exception("Unable to load data from %s: %s"
                                % (ap.params.unaligned_dir, ex))
            # Generate a list of missing Fastqs
            missing_fastqs = IlluminaData.list_missing_fastqs(
                illumina_data,
                sample_sheet,
                include_sample_dir=verify_include_sample_dir)
            assert (len(missing_fastqs) > 0)
            missing_fastqs_file = os.path.join(ap.log_dir,
                                               "missing_fastqs.log")
            print("Writing list of missing Fastq files to %s" %
                  missing_fastqs_file)
            with open(missing_fastqs_file, 'w') as fp:
                for fq in missing_fastqs:
                    fp.write("%s\n" % fq)
            # Create empty FASTQs: an explicit argument wins, then
            # the platform-specific setting, then the general
            # bcl2fastq setting
            if create_empty_fastqs is None:
                try:
                    create_empty_fastqs = \
                        ap.settings.platform[ap.metadata.platform].\
                        create_empty_fastqs
                except (KeyError, AttributeError):
                    pass
            if create_empty_fastqs is None:
                create_empty_fastqs = \
                    ap.settings.bcl2fastq.create_empty_fastqs
            if create_empty_fastqs:
                logger.warning("Making 'empty' placeholder Fastqs")
                for fq in missing_fastqs:
                    fastq = os.path.join(ap.analysis_dir,
                                         ap.params.unaligned_dir, fq)
                    print("-- %s" % fastq)
                    if not os.path.exists(os.path.dirname(fastq)):
                        mkdirs(os.path.dirname(fastq))
                    with gzip.GzipFile(filename=fastq, mode='wb') as fp:
                        fp.write('')
            else:
                raise Exception("Fastq generation failed to produce "
                                "expected outputs")
    # Generate statistics
    if generate_stats:
        fastq_statistics(ap,
                         stats_file=stats_file,
                         per_lane_stats_file=per_lane_stats_file,
                         unaligned_dir=ap.params.unaligned_dir,
                         nprocessors=nprocessors,
                         runner=runner)
    # Run barcode analysis
    if analyse_barcodes:
        # Determine output directory
        if barcode_analysis_dir is not None:
            ap.params['barcode_analysis_dir'] = barcode_analysis_dir
        elif ap.params.barcode_analysis_dir is None:
            ap.params['barcode_analysis_dir'] = 'barcode_analysis'
        barcode_analysis_dir = ap.params.barcode_analysis_dir
        if not os.path.isabs(barcode_analysis_dir):
            barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                                barcode_analysis_dir)
        # Report title
        title = "Barcode analysis for %s" % ap.metadata.run_name
        # Log file
        log_file = os.path.join(ap.log_dir, "analyse_barcodes.log")
        # Set up runner
        if runner is None:
            runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        # Get scheduler parameters
        max_jobs = ap.settings.general.max_concurrent_jobs
        poll_interval = ap.settings.general.poll_interval
        # Create and run barcode analysis pipeline
        barcode_analysis = AnalyseBarcodes(
            os.path.join(ap.params.analysis_dir,
                         ap.params.unaligned_dir))
        barcode_analysis.run(barcode_analysis_dir,
                             title=title,
                             lanes=lanes,
                             sample_sheet=sample_sheet,
                             log_file=log_file,
                             runner=runner,
                             max_jobs=max_jobs,
                             poll_interval=poll_interval,
                             verbose=False)
    # Make a 'projects.info' metadata file
    if lanes:
        ap.update_project_metadata_file()
    else:
        ap.make_project_metadata_file()
    # Remove primary data
    if remove_primary_data:
        # Inside this function the name 'remove_primary_data' is the
        # boolean argument, which hides the function of the same name,
        # so calling it directly would raise a TypeError. Fetch the
        # function from the module namespace instead.
        # NOTE(review): assumes a module-level 'remove_primary_data(ap)'
        # function is defined in this file -- confirm
        globals()['remove_primary_data'](ap)