def __init__(self,sample_sheet=None,sample_sheet_file=None,fp=None):
    """
    Set up a SampleSheetLinter from one of three possible sources

    The sample sheet data is taken from the first of these
    that applies:

    1. `fp`, if it is not None
    2. `sample_sheet`, if it is not None
    3. the file named by `sample_sheet_file`

    Arguments:
      sample_sheet (SampleSheet): existing SampleSheet instance
        (wins over `sample_sheet_file` when both are given)
      sample_sheet_file (str): path to a sample sheet file,
        only read when neither `fp` nor `sample_sheet` is given
      fp (File): File-like object opened for reading; when
        supplied the SampleSheet is populated from this in
        preference to `sample_sheet`
    """
    # Keep hold of what the caller passed in
    self._fp = fp
    self._sample_sheet_file = sample_sheet_file
    self._sample_sheet = sample_sheet
    if fp is not None:
        # A stream always takes priority
        self._sample_sheet = SampleSheet(fp=fp)
    elif sample_sheet is None:
        # Nothing else supplied: fall back to the named file
        self._sample_sheet = SampleSheet(sample_sheet_file)
    # Hand the resolved sample sheet to the predictor base class
    SampleSheetPredictor.__init__(self,
                                  sample_sheet=self._sample_sheet)
def __init__(self, sample_sheet=None, sample_sheet_file=None, fp=None):
    """
    Create a new SampleSheetLinter instance

    Arguments:
      sample_sheet (SampleSheet): a ready-made SampleSheet
        instance; used as-is unless `fp` is given (and takes
        precedence over `sample_sheet_file`)
      sample_sheet_file (str): path to a sample sheet file,
        loaded only if both `fp` and `sample_sheet` are None
      fp (File): File-like object opened for reading; if not
        None then a new SampleSheet is built from it, in
        preference to `sample_sheet`
    """
    self._fp = fp
    self._sample_sheet_file = sample_sheet_file
    self._sample_sheet = sample_sheet
    read_from_stream = self._fp is not None
    if read_from_stream:
        self._sample_sheet = SampleSheet(fp=self._fp)
    else:
        if self._sample_sheet is None:
            self._sample_sheet = SampleSheet(sample_sheet_file)
    # Initialise the base class with whichever sheet was selected
    SampleSheetPredictor.__init__(self,
                                  sample_sheet=self._sample_sheet)
def has_chromium_sc_indices(sample_sheet):
    """
    Check if a sample sheet contains Chromium SC indices

    The Chromium SC indices can be obtained from:

    https://support.10xgenomics.com/permalink/27rGqWvNYYuqkgeS66sksm

    The Chromium SC 3'v2 indices are of the form:

    SI-GA-[A-H][1-12]

    e.g. 'SI-GA-B11'

    Arguments:
      sample_sheet (str): path to the sample sheet CSV
        file to check

    Returns:
      Boolean: True if the sample sheet contains at least
        one Chromium SC index, False if not (including when
        the sample sheet lines have no 'index' field).
    """
    # Column number is 1-12: two-digit values are 10, 11 and 12
    # (the previous '1[1-2]' alternative wrongly rejected 10)
    index_pattern = re.compile(r"SI-GA-[A-H](1[0-2]|[1-9])$")
    s = SampleSheet(sample_sheet)
    for line in s:
        try:
            if index_pattern.match(line['index']):
                return True
        except KeyError:
            # No 'index' field on this line, so it cannot
            # hold a Chromium index
            pass
    return False
def __init__(self, sample_sheet_file):
    """
    Build barcode/sample lookup tables from a sample sheet

    For every lane found in the sample sheet (or the single
    pseudo-lane `None` if the sheet has no lanes) two
    dictionaries are populated: one mapping index sequence to
    sample ID, and one mapping sample ID to index sequence.
    Dual index sequences are stored with '+' as the separator;
    a missing index is stored as an empty string.

    Arguments:
      sample_sheet_file (str): path of a SampleSheet.csv file
    """
    self._sample_sheet = SampleSheet(sample_sheet_file)
    self._sample_lookup = {}
    self._barcode_lookup = {}
    self._lanes = []
    id_column = self._sample_sheet.sample_id_column
    for entry in self._sample_sheet.data:
        # Sheets without a lane column are filed under 'None'
        lane = entry['Lane'] if self._sample_sheet.has_lanes else None
        if lane not in self._lanes:
            # First time this lane is seen: create its tables
            self._lanes.append(lane)
            self._sample_lookup[lane] = {}
            self._barcode_lookup[lane] = {}
        sample_name = entry[id_column]
        barcode = samplesheet_index_sequence(entry)
        if barcode is None:
            barcode = ""
        else:
            barcode = barcode.replace('-', '+')
        self._sample_lookup[lane][barcode] = sample_name
        self._barcode_lookup[lane][sample_name] = barcode
def _check_sample_sheet_indexes(self,sample_sheet_file):
    """
    Verify that any empty index is the sole entry for its lane

    The sample sheet is divided into one sub-sheet per lane
    (or treated as a single sheet if it has no lanes). Within
    each sub-sheet, a line without an index sequence is only
    acceptable if it is the only line.

    Arguments:
      sample_sheet_file (str): path to the sample sheet file

    Raises:
      Exception: if an empty index appears alongside other
        lines in the same lane (or sheet).
    """
    # Build the list of per-lane sheets to examine
    whole_sheet = SampleSheet(sample_sheet_file)
    if not whole_sheet.has_lanes:
        sub_sheets = [whole_sheet]
    else:
        lane_ids = list(set([entry['Lane'] for entry in whole_sheet]))
        sub_sheets = [make_custom_sample_sheet(sample_sheet_file,
                                               lanes=(lane_id,))
                      for lane_id in lane_ids]
    # Examine each sheet for misplaced empty indexes
    for sub_sheet in sub_sheets:
        for entry in sub_sheet:
            if samplesheet_index_sequence(entry):
                # Index present, nothing to check
                continue
            if len(sub_sheet.data) <= 1:
                # Empty index is fine when it is the only line
                continue
            if sub_sheet.has_lanes:
                raise Exception("Invalid sample sheet: "
                                "empty index must be the "
                                "only line for this lane")
            raise Exception("Invalid sample sheet: "
                            "empty index must be the "
                            "only line")
def __init__(self, fmt='IEM', has_lanes=False, dual_index=True,
             quote_values=False, pad=False):
    """
    Set up a mock sample sheet populated from a template

    Arguments:
      fmt (str): sample sheet flavour, 'IEM' or 'CASAVA'
      has_lanes (boolean): include a 'Lane' field in the
        output sample sheet when True
      dual_index (boolean): use dual index fields for an
        IEM-style sheet when True (ignored for CASAVA-style)
      quote_values (boolean): wrap output data values in
        double quotes when True (off by default)
      pad (boolean): append extra commas to output lines
        when True, mimicking Excel output (off by default)
    """
    # Settings that control the template
    self._format = fmt
    self._has_lanes = has_lanes
    self._dual_index = dual_index
    # Settings that control how output is rendered
    self.quote_values = quote_values
    self.pad = pad
    # Populate the base class from the in-memory template
    template_fp = cStringIO.StringIO(self._template())
    SampleSheet.__init__(self, fp=template_fp)
    if self._format != 'IEM':
        # Only IEM sheets carry the extra sections
        return
    self.set_header(IEMFileVersion=4,
                    Date="11/23/2015",
                    Workflow="GenerateFASTQ",
                    Application="FASTQ Only",
                    Assay="TruSeq HT",
                    Description="",
                    Chemistry="Amplicon")
    self.set_reads(101, 101)
    self.set_settings(
        ReverseComplement=0,
        Adapter="AGATCGGAAGAGCACACGTCTGAACTCCAGTCA",
        AdapterRead2="AGATCGGAAGAGCACACGTCTGAACTCCAGTCA")
def __init__(self,fmt='IEM',has_lanes=False,dual_index=True,
             quote_values=False,pad=False):
    """
    Create a new MockSampleSheet instance

    The instance is initialised from the text returned by
    the `_template` method; IEM-format sheets additionally
    get default header, reads and settings sections.

    Arguments:
      fmt (str): 'IEM' or 'CASAVA'
      has_lanes (boolean): True to have a 'Lane' field in
        the output sample sheet
      dual_index (boolean): True for dual index fields in
        an IEM-style sheet (has no effect for CASAVA-style)
      quote_values (boolean): True to surround output data
        values with double quotes (default: False)
      pad (boolean): True to add extra commas to each
        output line, as Excel would (default: False)
    """
    self._format, self._has_lanes, self._dual_index = \
        fmt, has_lanes, dual_index
    self.quote_values, self.pad = quote_values, pad
    # Base class reads the template through a file-like object
    SampleSheet.__init__(self,
                         fp=cStringIO.StringIO(self._template()))
    is_iem = (self._format == 'IEM')
    if is_iem:
        # Extra sections that only exist for IEM
        self.set_header(IEMFileVersion=4,
                        Date="11/23/2015",
                        Workflow="GenerateFASTQ",
                        Application="FASTQ Only",
                        Assay="TruSeq HT",
                        Description="",
                        Chemistry="Amplicon")
        self.set_reads(101,101)
        self.set_settings(
            ReverseComplement=0,
            Adapter="AGATCGGAAGAGCACACGTCTGAACTCCAGTCA",
            AdapterRead2="AGATCGGAAGAGCACACGTCTGAACTCCAGTCA")
def setup(self):
    """
    Prepare the 'analyse_barcodes.py' command for this task

    Works out the report, XLS and HTML output paths under the
    barcode analysis directory, deletes any copies that already
    exist, assembles the command line (sample sheet, lanes,
    cutoff, mismatches, title and counts files), registers the
    command and publishes the output paths on the task outputs.

    If no lanes were requested explicitly but a sample sheet
    was supplied, the lanes are read from the sample sheet
    (a sheet without a 'Lane' field yields no lanes).
    """
    # Output file paths
    out_dir = self.args.barcode_analysis_dir
    report_file = os.path.join(out_dir, 'barcodes.report')
    xls_file = os.path.join(out_dir, 'barcodes.xls')
    html_file = os.path.join(out_dir, 'barcodes.html')
    # Clear out leftovers from earlier runs
    for stale in (report_file, xls_file, html_file):
        if os.path.exists(stale):
            os.remove(stale)
    # Base command
    cmd = PipelineCommandWrapper(
        "Run analyse_barcodes.py to report barcodes",
        'analyse_barcodes.py',
        '--report', report_file,
        '--xls', xls_file,
        '--html', html_file)
    sample_sheet = self.args.sample_sheet
    if sample_sheet:
        cmd.add_args('--sample-sheet', sample_sheet)
    # Lanes: explicit request first, then the sample sheet
    lanes = self.args.lanes
    if not lanes:
        lanes = None
        if sample_sheet:
            try:
                lanes = sorted(
                    set([line['Lane']
                         for line in SampleSheet(sample_sheet)]))
            except KeyError:
                # Sample sheet has no lanes
                lanes = None
    if lanes:
        cmd.add_args('--lanes',
                     ','.join([str(l) for l in lanes]))
    # Remaining optional settings
    for flag, value in (('--cutoff', self.args.cutoff),
                        ('--mismatches', self.args.mismatches),
                        ('--title', self.args.title)):
        if value:
            cmd.add_args(flag, value)
    # Counts files come last
    cmd.add_args('-c')
    cmd.add_args(*self.args.counts_files)
    self.add_cmd(cmd)
    # Publish the output locations
    self.output.report_file.set(report_file)
    self.output.xls_file.set(xls_file)
    self.output.html_file.set(html_file)
def show(self,fmt=None):
    """
    Build the sample sheet text, optionally padded with commas

    The text comes from the base class `show` method. When
    the `pad` attribute is set, lines with fewer
    comma-separated fields than the data header have extra
    trailing commas appended.

    Arguments:
      fmt (str): passed through to `SampleSheet.show`

    Returns:
      String: the sample sheet contents.
    """
    text = SampleSheet.show(self,fmt=fmt)
    if not self.pad:
        return text
    width = len(self.data.header())
    padded = []
    for row in text.split('\n'):
        # Number of fields is one more than the comma count
        nfields = row.count(',') + 1
        if nfields < width:
            row += ','*(width - nfields - 1)
        padded.append(row)
    return '\n'.join(padded)
def show(self, fmt=None):
    """
    Construct and return sample sheet contents

    Delegates to `SampleSheet.show`; if padding is enabled
    via the `pad` attribute then short lines are extended
    with trailing commas before the text is returned.

    Arguments:
      fmt (str): format specifier forwarded to the base class

    Returns:
      String: the (possibly padded) sample sheet text.
    """
    output = SampleSheet.show(self, fmt=fmt)
    if self.pad:
        target = len(self.data.header())

        def _pad_line(text):
            # Extend lines with fewer fields than the header
            present = len(text.split(','))
            if present >= target:
                return text
            return text + ',' * (target - present - 1)

        output = '\n'.join([_pad_line(text)
                            for text in output.split('\n')])
    return output
def has_10x_indices(sample_sheet):
    """
    Report whether a sample sheet uses 10xGenomics-style indices

    Recognised index names:

    - Chromium SC 3'v2: SI-GA-[A-H][1-12], e.g. 'SI-GA-B11'
      (see https://support.10xgenomics.com/permalink/27rGqWvNYYuqkgeS66sksm)
    - scATAC-seq (assumed form): SI-NA-[A-H][1-12],
      e.g. 'SI-NA-G9'
    - Visium (assumed form): SI-(TT|TS)-[A-H][1-12],
      e.g. 'SI-TT-B1'

    Arguments:
      sample_sheet (str): path to the sample sheet CSV
        file to check

    Returns:
      Boolean: True if at least one line has a
        10xGenomics-style index, False otherwise (lines
        without an 'index' field are ignored).
    """
    index_pattern = re.compile(r"SI-(GA|NA|TT|TS)-[A-H](1[0-2]|[1-9])$")
    for entry in SampleSheet(sample_sheet):
        try:
            index_name = entry['index']
        except KeyError:
            # No 'index' field on this line
            continue
        if index_pattern.match(index_name):
            return True
    return False
def get_bases_mask_icell8(bases_mask,sample_sheet=None):
    """
    Adjust a bases mask for ICELL8 data

    The first (R1) component of the mask is rewritten so
    that only the leading INLINE_BARCODE_LENGTH + UMI_LENGTH
    bases are kept and any further R1 bases are masked out.

    When a sample sheet is given, the mask is additionally
    passed through `fix_bases_mask` together with the index
    sequence from the first sample sheet line, so that it
    matches the barcode length used there.

    Arguments:
      bases_mask (str): initial bases mask string to update
      sample_sheet (str): path to optional sample sheet

    Returns:
      String: updated bases mask string
    """
    # Work on the individual read masks
    components = bases_mask.split(',')
    # Number of cycles in R1 follows the leading character
    total_cycles = int(components[0][1:])
    keep_length = (INLINE_BARCODE_LENGTH + UMI_LENGTH)
    assert(total_cycles >= keep_length)
    surplus = total_cycles - keep_length
    new_r1 = "y%d" % keep_length
    if surplus > 0:
        # Mask out whatever follows the barcode and UMI
        new_r1 += "n%d" % surplus
    components[0] = new_r1
    updated_mask = ','.join(components)
    if sample_sheet is None:
        return updated_mask
    # Reconcile with the index sequence from the sample sheet
    first_line = SampleSheet(sample_sheet).data[0]
    index_seq = samplesheet_index_sequence(first_line)
    if index_seq is None:
        index_seq = ""
    return fix_bases_mask(updated_mask,index_seq)
def __init__(self,bcl2fastq_dir=None,sample_sheet=None):
    """
    Set up a new AnalyseBarcodes pipeline

    The list of expected projects is determined when the
    pipeline is created, either from existing bcl2fastq
    outputs or from a sample sheet, so at least one of these
    must be supplied.

    If `bcl2fastq_dir` is given then it is loaded immediately
    and so must already hold the bcl2fastq outputs. If only
    `sample_sheet` is given then the project names are taken
    from the sample sheet (falling back to the sample ID for
    lines with no project), and the sample sheet is checked
    for misplaced empty indexes; the bcl2fastq output directory
    is exposed as the 'bcl2fastq_dir' pipeline parameter.

    Arguments:
      bcl2fastq_dir (str): path to the directory with
        outputs from bcl2fastq
      sample_sheet (str): path to the sample sheet file

    Raises:
      Exception: if neither argument is supplied, or if the
        supplied source cannot be loaded.
    """
    # Superclass initialisation
    Pipeline.__init__(self,name="Analyse Barcodes")
    # Store the inputs
    self._bcl2fastq_dir = bcl2fastq_dir
    self._sample_sheet = sample_sheet
    # Pipeline parameters, declared in a fixed order
    param_specs = (
        ('bcl2fastq_dir',dict(value=self._bcl2fastq_dir,type=str)),
        ('sample_sheet',dict(value=self._sample_sheet,type=str)),
        ('barcode_analysis_dir',dict(type=str)),
        ('counts_dir',dict(type=str)),
        ('title',dict(type=str)),
        ('lanes',dict(type=list)),
        ('bases_mask',dict(type=str)),
        ('mismatches',dict(type=int)),
        ('cutoff',dict(type=float,value=0.001)),
        ('force',dict(type=bool,value=False)),
    )
    for param_name,param_opts in param_specs:
        self.add_param(param_name,**param_opts)
    # Work out the expected project names
    if self._bcl2fastq_dir is not None:
        # Take them from the bcl2fastq outputs
        try:
            parent_dir = os.path.abspath(
                os.path.dirname(self._bcl2fastq_dir))
            unaligned_name = os.path.basename(self._bcl2fastq_dir)
            illumina_data = IlluminaData(parent_dir,
                                         unaligned_dir=unaligned_name)
        except Exception as ex:
            raise Exception("Unaligned dir '%s' supplied but can't "
                            "load data" % self._bcl2fastq_dir)
        projects = [proj.name for proj in illumina_data.projects]
    elif self._sample_sheet is not None:
        # Take them from the sample sheet
        try:
            sheet = SampleSheet(self._sample_sheet)
            names = []
            for entry in sheet:
                name = entry[sheet.sample_project_column]
                if not name:
                    # No project: use the sample ID instead
                    name = entry[sheet.sample_id_column]
                names.append(name)
            # Reduce to unique names
            projects = list(set(names))
        except Exception as ex:
            raise Exception("Sample sheet '%s' supplied but can't "
                            "get a list of project names" %
                            self._sample_sheet)
        # Validate empty barcode sequences
        self._check_sample_sheet_indexes(self._sample_sheet)
    else:
        raise Exception("Need to supply either unaligned (bcl2fastq "
                        "output) dir or sample sheet")
    self.report("Expecting projects:")
    for name in projects:
        self.report("- %s" % name)
    ####################
    # Build the pipeline
    ####################
    # Task: create barcode analysis and counts directories
    setup_dirs_task = SetupBarcodeAnalysisDirs(
        "Setup barcode analysis directory",
        self.params.barcode_analysis_dir,
        self.params.counts_dir,
        force=self.params.force)
    self.add_task(setup_dirs_task)
    # Task: load data from the unaligned/bcl2fastq output dir
    load_data_task = LoadIlluminaData(
        "Load Fastq data for barcode analysis",
        self.params.bcl2fastq_dir)
    self.add_task(load_data_task)
    # Both of the above must finish before any counting starts
    counting_prereqs = (setup_dirs_task,load_data_task)
    # Tasks: one barcode count per project
    count_tasks = []
    for project in projects:
        project_counts_task = CountBarcodes(
            "Count barcodes in '%s'" % project,
            load_data_task.output.illumina_data,
            project,
            self.params.counts_dir,
            lanes=self.params.lanes)
        self.add_task(project_counts_task,
                      requires=counting_prereqs)
        count_tasks.append(project_counts_task)
    # Task: barcode count for the 'undetermined' reads
    undetermined_counts_task = CountBarcodes(
        "Count barcodes in 'undetermined'",
        load_data_task.output.illumina_data,
        "__undetermined__",
        self.params.counts_dir,
        lanes=self.params.lanes,
        use_project_name="undetermined")
    self.add_task(undetermined_counts_task,
                  requires=counting_prereqs)
    count_tasks.append(undetermined_counts_task)
    # Task: collect the counts files
    gather_counts_task = ListBarcodeCountFiles(
        "Gather the barcode counts files",
        self.params.counts_dir)
    self.add_task(gather_counts_task,
                  requires=count_tasks)
    # Task: analyse the counts and write the reports
    report_task = ReportBarcodeAnalysis(
        "Report barcode analysis",
        gather_counts_task.output.counts_files,
        self.params.barcode_analysis_dir,
        sample_sheet=self.params.sample_sheet,
        lanes=self.params.lanes,
        mismatches=self.params.mismatches,
        cutoff=self.params.cutoff,
        title=self.params.title
    )
    self.add_task(report_task,
                  requires=(gather_counts_task,))
    # Expose the report files as pipeline outputs
    self.add_output('report_file',report_task.output.report_file)
    self.add_output('xls_file',report_task.output.xls_file)
    self.add_output('html_file',report_task.output.html_file)
def report_per_lane_sample_stats(self,out_file=None,fp=None,
                                 samplesheet=None):
    """
    Report of reads per sample in each lane

    Reports the number of reads for each sample in each
    lane plus the total reads for each lane.

    Example output:

    Lane 1
    Total reads = 182851745
    - KatyDobbs/KD-K1       79888058        43.7%
    - KatyDobbs/KD-K3       97854292        53.5%
    - Undetermined_indices/lane1    5109395 2.8%
    ...

    If a sample sheet is supplied then samples which are
    listed there for a lane, but which have no reads in
    that lane, are reported with a read count of zero.

    Arguments:
      out_file (str): name of file to write report to
        (used if 'fp' is not supplied)
      fp (File): File-like object open for writing
        (defaults to stdout if 'out_file' also not
        supplied)
      samplesheet (str): optional sample sheet file
        to get additional data from

    Raises:
      Exception: re-raises whatever error occurred if the
        read counts for a lane cannot be converted to
        integers (after logging the offending values).
    """
    # Determine output stream
    if fp is not None:
        fpp = fp
    elif out_file is not None:
        fpp = open(out_file,'w')
    else:
        fpp = sys.stdout
    try:
        # Get data from samplesheet
        expected_samples = {}
        if samplesheet:
            s = SampleSheet(samplesheet)
            ncol = s.sample_id_column
            pcol = s.sample_project_column
            for data in s:
                if s.has_lanes:
                    lanes = ['L%d' % data['Lane']]
                else:
                    # No lane information: sample is expected
                    # in every lane
                    lanes = self.lane_names
                sample = {
                    'Project': data[pcol],
                    'Sample': data[ncol],
                }
                for lane in lanes:
                    expected_samples.setdefault(lane,[]).append(sample)
        # Report
        for lane in self.lane_names:
            lane_number = int(lane[1:])
            # Build a real list (not a lazy 'filter' object) as
            # extra entries may need to be appended below
            samples = [x for x in self._stats
                       if x['Read_number'] == 1 and
                       not IlluminaFastq(x['Fastq']).is_index_read and
                       bool(x[lane])]
            # Additional samples from samplesheet
            for sample in expected_samples.get(lane,[]):
                found_sample = any(
                    smpl['Sample'] == sample['Sample'] and
                    smpl['Project'] == sample['Project']
                    for smpl in samples)
                if not found_sample:
                    # Add the expected sample with zero reads
                    # for the lane being examined
                    samples.append(
                        TabDataLine(
                            line="%s\t%s\t0" % (sample['Project'],
                                                sample['Sample']),
                            column_names=('Project','Sample',lane)))
            # Sort into order
            samples = sorted(samples,
                             key=lambda x: (x['Project'],x['Sample']))
            try:
                total_reads = sum([int(smpl[lane])
                                   for smpl in samples])
            except Exception:
                # Log each bad value to help with diagnosis
                for smpl in samples:
                    try:
                        int(smpl[lane])
                    except ValueError:
                        logging.critical("Bad value for read count in "
                                         "lane %s sample %s: '%s'" %
                                         (lane,smpl['Sample'],
                                          smpl[lane]))
                # Bare 'raise' keeps the original traceback
                raise
            fpp.write("\nLane %d\n" % lane_number)
            fpp.write("Total reads = %d\n" % total_reads)
            for sample in samples:
                sample_name = "%s/%s" % (sample['Project'],
                                         sample['Sample'])
                nreads = float(sample[lane])
                if total_reads > 0:
                    frac_reads = "%.1f%%" % (nreads/total_reads*100.0)
                else:
                    frac_reads = "n/a"
                fpp.write("- %s\t%d\t%s\n" % (sample_name,
                                              nreads,
                                              frac_reads))
    finally:
        # Close the stream only if it was opened here, and do
        # so even when an error interrupted the report
        if fp is None and out_file is not None:
            fpp.close()
class SampleSheetLinter(SampleSheetPredictor):
    """
    Class for checking sample sheets for problems

    Provides the following methods for checking different
    aspects of a sample sheet:

    - close_project_names: check if sample sheet projects look similar
    - samples_with_multiple_barcodes: check for samples with multiple
      barcodes
    - samples_in_multiple_projects: check for samples assigned to
      multiple projects
    - has_invalid_lines: check for invalid sample sheet lines
    - has_invalid_barcodes: check for lines with invalid barcode
      sequences
    - has_invalid_characters: check if sample sheet contains invalid
      characters

    Example usage:

    Initialise linter:
    >>> linter = SampleSheetLinter(sample_sheet_file="SampleSheet.txt")

    Get closely-matching names:
    >>> linter.close_project_names()
    ...

    """
    def __init__(self, sample_sheet=None, sample_sheet_file=None, fp=None):
        """
        Create a new SampleSheetLinter instance

        Arguments:
          sample_sheet (SampleSheet): a SampleSheet instance to use
            for prediction (if None then must provide a file via
            the `sample_sheet_file` argument; if both are provided
            then `sample_sheet` takes precedence)
          sample_sheet_file (str): path to a sample sheet file, if
            `sample_sheet` argument is None
          fp (File): File-like object opened for reading; if this
            is not None then the SampleSheet object will be populated
            from this in preference to `sample_sheet`

        """
        # Initialise
        self._fp = fp
        self._sample_sheet_file = sample_sheet_file
        self._sample_sheet = sample_sheet
        if self._fp is None:
            if self._sample_sheet is None:
                self._sample_sheet = SampleSheet(sample_sheet_file)
        else:
            self._sample_sheet = SampleSheet(fp=self._fp)
        SampleSheetPredictor.__init__(self,
                                      sample_sheet=self._sample_sheet)

    def walk(self):
        """
        Traverse the list of projects and samples

        Generator that yields tuples consisting of
        (SampleSheetProject,SampleSheetSample) pairs

        Yields:
          Tuple: SampleSheetProject, SampleSheetSample pair

        """
        for project in [self.get_project(name)
                        for name in self.project_names]:
            for sample in [
                    project.get_sample(idx)
                    for idx in project.sample_ids
            ]:
                yield (project, sample)

    def close_project_names(self):
        """
        Return list of closely-matching project names

        Returns:
          Dictionary: keys are project names which have at least one
            close match; the values for each key are lists with the
            project names which are close matches.

        """
        return get_close_names(self.project_names)

    def samples_with_multiple_barcodes(self):
        """
        Return list of samples which have multiple associated barcodes

        Returns:
          Dictionary: keys are sample IDs which have more than one
          associated barcode; the values for each key are lists of
          the associated barcodes.

        """
        # Look for samples with multiple barcodes
        multiple_barcodes = {}
        for project, sample in self.walk():
            if len(sample.barcode_seqs) > 1:
                multiple_barcodes[sample.sample_id] = \
                    [s for s in sample.barcode_seqs]
        return multiple_barcodes

    def samples_in_multiple_projects(self):
        """
        Return list of samples which are in multiple projects

        Returns:
          Dictionary: dictionary with sample IDs which appear in
            multiple projects as keys; the associated values are
            lists with the project names.

        """
        # Look for samples with multiple projects
        samples = {}
        for project, sample in self.walk():
            if sample.sample_id not in samples:
                samples[sample.sample_id] = []
            samples[sample.sample_id].append(project.name)
        multiple_projects = {}
        for sample in samples:
            if len(samples[sample]) > 1:
                multiple_projects[sample] = samples[sample]
        return multiple_projects

    def has_invalid_lines(self):
        """
        Return list of samplesheet lines which are invalid

        The first data line is used to decide which of the
        sample ID, sample name and project fields are expected
        to be populated; subsequent lines are invalid if any
        of those expected fields (or the lane, for sheets with
        lanes) is blank.

        Returns:
          List: list of lines which are invalid (i.e. missing
            required data) in the sample sheet.

        """
        # Convenience variables
        sample_id = self._sample_sheet.sample_id_column
        sample_name = self._sample_sheet.sample_name_column
        sample_project = self._sample_sheet.sample_project_column
        # Look at first line to see which items have been provided
        line = self._sample_sheet.data[0]
        has_sample_id = line[sample_id] != ''
        has_sample_name = (sample_name is not None) and \
                          (line[sample_name] != '')
        has_project = line[sample_project] != ''
        # Look for invalid data lines
        invalid_lines = []
        for line in self._sample_sheet.data:
            if self._sample_sheet.has_lanes and line['Lane'] == '':
                invalid_lines.append(line)
            elif has_sample_id and line[sample_id] == '':
                invalid_lines.append(line)
            elif has_sample_name and line[sample_name] == '':
                invalid_lines.append(line)
            elif has_project and line[sample_project] == '':
                invalid_lines.append(line)
        return invalid_lines

    def has_invalid_barcodes(self):
        """
        Return list of lines with invalid barcodes

        Only the 'index' and 'index2' fields are examined, and
        only if they are present in the sample sheet header.

        Returns:
          List: list of lines which contain invalid barcode
            sequences in the sample sheet (each offending line
            appears once, even if both of its indexes are
            invalid).

        """
        invalid_lines = list()
        indices = list()
        for indx in ('index', 'index2'):
            if indx in self._sample_sheet.data.header():
                indices.append(indx)
        if indices:
            for line in self._sample_sheet.data:
                for indx in indices:
                    if not barcode_is_valid(line[indx]):
                        invalid_lines.append(line)
                        # One bad index is enough: stop here so
                        # the line isn't reported a second time
                        break
        return invalid_lines

    def has_invalid_characters(self):
        """
        Check if text file contains any 'invalid' characters

        In this context a character is 'invalid' if:

        - it is non-ASCII (decimal code > 127), or
        - it is a non-printing ASCII character (code < 32)

        Returns:
          Boolean: True if file contains at least one invalid
            character, False if all characters are valid.

        """
        return has_invalid_characters(text=self._sample_sheet.show())
class SampleSheetLinter(SampleSheetPredictor):
    """
    Checks a sample sheet for common problems

    The checks are exposed as the following methods:

    - close_project_names: find project names which look alike
    - samples_with_multiple_barcodes: find samples associated
      with more than one barcode
    - samples_in_multiple_projects: find samples which belong
      to more than one project
    - has_invalid_lines: find sample sheet lines with missing
      data
    - has_invalid_characters: detect invalid characters in the
      sample sheet

    Example usage:

    Initialise linter:
    >>> linter = SampleSheetLinter(sample_sheet_file="SampleSheet.txt")

    Get closely-matching names:
    >>> linter.close_project_names()
    ...

    """
    def __init__(self,sample_sheet=None,sample_sheet_file=None,fp=None):
        """
        Create a new SampleSheetLinter instance

        Arguments:
          sample_sheet (SampleSheet): SampleSheet instance to
            check; takes precedence over `sample_sheet_file`
            when both are supplied
          sample_sheet_file (str): path to a sample sheet file,
            read when `sample_sheet` is None
          fp (File): File-like object opened for reading; when
            not None the SampleSheet is populated from this in
            preference to `sample_sheet`

        """
        self._fp = fp
        self._sample_sheet_file = sample_sheet_file
        self._sample_sheet = sample_sheet
        if self._fp is not None:
            # Stream input overrides everything else
            self._sample_sheet = SampleSheet(fp=self._fp)
        elif self._sample_sheet is None:
            self._sample_sheet = SampleSheet(sample_sheet_file)
        SampleSheetPredictor.__init__(self,
                                      sample_sheet=self._sample_sheet)

    def walk(self):
        """
        Iterate over every project/sample combination

        Generator producing one tuple for each sample in each
        project.

        Yields:
          Tuple: SampleSheetProject, SampleSheetSample pair

        """
        all_projects = [self.get_project(name)
                        for name in self.project_names]
        for project in all_projects:
            project_samples = [project.get_sample(idx)
                               for idx in project.sample_ids]
            for sample in project_samples:
                yield (project,sample)

    def close_project_names(self):
        """
        Find project names which closely match one another

        Returns:
          Dictionary: each key is a project name with one or
            more close matches; each value is the list of
            project names which closely match that key.

        """
        return get_close_names(self.project_names)

    def samples_with_multiple_barcodes(self):
        """
        Find samples associated with more than one barcode

        Returns:
          Dictionary: each key is a sample ID with more than
            one associated barcode; each value is the list of
            barcodes for that sample.

        """
        flagged = {}
        for _,sample in self.walk():
            barcodes = sample.barcode_seqs
            if len(barcodes) > 1:
                flagged[sample.sample_id] = list(barcodes)
        return flagged

    def samples_in_multiple_projects(self):
        """
        Find samples which appear in more than one project

        Returns:
          Dictionary: each key is a sample ID found in multiple
            projects; each value is the list of names of the
            projects containing that sample.

        """
        # Collect the projects each sample ID appears under
        projects_for = {}
        for project,sample in self.walk():
            projects_for.setdefault(sample.sample_id,
                                    []).append(project.name)
        # Keep only those seen in more than one project
        flagged = {}
        for sample_id in projects_for:
            names = projects_for[sample_id]
            if len(names) > 1:
                flagged[sample_id] = names
        return flagged

    def has_invalid_lines(self):
        """
        Find sample sheet lines which are missing required data

        The first data line determines which of the sample ID,
        sample name and project fields are in use; a line is
        invalid if one of those fields (or the lane, when the
        sheet has lanes) is blank.

        Returns:
          List: the invalid lines from the sample sheet.

        """
        sheet = self._sample_sheet
        id_col = sheet.sample_id_column
        name_col = sheet.sample_name_column
        project_col = sheet.sample_project_column
        # Use the first line to decide which fields are expected
        first = sheet.data[0]
        expect_id = first[id_col] != ''
        expect_name = (name_col is not None) and \
                      (first[name_col] != '')
        expect_project = first[project_col] != ''
        # Gather lines where an expected field is blank
        bad_lines = []
        for entry in sheet.data:
            is_bad = ((sheet.has_lanes and entry['Lane'] == '') or
                      (expect_id and entry[id_col] == '') or
                      (expect_name and entry[name_col] == '') or
                      (expect_project and entry[project_col] == ''))
            if is_bad:
                bad_lines.append(entry)
        return bad_lines

    def has_invalid_characters(self):
        """
        Detect 'invalid' characters in the sample sheet text

        A character counts as 'invalid' if:

        - it is non-ASCII (decimal code > 127), or
        - it is a non-printing ASCII character (code < 32)

        Returns:
          Boolean: True if at least one invalid character is
            present, False if every character is valid.

        """
        sheet_text = self._sample_sheet.show()
        return has_invalid_characters(text=sheet_text)
reporter = Reporter() for lane in lanes: # Report for each lane if lane not in counts.lanes: logging.error("Requested analysis for lane %d but " "only have counts for lanes %s" % (lane, ','.join([str(l) for l in counts.lanes]))) retval = 1 continue mismatches = opts.mismatches # Deal with sample sheet if supplied if sample_sheet: with tempfile.NamedTemporaryFile() as fp: # Make a temporary sample sheet with just the # requested lane s = SampleSheet(sample_sheet) if s.has_lanes: use_lanes = (lane, ) s = make_custom_sample_sheet(sample_sheet, fp.name, lanes=(lane, )) else: s = make_custom_sample_sheet(sample_sheet, fp.name) if has_chromium_sc_indices(fp.name): logging.warning("Lane %s has 10xGenomics Chromium " "indices in sample sheet; not " "matching against samplesheet for " "this lane" % lane) continue # If mismatches not set then determine from # the barcode lengths in the temporary
def _discard_tmp_analysis_dir(ap, tmp_analysis_dir):
    """
    Remove the temporary analysis directory after a fatal error

    Best-effort clean up: failures are logged but never raised, so
    that the original error can still be reported to the caller.

    Arguments:
      ap (AutoProcess): autoprocessor whose 'analysis_dir' is reset
        to None once the temporary directory has been removed
      tmp_analysis_dir (str): path to the temporary directory, or
        None if no temporary directory was created (in which case
        nothing is done)
    """
    if tmp_analysis_dir is None:
        # Working in a pre-existing analysis directory: leave it alone
        return
    try:
        shutil.rmtree(tmp_analysis_dir)
        ap.analysis_dir = None
    except Exception as ex:
        logger.warning("Failed to remove temporary directory '%s': %s"
                       % (tmp_analysis_dir, ex))


def setup(ap, data_dir, analysis_dir=None, sample_sheet=None,
          unaligned_dir=None):
    """
    Set up the initial analysis directory

    This does all the initialisation of the analysis directory
    and processing parameters

    A new analysis directory is first built under a temporary
    (hidden, uniquely named) directory alongside the final
    location, and is only renamed into place once set up has
    succeeded; on a fatal error the temporary directory is removed.

    Arguments:
      ap (AutoProcess): autoprocessor pointing to the analysis
        directory to create Fastqs for
      data_dir (str): source data directory
      analysis_dir (str): corresponding analysis directory
        (defaults to '<RUN_NAME>_analysis' in the current
        directory)
      sample_sheet (str): name and location of non-default sample
        sheet file; can be a local or remote file, or a URL
        (optional, will use sample sheet from the source data
        directory if present)
      unaligned_dir (str): directory with existing Fastqs output
        from CASAVA or bcl2fastq2; if specified then Fastqs will
        be taken from this directory (optional)

    Raises:
      Exception: if the data directory doesn't exist, if no sample
        sheet could be acquired (and no 'unaligned_dir' was
        supplied), or if data can't be loaded from the supplied
        'unaligned_dir'.
    """
    data_dir = data_dir.rstrip(os.sep)
    if not exists(data_dir):
        raise Exception("Data directory '%s' not found" % data_dir)
    if not Location(data_dir).is_remote:
        data_dir = os.path.abspath(data_dir)
    run_name = os.path.basename(data_dir)
    if analysis_dir is None:
        analysis_dir = os.path.join(os.getcwd(), run_name) + '_analysis'
    else:
        analysis_dir = os.path.abspath(analysis_dir)
    # Create the analysis directory structure
    # (tmp_analysis_dir stays None when an existing directory is reused,
    # so the clean up on failure never touches a pre-existing directory)
    tmp_analysis_dir = None
    if not os.path.exists(analysis_dir):
        # Make a temporary analysis dir
        tmp_analysis_dir = os.path.join(
            os.path.dirname(analysis_dir),
            ".%s.%s" % (os.path.basename(analysis_dir), uuid.uuid4()))
        ap.analysis_dir = tmp_analysis_dir
        logger.debug("Creating temp directory '%s'" % ap.analysis_dir)
        # Create directory structure
        ap.create_directory(ap.analysis_dir)
        # NOTE(review): bare attribute accesses - presumably evaluated
        # for the side effect of creating the subdirectories; confirm
        ap.log_dir
        ap.script_code_dir
    else:
        # Directory already exists
        logger.warning("Analysis directory '%s' already exists" %
                       analysis_dir)
        ap.analysis_dir = analysis_dir
        # check for parameter file
        if ap.has_parameter_file:
            ap.load_parameters()
        else:
            logger.warning("No parameter file found in %s" %
                           ap.analysis_dir)
    # Run datestamp, instrument name and instrument run number
    try:
        datestamp, instrument, run_number, flow_cell_prefix, flow_cell_id = \
            split_run_name_full(run_name)
        run_number = run_number.lstrip('0')
        flow_cell = flow_cell_prefix + flow_cell_id
    except Exception as ex:
        # Non-standard run name: carry on without this metadata
        logger.warning("Unable to extract information from run name '%s'" \
                       % run_name)
        logger.warning("Exception: %s" % ex)
        datestamp = None
        instrument = None
        run_number = None
        flow_cell = None
    # Identify missing data and attempt to acquire
    # Sequencing platform
    platform = ap.metadata.platform
    if platform is None:
        platform = get_sequencer_platform(data_dir,
                                          instrument=instrument,
                                          settings=ap.settings)
    print("Platform identified as '%s'" % platform)
    # Log dir
    ap.set_log_dir(ap.get_log_subdir('setup'))
    # Attempt to acquire sample sheet
    try:
        # Custom SampleSheet.csv file
        custom_sample_sheet = ap.params.sample_sheet
        if custom_sample_sheet is not None:
            # Sample sheet already stored
            original_sample_sheet = os.path.join(ap.analysis_dir,
                                                 'SampleSheet.orig.csv')
            print("Sample sheet '%s'" % custom_sample_sheet)
        else:
            # Look for sample sheet
            print("Acquiring sample sheet...")
            if sample_sheet is None:
                targets = ('Data/Intensities/BaseCalls/SampleSheet.csv',
                           'SampleSheet.csv',)
            else:
                targets = (sample_sheet,)
            # Try each possibility until one sticks
            for target in targets:
                target = Location(target)
                tmp_sample_sheet = os.path.join(
                    ap.tmp_dir,
                    os.path.basename(target.path))
                if target.is_url:
                    # Try fetching samplesheet from URL
                    # NOTE(review): 'urllib2' is the Python 2 module
                    # name; left as-is because the file's imports are
                    # outside this block
                    print("Trying '%s'" % target.url)
                    try:
                        urlfp = urllib2.urlopen(target.url)
                        try:
                            content = urlfp.read()
                        finally:
                            # Always release the connection
                            urlfp.close()
                        with open(tmp_sample_sheet, 'w') as fp:
                            fp.write(content)
                    except urllib2.URLError as ex:
                        # Failed to download from URL
                        raise Exception("Error fetching sample sheet data "
                                        "from '%s': %s" % (target.url, ex))
                else:
                    # Assume target samplesheet is a file on a local
                    # or remote server
                    if target.is_remote:
                        target_sample_sheet = str(target)
                    else:
                        if os.path.isabs(target.path):
                            target_sample_sheet = target.path
                        else:
                            # Relative paths are taken to be relative
                            # to the source data directory
                            target_sample_sheet = os.path.join(
                                data_dir,
                                target.path)
                    print("Trying '%s'" % target_sample_sheet)
                    rsync = general_applications.rsync(target_sample_sheet,
                                                       ap.tmp_dir)
                    print("%s" % rsync)
                    status = rsync.run_subprocess(
                        log=ap.log_path('rsync.sample_sheet.log'))
                    if status != 0:
                        logger.warning("Failed to fetch sample sheet '%s'"
                                       % target_sample_sheet)
                        # Flag the failure; the next target (if any)
                        # will reset this
                        tmp_sample_sheet = None
                    else:
                        break
            # Bail out if no sample sheet was acquired
            if tmp_sample_sheet is None:
                raise Exception("Unable to acquire sample sheet")
            # Keep a copy of the original sample sheet
            original_sample_sheet = os.path.join(ap.analysis_dir,
                                                 'SampleSheet.orig.csv')
            print("Copying original sample sheet to %s" %
                  original_sample_sheet)
            shutil.copyfile(tmp_sample_sheet, original_sample_sheet)
            # Set the permissions for the original SampleSheet
            # ('0o664' is accepted by Python 2.6+ and Python 3)
            os.chmod(original_sample_sheet, 0o664)
            # Process acquired sample sheet
            custom_sample_sheet = os.path.join(ap.analysis_dir,
                                               'custom_SampleSheet.csv')
            make_custom_sample_sheet(tmp_sample_sheet,
                                     custom_sample_sheet)
    except Exception as ex:
        # Failed to acquire sample sheet
        if not unaligned_dir:
            # Fatal error: remove temporary directory
            _discard_tmp_analysis_dir(ap, tmp_analysis_dir)
            raise Exception("Failed to acquire sample sheet: %s" % ex)
        else:
            # Don't need sample sheet if Fastqs already exist
            original_sample_sheet = None
            custom_sample_sheet = None
    # Library Prep Kit/Assay data
    assay = None
    if original_sample_sheet is not None:
        # Use the first of these header items that is present
        for item in ('Assay', 'Library Prep Kit'):
            try:
                assay = SampleSheet(original_sample_sheet).header[item]
                break
            except KeyError:
                logger.warning("No element '%s' found in sample sheet"
                               % item)
    # Bases mask
    print("Bases mask set to 'auto' (will be determined at run time)")
    bases_mask = "auto"
    # Data source metadata
    data_source = ap.settings.metadata.default_data_source
    # Generate and print predicted outputs and warnings
    if custom_sample_sheet is not None:
        sample_sheet_data = SampleSheet(custom_sample_sheet)
        print(predict_outputs(sample_sheet=sample_sheet_data))
        check_and_warn(sample_sheet=sample_sheet_data)
    # Check supplied unaligned Fastq dir
    if unaligned_dir is not None:
        try:
            illumina_data = IlluminaData(data_dir,
                                         unaligned_dir=unaligned_dir)
            unaligned_dir = illumina_data.unaligned_dir
        except IlluminaDataError:
            # Fatal error: remove temporary directory
            _discard_tmp_analysis_dir(ap, tmp_analysis_dir)
            raise Exception("Can't get data from Fastq dir '%s'"
                            % unaligned_dir)
    else:
        # No unaligned dir supplied
        unaligned_dir = ap.params.unaligned_dir
    # Move analysis dir to final location if necessary
    if ap.analysis_dir != analysis_dir:
        logger.debug("Moving %s to final directory" % ap.analysis_dir)
        os.rename(ap.analysis_dir, analysis_dir)
        ap.analysis_dir = analysis_dir
        # Update the custom sample sheet path
        if custom_sample_sheet is not None:
            custom_sample_sheet = os.path.join(
                analysis_dir,
                os.path.basename(custom_sample_sheet))
        print("Created analysis directory '%s'" % ap.analysis_dir)
    # Store the parameters
    ap.params['data_dir'] = data_dir
    ap.params['analysis_dir'] = ap.analysis_dir
    ap.params['sample_sheet'] = custom_sample_sheet
    ap.params['bases_mask'] = bases_mask
    ap.params['unaligned_dir'] = unaligned_dir
    ap.params['acquired_primary_data'] = False
    # Store the metadata
    ap.metadata['run_name'] = ap.run_name
    ap.metadata['platform'] = platform
    ap.metadata['instrument_name'] = instrument
    ap.metadata['instrument_datestamp'] = datestamp
    ap.metadata['instrument_run_number'] = run_number
    ap.metadata['instrument_flow_cell_id'] = flow_cell
    ap.metadata['assay'] = assay
    ap.metadata['source'] = data_source
    # Make a 'projects.info' metadata file
    if unaligned_dir is not None:
        ap.make_project_metadata_file()
    # Set flags to allow parameters etc to be saved back
    ap._save_params = True
    ap._save_metadata = True
def make_fastqs(ap, protocol='standard', platform=None,
                unaligned_dir=None, sample_sheet=None,
                name=None, lanes=None, lane_subsets=None,
                icell8_well_list=None, nprocessors=None,
                bcl_converter=None, bases_mask=None,
                no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                trim_adapters=True, adapter_sequence=None,
                adapter_sequence_read2=None,
                create_fastq_for_index_read=None,
                find_adapters_with_sliding_window=None,
                generate_stats=True, stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True, barcode_analysis_dir=None,
                force_copy_of_primary_data=False,
                create_empty_fastqs=False, runner=None,
                icell8_swap_i1_and_i2=False,
                icell8_reverse_complement=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False,
                max_jobs=None, max_cores=None, batch_limit=None,
                verbose=False, working_dir=None):
    """
    Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and
    analysis. The operations are typically:

    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics
    - analyse barcodes

    If the number of processors and the job runner are not
    explicitly specified then these are taken from the settings
    for the bcl2fastq and the statistics generation steps, which
    may differ from each other. However if either of these values
    are set explicitly then the same values will be used for both
    steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output directory
        for bcl-to-fastq conversion. Default is 'bcl2fastq' (unless
        an alternative is already specified in the config file)
      sample_sheet (str): if set then use this as the input samplesheet
      name (str): (optional) identifier for outputs that are not
        set explicitly
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      lane_subsets (list): (optional) specify a list of lane subsets
        to process separately before merging at the end; each subset
        is a dictionary which should be generated using the 'subset'
        function, and can include custom values for processing
        parameters (e.g. protocol, trimming and masking options etc)
        to override the defaults for this lane. Lanes not in a subset
        will still be processed unless excluded via the 'lanes' keyword
      icell8_well_list (str): well list file for ICELL8 platforms
        (required for ICELL8 processing protocols)
      nprocessors (int) : number of processors to use
      generate_stats (bool): if True then (re)generate statistics file
        for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes for
        fastqs
      bcl_converter (str): default BCL-to-Fastq conversion software to
        use; optionally can include a version specification (e.g.
        "bcl2fastq>2.0" or "bcl-convert=3.7.5"). Defaults to "bcl2fastq"
      bases_mask (str): if set then use this as an alternative bases
        mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify minimum
        length for reads after adapter trimming (shorter reads will
        be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the minimum
        length of ACGT bases that must be present in a read after
        adapter trimming for it not to be masked completely
        with Ns.
      trim_adapters (boolean): if True (the default) then pass
        adapter sequence(s) to bcl2fastq to perform adapter trimming;
        otherwise remove adapter sequences
      adapter_sequence (str): if not None then specifies adapter
        sequence to use instead of any sequences already set in the
        samplesheet (nb will be ignored if 'trim_adapters' is False)
      adapter_sequence_read2 (str): if not None then specifies adapter
        sequence to use for read2 instead of any sequences already set
        in the samplesheet (nb will be ignored if 'trim_adapters' is
        False)
      create_fastq_for_index_read (boolean): if True then also create
        Fastq files for index reads (default, don't create index read
        Fastqs)
      find_adapters_with_sliding_window (boolean): if True then use
        sliding window algorithm to identify adapter sequences for
        trimming
      stats_file (str): if set then use this as the name of the output
        per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name of
        the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to the
        output directory for barcode analysis
      force_copy_of_primary_data (bool): if True then force primary
        data to be copied (rsync'ed) even if it's on the local system
        (default is to link to primary data unless it's on a remote
        filesystem).
      create_empty_fastqs (bool): if True then create empty 'placeholder'
        fastq files for any missing fastqs after bcl2fastq
        (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job runner
        to use for fastq generation; if set then it replaces all of
        the configured runners
      icell8_swap_i1_and_i2 (bool): if True then swap I1 and I2 reads
        when matching to barcodes in the ICELL8 well list (ICELL8 ATAC
        data only)
      icell8_reverse_complement (str): one of 'i1', 'i2', 'both', or
        None; if set then the specified index reads will be reverse
        complemented when matching to barcodes in the ICELL8 well list
        (ICELL8 ATAC data only)
      cellranger_jobmode (str): (optional) job mode to run cellranger in
        (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per core
        (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maximum number of concurrent
        jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
        submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of cores
        cellranger can request in jobmode 'local' (10xGenomics Chromium
        SC data only)
      cellranger_localmem (int): (optional) maximum memory cellranger
        can request in jobmode 'local' (10xGenomics Chromium SC data
        only)
      cellranger_ignore_dual_index (bool): (optional) on a dual-indexed
        flowcell where the second index was not used for the 10x
        sample, ignore it (10xGenomics Chromium SC data only)
      max_jobs (int): maximum number of concurrent jobs allowed
      max_cores (int): maximum number of cores available
      batch_limit (int): if set then run commands in each task in
        batches, with the batch size set dynamically so as not to
        exceed this limit
      working_dir (str): path to a working directory (defaults to
        temporary directory in the current directory)
      verbose (bool): if True then report additional information for
        pipeline diagnostics

    Returns:
      Integer: exit status from the Fastq generation pipeline (this
        will always be zero, as an exception is raised for non-zero
        status).

    Raises:
      Exception: if the protocol is unknown, if no sample sheet is
        available or the sample sheet file is missing, if a subset
        of lanes is requested which isn't consistent with the sample
        sheet, or if the Fastq generation pipeline fails.
    """
    # NOTE(review): 'cellranger_ignore_dual_index' is accepted but is
    # not referenced anywhere in this function, so it currently has no
    # effect - confirm whether it should be passed on to the pipeline
    # Report protocol
    print("Protocol : %s" % protocol)
    if protocol not in PROTOCOLS:
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol, ','.join(PROTOCOLS)))
    # Output (unaligned) dir
    if not unaligned_dir and name:
        unaligned_dir = 'bcl2fastq_%s' % name
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print("Output dir : %s" % ap.params.unaligned_dir)
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if sample_sheet is None:
        # Fail with a clear message rather than an obscure error
        # from the path handling below
        raise Exception("No sample sheet supplied and none stored "
                        "in the parameters")
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print("Source sample sheet : %s" % ap.params.sample_sheet)
    # Check requested lanes are actually present
    print("Lanes : %s" % ('all' if lanes is None
                          else ','.join([str(l) for l in lanes])))
    if lanes is not None:
        s = SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = set(line['Lane'] for line in s)
        for lane in lanes:
            if lane not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % lane)
    # Barcode analysis
    if not barcode_analysis_dir and name:
        barcode_analysis_dir = 'barcode_analysis_%s' % name
    if barcode_analysis_dir is not None:
        ap.params['barcode_analysis_dir'] = barcode_analysis_dir
    elif ap.params.barcode_analysis_dir is None:
        ap.params['barcode_analysis_dir'] = 'barcode_analysis'
    barcode_analysis_dir = ap.params.barcode_analysis_dir
    if not os.path.isabs(barcode_analysis_dir):
        barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                            barcode_analysis_dir)
    # Statistics files
    # (precedence: explicit argument, then name-based default, then
    # stored parameter, then generic default)
    if stats_file is None:
        if name:
            stats_file = 'statistics.%s.info' % name
        elif ap.params['stats_file'] is not None:
            stats_file = ap.params['stats_file']
        else:
            stats_file = 'statistics.info'
    if per_lane_stats_file is None:
        if name:
            per_lane_stats_file = 'per_lane_statistics.%s.info' % name
        elif ap.params['per_lane_stats_file'] is not None:
            per_lane_stats_file = ap.params['per_lane_stats_file']
        else:
            per_lane_stats_file = 'per_lane_statistics.info'
    # Log dir
    log_dir = 'make_fastqs'
    if name:
        log_dir += "_%s" % name
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Pipeline log file
    pipeline_log = os.path.join(ap.log_dir, "make_fastqs.log")
    # Bases mask
    if bases_mask is not None:
        ap.params['bases_mask'] = bases_mask
    bases_mask = ap.params.bases_mask
    # Default trimming/masking values
    if minimum_trimmed_read_length is None:
        minimum_trimmed_read_length = \
            BCL2FASTQ_DEFAULTS['minimum_trimmed_read_length']
    if mask_short_adapter_reads is None:
        mask_short_adapter_reads = \
            BCL2FASTQ_DEFAULTS['mask_short_adapter_reads']
    # Get platform
    if not platform:
        platform = ap.metadata.platform
    # Set options from supplied arguments, platform-specific settings
    # and configured defaults (in that order of precedence)
    defaults = {
        'bcl_converter': bcl_converter,
        'nprocessors': nprocessors,
        'no_lane_splitting': no_lane_splitting,
        'create_empty_fastqs': create_empty_fastqs,
    }
    for item in list(defaults):
        # Only values left as None fall back to the settings
        if defaults[item] is None:
            value = None
            if platform in ap.settings.platform:
                value = ap.settings.platform[platform][item]
            if value is None:
                value = ap.settings.bcl_conversion[item]
            defaults[item] = value
    # BCL converter
    if defaults['bcl_converter'] is None:
        bcl_converter = BCL2FASTQ_DEFAULTS['bcl_converter']
    else:
        bcl_converter = defaults['bcl_converter']
    # Number of processors
    nprocessors = defaults['nprocessors']
    # Split Fastqs across lanes
    no_lane_splitting = defaults['no_lane_splitting']
    # Create empty Fastqs
    create_empty_fastqs = defaults['create_empty_fastqs']
    # Set up pipeline runners
    default_runner = ap.settings.general.default_runner
    runners = {
        'rsync_runner': ap.settings.runners.rsync,
        'bcl2fastq_runner': ap.settings.runners.bcl2fastq,
        'bclconvert_runner': ap.settings.runners.bcl_convert,
        'demultiplex_icell8_atac_runner': ap.settings.runners.bcl2fastq,
        'cellranger_runner': ap.settings.runners.cellranger,
        'cellranger_atac_runner': ap.settings.runners.cellranger,
        'cellranger_arc_runner': ap.settings.runners.cellranger,
        'spaceranger_runner': ap.settings.runners.cellranger,
        'stats_runner': ap.settings.runners.stats,
    }
    if runner is not None:
        # Override configured runners: the explicit runner replaces
        # the default and every entry in the 'runners' mapping
        default_runner = runner
        for r in runners:
            runners[r] = runner
    # Set up pipeline environment modules
    # (fall back to the generic 'make_fastqs' modulefiles setting when
    # there is no package-specific one, and to None if neither is set)
    envmodules = {}
    for envmod in ('bcl2fastq',
                   'bcl_convert',
                   'cellranger_mkfastq',
                   'cellranger_atac_mkfastq',
                   'cellranger_arc_mkfastq',
                   'spaceranger_mkfastq',):
        try:
            envmodules[envmod] = ap.settings.modulefiles[envmod]
        except KeyError:
            try:
                envmodules[envmod] = ap.settings.modulefiles['make_fastqs']
            except KeyError:
                envmodules[envmod] = None
    # Other pipeline settings
    poll_interval = ap.settings.general.poll_interval
    # Construct and run pipeline
    # (local is not called 'make_fastqs' to avoid shadowing this function)
    pipeline = MakeFastqs(
        ap.params.data_dir,
        ap.params.sample_sheet,
        protocol=protocol,
        bases_mask=bases_mask,
        bcl_converter=bcl_converter,
        platform=platform,
        icell8_well_list=icell8_well_list,
        minimum_trimmed_read_length=minimum_trimmed_read_length,
        mask_short_adapter_reads=mask_short_adapter_reads,
        adapter_sequence=adapter_sequence,
        adapter_sequence_read2=adapter_sequence_read2,
        icell8_atac_swap_i1_and_i2=icell8_swap_i1_and_i2,
        icell8_atac_reverse_complement=icell8_reverse_complement,
        lane_subsets=lane_subsets,
        lanes=lanes,
        trim_adapters=trim_adapters,
        fastq_statistics=generate_stats,
        analyse_barcodes=analyse_barcodes)
    status = pipeline.run(
        ap.analysis_dir,
        name=name,
        out_dir=ap.params.unaligned_dir,
        barcode_analysis_dir=barcode_analysis_dir,
        primary_data_dir=ap.params.primary_data_dir,
        force_copy_of_primary_data=force_copy_of_primary_data,
        no_lane_splitting=no_lane_splitting,
        create_fastq_for_index_read=create_fastq_for_index_read,
        find_adapters_with_sliding_window=find_adapters_with_sliding_window,
        create_empty_fastqs=create_empty_fastqs,
        stats_file=stats_file,
        per_lane_stats=per_lane_stats_file,
        nprocessors=nprocessors,
        default_runner=default_runner,
        cellranger_jobmode=cellranger_jobmode,
        cellranger_mempercore=cellranger_mempercore,
        cellranger_maxjobs=cellranger_maxjobs,
        cellranger_jobinterval=cellranger_jobinterval,
        cellranger_localcores=cellranger_localcores,
        cellranger_localmem=cellranger_localmem,
        runners=runners,
        envmodules=envmodules,
        log_dir=ap.log_dir,
        log_file=pipeline_log,
        max_jobs=max_jobs,
        max_slots=max_cores,
        batch_limit=batch_limit,
        poll_interval=poll_interval,
        working_dir=working_dir,
        verbose=verbose)
    # Update the parameters (done regardless of the exit status)
    ap.params['primary_data_dir'] = pipeline.output.primary_data_dir
    ap.params['acquired_primary_data'] = \
        pipeline.output.acquired_primary_data
    ap.params['stats_file'] = pipeline.output.stats_file
    ap.params['per_lane_stats_file'] = pipeline.output.per_lane_stats
    # Store stats file paths relative to the analysis directory when
    # they are located inside it
    for param in ('stats_file', 'per_lane_stats_file'):
        filen = ap.params[param]
        if filen is not None:
            if filen.startswith(ap.analysis_dir):
                ap.params[param] = os.path.relpath(filen,
                                                   ap.analysis_dir)
    # Update the metadata
    if status == 0:
        # Platform
        ap.metadata['platform'] = pipeline.output.platform
        # Software used for processing: stored as the string
        # representation of a dictionary, start afresh if the stored
        # value is missing or can't be interpreted as one
        try:
            processing_software = ast.literal_eval(
                ap.metadata.processing_software)
        except (ValueError, SyntaxError):
            processing_software = dict()
        if not isinstance(processing_software, dict):
            processing_software = dict()
        outputs = pipeline.output
        if outputs.bcl2fastq_info:
            processing_software['bcl2fastq'] = outputs.bcl2fastq_info
        if outputs.bclconvert_info:
            processing_software['bcl-convert'] = outputs.bclconvert_info
        if outputs.cellranger_info:
            processing_software['cellranger'] = outputs.cellranger_info
        if outputs.cellranger_atac_info:
            processing_software['cellranger-atac'] = \
                outputs.cellranger_atac_info
        if outputs.cellranger_arc_info:
            processing_software['cellranger-arc'] = \
                outputs.cellranger_arc_info
        if outputs.spaceranger_info:
            processing_software['spaceranger'] = outputs.spaceranger_info
        ap.metadata['processing_software'] = processing_software
        # Legacy metadata items
        ap.metadata['bcl2fastq_software'] = pipeline.output.bcl2fastq_info
        ap.metadata['cellranger_software'] = pipeline.output.cellranger_info
    # Make a file listing missing Fastqs
    if pipeline.output.missing_fastqs:
        missing_fastqs_log = os.path.join(ap.log_dir,
                                          "missing_fastqs.log")
        with open(missing_fastqs_log, 'wt') as fp:
            for fq in pipeline.output.missing_fastqs:
                fp.write("%s\n" % fq)
        print("Wrote list of missing Fastq files to '%s'" %
              missing_fastqs_log)
    # Raise exception on failure
    if status != 0:
        raise Exception("Fastq generation failed")
    # Make or update 'projects.info' metadata file
    if not ap.params.project_metadata:
        ap.make_project_metadata_file()
    else:
        ap.update_project_metadata_file()
    ap.save_data()
    # Finish
    return status