def report_summary(ap):
    """Generate summary report suitable for bioinformaticians

    Generates a multi-line report which gives general information
    about the run, plus one-line summaries for each project, plus
    any additional information that has been recorded.

    The general information includes:

    - Platform
    - Run name
    - Run reference id
    - Processing software
    - Assay (i.e. sequencing kit)

    For each project:

    - Project subdirectory
    - Researcher (aka user)
    - PI
    - Application (aka library type)
    - Single cell prep platform (e.g. ICell8)
    - Organism
    - Number of samples

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.

    """
    # Default items to report
    report_items = [
        'Run name',
        'Reference',
        'Platform',
        'Directory',
        'Endedness',
        'Bcl2fastq',
    ]
    # Gather information
    analysis_dir = analysis.AnalysisDir(ap.analysis_dir)
    # Run name components stay as None if the name can't be split
    datestamp = None
    instrument = None
    run_number = None
    run_name = ap.run_name
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
    except Exception as ex:
        # Best effort only: an unparseable run name is logged, not fatal
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
    # NOTE(review): the body visible here ends after the run name has
    # been parsed; no report text is assembled or returned - confirm
    # against the complete definition
def run_reference_id(run_name, platform=None, facility_run_number=None):
    """Return a run reference id e.g. 'HISEQ_140701/242#22'

    The run reference code is a code that identifies the sequencing
    run, and has the general form:

    PLATFORM_DATESTAMP[/INSTRUMENT_RUN_NUMBER]#FACILITY_RUN_NUMBER

    - PLATFORM is always uppercased e.g. HISEQ, MISEQ, GA2X
    - DATESTAMP is the YYMMDD code e.g. 140701
    - INSTRUMENT_RUN_NUMBER is the run number that forms part of the
      run directory e.g. for '140701_SN0123_0045_000000000-A1BCD'
      it is '45'
    - FACILITY_RUN_NUMBER is the run number that has been assigned
      by the facility

    Note that the instrument run number is only used if it differs
    from the facility run number.

    If the platform isn't supplied then the instrument name is
    used instead, e.g.:

    'SN0123_140701/242#22'

    If the run name can't be split into components then the
    general form will be:

    [PLATFORM_]RUN_NAME[#FACILITY_RUN_NUMBER]

    depending on whether platform and/or facility run number have
    been supplied. For example for a run called 'rag_05_2017':

    'MISEQ_rag_05_2017#90'

    Arguments:
      run_name (str): the run name (can be a path)
      platform (str): the platform name (optional)
      facility_run_number (int): the run number assigned by the
        local facility (can be different from the instrument
        run number) (optional)

    """
    # Extract information from run name (only the final path
    # component is used, so a full path can be supplied)
    run_name = os.path.basename(os.path.normpath(run_name))
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
        # Reset the same three names that the 'try' block binds, so
        # that none of them is left undefined after a failed split
        instrument = None
        datestamp = None
        run_number = None
    # NOTE(review): the body visible here ends after the run name has
    # been parsed; no reference id is assembled or returned - confirm
    # against the complete definition
def report_projects(ap):
    """Generate one line reports suitable for pasting into spreadsheet

    Generate one-line report for each project with tab-separated
    data items, suitable for injection into a spreadsheet.

    Each line has the following information:

    - Run id e.g. HISEQ_140328
    - Run number
    - Source
    - Date
    - User
    - PI
    - Application
    - Single Cell Platform
    - Organism
    - Platform
    - #Samples
    - #Cells
    - PE (yes/no)
    - Samples

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.

    """
    # Acquire data
    analysis_dir = utils.AnalysisDir(ap.analysis_dir)
    # General information
    run_name = ap.run_name
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
        # Drop zero-padding from the instrument run number
        run_number = run_number.lstrip('0')
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
        # Fall back to empty values for every name bound by the 'try'
        # block, so that none of them is left undefined
        datestamp = ''
        instrument = ''
        run_number = ''
    # NOTE(review): the body visible here ends after the run name has
    # been parsed; no report lines are assembled or returned - confirm
    # against the complete definition
def fetch_value(ap, project, field):
    """
    Return the value of the supplied field

    Given a field name, return the value determined from
    the data in the supplied AutoProcessor and
    AnalysisProject instances.

    Fields which are taken from the project's 'info' attribute
    ('user', 'PI', 'application', 'single_cell_platform',
    'organism', '#cells' and their aliases) are returned as an
    empty string if the project has no 'info' attribute, or if
    the item is unset/empty.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on
      project (AnalysisProject): project to report on
      field (str): field name to return value of

    Returns:
      String: value of supplied field.

    Raises:
      KeyError: if the field name is not recognised.

    """
    # Convenience variable for project info
    try:
        info = project.info
    except AttributeError:
        info = None

    def info_item(name):
        # Projects without an 'info' attribute report empty values
        # rather than failing on attribute access against None
        if info is None:
            return ''
        value = getattr(info, name)
        return value if value else ''

    # Generate value for supplied field name
    if field == 'datestamp':
        return IlluminaData.split_run_name(ap.run_name)[0]
    elif field == 'run_id':
        return ap.run_reference_id
    elif field == 'run_number':
        return ('' if not ap.metadata.run_number
                else str(ap.metadata.run_number))
    elif field in ('source', 'data_source'):
        return ('' if not ap.metadata.source
                else ap.metadata.source)
    elif field in ('analysis_dir', 'path'):
        return ap.params.analysis_dir
    elif field in ('project', 'project_name'):
        return project.name
    elif field == 'user':
        return info_item('user')
    elif field in ('PI', 'pi'):
        return info_item('PI')
    elif field in ('application', 'library_type'):
        return info_item('library_type')
    elif field == 'single_cell_platform':
        return info_item('single_cell_platform')
    elif field == 'organism':
        return info_item('organism')
    elif field in ('sequencer_platform', 'platform'):
        return ('' if not ap.metadata.platform
                else str(ap.metadata.platform).upper())
    elif field == 'sequencer_model':
        return ('' if not ap.metadata.sequencer_model
                else ap.metadata.sequencer_model)
    elif field in ('no_of_samples', '#samples'):
        return str(len(project.samples))
    elif field in ('no_of_cells', '#cells'):
        # Unset values come back as '' which str() leaves unchanged
        return str(info_item('number_of_cells'))
    elif field == 'paired_end':
        return ('yes' if ap.paired_end else 'no')
    elif field in ('sample_names', 'samples'):
        return project.prettyPrintSamples()
    elif field in ('null', ''):
        return ''
    else:
        raise KeyError("'%s': unrecognised field for reporting" % field)
def report_summary(ap):
    """Generate summary report suitable for bioinformaticians

    Generates a multi-line report which gives general information
    about the run, plus one-line summaries for each project, plus
    any additional information that has been recorded.

    The general information includes:

    - Platform
    - Run name
    - Run reference id
    - Sequencer model
    - Processing software

    For each project:

    - Project subdirectory
    - Researcher (aka user)
    - PI
    - Application (aka library type)
    - Single cell prep platform (e.g. ICell8)
    - Organism
    - Number of samples

    If there are no projects then the contents of the unaligned
    directory are summarised instead, when they can be loaded.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.

    """
    def parse_literal(text):
        # Evaluate a stored Python literal, or return None if the
        # stored value is missing or malformed
        try:
            return ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError):
            return None

    # Default items to report
    report_items = [
        'Run name',
        'Reference',
        'Platform',
        'Sequencer',
        'Directory',
        'Endedness',
        'Bcl2fastq',
    ]
    # Report items which map onto processing software packages
    software_items = {
        'Bcl2fastq': 'bcl2fastq',
        'Cellranger': 'cellranger',
        'Cellranger-Atac': 'cellranger-atac',
    }
    # Gather information
    analysis_dir = analysis.AnalysisDir(ap.analysis_dir)
    datestamp = None
    instrument = None
    run_number = None
    run_name = ap.run_name
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
    except Exception as ex:
        # Best effort only: an unparseable run name is logged, not fatal
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
    if ap.metadata.platform is not None:
        platform = ap.metadata.platform.upper()
    else:
        platform = 'unknown'
    # Facility run number takes precedence over the instrument one
    if ap.metadata.run_number is not None:
        run_number = ap.metadata.run_number
    # Processing software information
    processing_software = parse_literal(ap.metadata.processing_software)
    if not isinstance(processing_software, dict):
        processing_software = dict()
    if not processing_software:
        # Fallback to legacy metadata items
        legacy_items = (
            ('bcl2fastq', ap.metadata.bcl2fastq_software),
            ('cellranger', ap.metadata.cellranger_software),
        )
        for pkg, text in legacy_items:
            details = parse_literal(text)
            if details is not None:
                processing_software[pkg] = details
    for pkg in ('cellranger', 'cellranger-atac'):
        if pkg in processing_software:
            report_items.append(pkg.title())

    def software_version(pkg):
        # Report the second and third elements of the stored package
        # details, or 'Unknown' if they are absent or malformed
        try:
            details = processing_software[pkg]
            return "%s %s" % (details[1], details[2])
        except (KeyError, IndexError, TypeError):
            return 'Unknown'

    # Generate report text
    report = []
    # Report header
    if datestamp and instrument and run_number:
        title = "%s run #%s datestamped %s" % (platform,
                                               run_number,
                                               datestamp)
    else:
        title = "%s" % os.path.basename(ap.analysis_dir)
    report.append("%s\n%s" % (title, '=' * len(title)))
    # General information
    field_width = max([len(i) for i in report_items])
    for item in report_items:
        # Get the value for each item
        if item == 'Run name':
            value = run_name
        elif item == 'Reference':
            value = ap.run_reference_id
        elif item == 'Platform':
            value = platform
        elif item == 'Sequencer':
            value = ap.metadata.sequencer_model
        elif item == 'Directory':
            value = ap.params.analysis_dir
        elif item == 'Endedness':
            value = ('Paired end' if analysis_dir.paired_end
                     else 'Single end')
        elif item in software_items:
            value = software_version(software_items[item])
        else:
            raise Exception("Unknown reporting item '%s'" % item)
        # Append a line reporting the value, padded so that the
        # values line up in a column
        report.append("%s%s: %s" % (item,
                                    ' ' * (field_width - len(item)),
                                    value))
    report.append("")
    # Projects
    rows = []
    comments = bcf_utils.OrderedDictionary()
    if analysis_dir.n_projects != 0:
        report.append("%d project%s:" % (
            analysis_dir.n_projects,
            '' if analysis_dir.n_projects == 1 else 's'))
        data_items = ('user',
                      'PI',
                      'library_type',
                      'single_cell_platform',
                      'number_of_cells',
                      'organism')
        for project in analysis_dir.projects:
            project_data = dict(project=project.name)
            for item in data_items:
                value = project.info[item]
                # '.' and '?' are treated as placeholder values
                project_data[item] = value if value not in ('.', '?') else \
                    '<unspecified %s>' % item.lower()
            library = project_data['library_type']
            if project_data['single_cell_platform'] is not None:
                library += " (%s)" % project_data['single_cell_platform']
            samples = "%d sample%s" % (
                len(project.samples),
                's' if len(project.samples) != 1 else '')
            # Only report the cell count when it is a usable number
            # (it can be unset, or an '<unspecified ...>' placeholder)
            try:
                ncells = int(project_data['number_of_cells'])
            except (TypeError, ValueError):
                ncells = None
            if ncells is not None:
                samples += "/%d cell%s" % (ncells,
                                           's' if ncells != 1 else '')
            rows.append(("- '%s':" % project_data['project'],
                         project_data['user'],
                         project_data['organism'],
                         library,
                         samples,
                         "(PI %s)" % project_data['PI']))
            if project.info.comments:
                comments[project.name] = project.info.comments
        report.append(utils.pretty_print_rows(rows))
    else:
        # No projects - try loading data from unaligned dir
        try:
            illumina_data = ap.load_illumina_data()
            report.append("No projects found; '%s' directory contains "
                          "the following data:\n" %
                          ap.params.unaligned_dir)
            for project in illumina_data.projects:
                rows.append(("- '%s':" % project.name,
                             "%s sample%s" % (
                                 len(project.samples),
                                 's' if len(project.samples) != 1
                                 else '')))
            report.append(utils.pretty_print_rows(rows))
        except IlluminaData.IlluminaDataError:
            report.append("No projects found")
    # Additional comments/notes
    if comments:
        width = max([len(x) for x in comments])
        report.append("")
        report.append("Additional notes/comments:")
        for project in comments:
            first_line = True
            # Wrap each comment, leaving room for the project name
            for line in bcf_utils.split_into_lines(comments[project],
                                                   70 - width):
                if first_line:
                    report.append("- %s%s: %s" % (
                        project,
                        ' ' * (width - len(project)),
                        line))
                    first_line = False
                else:
                    report.append(" %s %s" % (' ' * width, line))
    return '\n'.join(report)
def __init__(self, analysis_dir):
    """Create a new AnalysisDir instance for a specified directory

    Loads the metadata for the directory (falling back to the
    parameter file if the metadata file can't be loaded), then
    loads the projects metadata, determines the run name, and
    finally sorts each subdirectory into sequencing data, project
    directories, the 'undetermined' directory, or 'extra'
    directories.

    Arguments:
      analysis_dir: name (and path) to analysis directory

    Raises:
      Exception: the exception from loading the parameter file is
        re-raised if neither the metadata file nor the parameter
        file could be loaded.

    """
    # Store location
    self._analysis_dir = os.path.abspath(analysis_dir)
    # Take the name from the normalised path, so that a trailing
    # separator on 'analysis_dir' doesn't result in an empty name
    self._name = os.path.basename(self._analysis_dir)
    self._bcl2fastq_dirs = []
    self._project_dirs = []
    self._extra_dirs = []
    self.sequencing_data = []
    self.projects = []
    self.undetermined = None
    # Metadata
    self.metadata = AnalysisDirMetadata()
    metadata_file = os.path.join(self._analysis_dir,
                                 "metadata.info")
    try:
        self.metadata.load(metadata_file)
    except Exception as ex:
        logger.warning("Failed to load metadata file %s: %s" %
                       (metadata_file, ex))
        logger.warning("Attempting to load parameter file")
        try:
            params = AnalysisDirParameters()
            parameter_file = os.path.join(self._analysis_dir,
                                          "auto_process.info")
            params.load(parameter_file, strict=False)
            # Attempt to acquire values from parameters
            for param in ('platform', 'run_number', 'source', 'assay'):
                if param not in params:
                    print("-- %s: missing" % param)
                    continue
                print("-- %s: setting to '%s'" % (param,
                                                  params[param]))
                self.metadata[param] = params[param]
        except Exception as params_ex:
            # No parameter file either
            logger.warning("Failed to load parameters: %s" % params_ex)
            logger.warning("Perhaps this is not an auto_process project?")
            # Bare raise keeps the original traceback
            raise
    # Projects metadata
    try:
        self.projects_metadata = ProjectMetadataFile(
            os.path.join(self._analysis_dir, "projects.info"))
    except Exception as ex:
        logger.warning("Failed to load projects metadata: %s" % ex)
        self.projects_metadata = None
    # Run name
    run_name = getattr(self.metadata, 'run', None)
    if run_name is None:
        # No run name recorded: derive it from the directory name,
        # dropping the '_analysis' suffix only if it is present
        run_name = self._analysis_dir
        if run_name.endswith('_analysis'):
            run_name = run_name[:-len('_analysis')]
    self.run_name = os.path.basename(run_name)
    self.date_stamp,\
        self.instrument_name,\
        self.instrument_run_number = IlluminaData.split_run_name(
            self.run_name)
    # Look for outputs from bclToFastq and analysis projects
    logger.debug("Examining subdirectories of %s" % self._analysis_dir)
    for dirn in bcf_utils.list_dirs(self._analysis_dir):
        # Look for sequencing data
        try:
            data = IlluminaData.IlluminaData(self._analysis_dir,
                                             unaligned_dir=dirn)
            logger.debug("- %s: sequencing data" % dirn)
            self._bcl2fastq_dirs.append(dirn)
            self.sequencing_data.append(data)
            continue
        except IlluminaData.IlluminaDataError:
            # Not sequencing data: fall through to the project check
            pass
        except Exception as ex:
            logger.warning("Exception when attempting to load "
                           "subdir '%s' as CASAVA/bcl2fastq output "
                           "(ignored): %s" % (dirn, ex))
        # Look for analysis data
        data = AnalysisProject(dirn,
                               os.path.join(self._analysis_dir, dirn))
        if data.is_analysis_dir:
            if dirn == 'undetermined':
                logger.debug("- %s: undetermined indexes" % dirn)
                self.undetermined = data
            else:
                # Check against projects.info, if possible
                try:
                    if not self.projects_metadata.lookup('Project', dirn):
                        logger.debug("- %s: not in projects.info" % dirn)
                        self._extra_dirs.append(dirn)
                        continue
                except AttributeError:
                    # No projects metadata available (it is None if
                    # loading failed), so the check is skipped
                    pass
                logger.debug("- %s: project directory" % dirn)
                self._project_dirs.append(dirn)
                self.projects.append(data)
        else:
            # Unidentified contents
            self._extra_dirs.append(dirn)
            logger.debug("- %s: unknown" % dirn)