def __init__(self,fastq_strand_out):
    """
    Create a new Fastqstrand instance

    Reads the named fastq_strand output file and stores, for
    each genome listed in it, the '1st forward' and
    '2nd reverse' values together with the derived
    forward/reverse ratio and a strandedness call.

    The strandedness call is made from the ratio as follows:

    - ratio < 0.2: 'reverse'
    - ratio > 5 (or infinite): 'forward'
    - otherwise: 'unstranded?'
    - no ratio (both values zero): 'undetermined'

    Arguments:
      fastq_strand_out (str): path to the fastq_strand
        output file

    Raises:
      Exception: if no '#Genome' header line is found, or
        if a data line appears before the header line.
    """
    self._fastq_strand_out = os.path.abspath(fastq_strand_out)
    self._version = None
    self._genomes = AttributeDictionary()
    # Read in data
    tabfile = None
    with open(self._fastq_strand_out,'r') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # Blank lines carry no data
                continue
            if line.startswith('#fastq_strand version:'):
                self._version = line.split()[2]
                continue
            elif line.startswith('#Genome'):
                # Header line defines the column names
                tabfile = TabFile(column_names=line[1:].split('\t'))
                continue
            if tabfile is None:
                # Data before the header: can't interpret it, so
                # fail with the same error as for a missing header
                # (rather than an AttributeError on 'None')
                raise Exception("Unable to extract fastq_strand data "
                                "from %s" % self._fastq_strand_out)
            tabfile.append(tabdata=line)
    # Check there is some data
    if tabfile is None:
        raise Exception("Unable to extract fastq_strand data from %s" %
                        self._fastq_strand_out)
    # Copy data to main object
    for line in tabfile:
        # Store the data
        data = AttributeDictionary()
        self._genomes[line['Genome']] = data
        data['forward'] = line['1st forward']
        data['reverse'] = line['2nd reverse']
        # Additional processing
        if data.reverse > 0.0:
            ratio = float(data.forward)/float(data.reverse)
        elif data.forward > 0.0:
            # No reverse signal at all
            ratio = float("+inf")
        else:
            # Neither forward nor reverse signal
            ratio = None
        if ratio is not None:
            if ratio < 0.2:
                strandedness = "reverse"
            elif ratio > 5 or ratio == float("+inf"):
                strandedness = "forward"
            else:
                strandedness = "unstranded?"
        else:
            strandedness = "undetermined"
        data['ratio'] = ratio
        data['strandedness'] = strandedness
def get_organism_config(self, section=None, config=None):
    """
    Fetch the settings for an 'organism' section of the .ini file

    Looks up each of the recognised organism options in the
    named section (e.g. 'organism:Human') and returns them in
    an AttributeDictionary. The recognised options are:

    - star_index (str, path to STAR index)
    - bowtie_index (str, path to Bowtie index)
    - cellranger_reference (str)
    - cellranger_premrna_reference (str)
    - cellranger_atac_reference (str)
    - cellranger_arc_reference (str)

    If either 'section' or 'config' is not supplied then every
    option is set to None.

    Arguments:
      section (str): name of the section to retrieve the
        settings from
      config (Config): Config object with settings loaded

    Returns:
      AttributeDictionary: dictionary of option:value pairs.
    """
    option_names = (
        'star_index',
        'bowtie_index',
        'cellranger_reference',
        'cellranger_premrna_reference',
        'cellranger_atac_reference',
        'cellranger_arc_reference',
    )
    # Only consult the config when both pieces are available
    can_lookup = bool(section and config)
    values = AttributeDictionary()
    for name in option_names:
        if can_lookup:
            setting = config.get(section, name, None)
        else:
            setting = None
        values[name] = setting
    return values
def fetch_protocol_definition(name):
    """
    Look up a QC protocol and return its definition

    Arguments:
      name (str): name of the QC protocol

    Returns:
      Tuple: the pair (reads,qc_modules), where 'reads' is an
        AttributeDictionary with elements 'seq_data' (sequence
        data reads), 'index' (index reads) and 'qc' (sorted
        list of all reads for QC), and 'qc_modules' is a list
        of QC module definitions.

    Raises:
      KeyError: if 'name' is not a defined QC protocol.
      Exception: if the stored definition is missing an
        expected element.
    """
    if name not in QC_PROTOCOLS:
        raise KeyError("%s: undefined QC protocol" % name)
    definition = QC_PROTOCOLS[name]
    reads = AttributeDictionary()
    try:
        seq_data_reads = list(definition['reads']['seq_data'])
        reads['seq_data'] = seq_data_reads
        index_reads = list(definition['reads']['index'])
        reads['index'] = index_reads
        # All reads used for QC are the union of the two lists
        reads['qc'] = sorted(reads.seq_data + reads.index)
        qc_modules = list(definition['qc_modules'])
    except KeyError as ex:
        raise Exception("%s: exception loading QC protocol "
                        "definition: %s" % (name, ex))
    return (reads, qc_modules)
def add_section(self, section):
    """
    Register a new section (or subsection) on this object

    An identifier with exactly one ':' is treated as
    SECTION:SUBSECTION: the parent section is created first if
    it isn't already registered, then an empty
    AttributeDictionary is stored under the subsection name.
    Any other identifier is registered as a top-level section
    and exposed as an attribute holding an empty
    AttributeDictionary.

    Arguments:
      section (str): an identifier of the form
        SECTION[:SUBSECTION] which specifies the section
        to add
    """
    pieces = section.split(':')
    if len(pieces) != 2:
        # Plain section name (anything that doesn't split into
        # exactly two parts is stored under its full name)
        self._sections.append(section)
        setattr(self, section, AttributeDictionary())
        return
    parent, subsection = pieces
    if parent not in self._sections:
        self.add_section(parent)
    getattr(self, parent)[subsection] = AttributeDictionary()
def get_bcl2fastq_config(self,section,config):
    """
    Fetch bcl2fastq settings from a section of the .ini file

    Reads the bcl2fastq options from the named section (e.g.
    'bcl2fastq', 'platform:miseq') and returns them in an
    AttributeDictionary.

    For the 'bcl2fastq' section the options are:

    - default_version (default None)
    - nprocessors (default 1)
    - no_lane_splitting (default False)
    - create_empty_fastqs (default True)

    For any other section the options are:

    - bcl2fastq
    - nprocessors
    - no_lane_splitting
    - create_empty_fastqs

    all of which default to None.

    Arguments:
      section (str): name of the section to retrieve the
        settings from
      config (Config): Config object with settings loaded

    Returns:
      AttributeDictionary: dictionary of option:value pairs.
    """
    # The main section and the other sections differ only in the
    # name of the version option and in the fallback values
    if section == 'bcl2fastq':
        version_option = 'default_version'
        nprocessors_default = 1
        no_lane_splitting_default = False
        create_empty_fastqs_default = True
    else:
        version_option = 'bcl2fastq'
        nprocessors_default = None
        no_lane_splitting_default = None
        create_empty_fastqs_default = None
    values = AttributeDictionary()
    values[version_option] = config.get(section,version_option,None)
    values['nprocessors'] = config.getint(section,
                                          'nprocessors',
                                          nprocessors_default)
    values['no_lane_splitting'] = config.getboolean(
        section,
        'no_lane_splitting',
        no_lane_splitting_default)
    values['create_empty_fastqs'] = config.getboolean(
        section,
        'create_empty_fastqs',
        create_empty_fastqs_default)
    return values
def get_sequencer_config(self, section, config):
    """
    Retrieve 'sequencer' configuration options from .ini file

    Given the name of a section (e.g. 'sequencer:SN7001250'),
    fetch the data associated with the sequencer instrument
    and return in an AttributeDictionary object.

    The items that can be extracted are:

    - platform (compulsory, str)
    - model (str, default 'None')

    Any pairs of enclosing quote characters (single or double)
    are stripped from the 'model' value.

    Arguments:
      section (str): name of the section to retrieve the
        settings from
      config (Config): Config object with settings loaded

    Returns:
      AttributeDictionary: dictionary of option:value pairs.

    Raises:
      Exception: if the section doesn't define 'platform'.
    """
    values = AttributeDictionary()
    values['platform'] = config.get(section, 'platform', None)
    values['model'] = config.get(section, 'model', None)
    if values['platform'] is None:
        raise Exception("%s: missing required 'platform'" % section)
    if values['model']:
        # Strip quotes
        quote_chars = ('"', '\'',)
        model = values['model']
        # Need at least two characters to have an enclosing pair;
        # the length check also stops the loop once the value has
        # been stripped down to an empty string (e.g. model = "")
        while len(model) >= 2 and \
              model[0] in quote_chars and \
              model[-1] in quote_chars:
            model = model[1:-1]
        values['model'] = model
    return values
def get_destination_config(self, section, config):
    """
    Fetch data transfer settings for a 'destination' section

    Reads the options from the named section of the .ini file
    (e.g. 'destination:webserver') and returns them in an
    AttributeDictionary.

    String options (each read with a default of None):

    - directory
    - subdir
    - readme_template
    - url

    Boolean options (each read with a default of False):

    - include_downloader
    - include_qc_report
    - hard_links

    Arguments:
      section (str): name of the section to retrieve the
        settings from
      config (Config): Config object with settings loaded

    Returns:
      AttributeDictionary: dictionary of option:value pairs.
    """
    text_options = ('directory',
                    'subdir',
                    'readme_template',
                    'url',)
    flag_options = ('include_downloader',
                    'include_qc_report',
                    'hard_links',)
    values = AttributeDictionary()
    for option in text_options:
        values[option] = config.get(section, option, None)
    for option in flag_options:
        values[option] = config.getboolean(section, option, False)
    return values
def analyse(self,lane=None,sample_sheet=None,cutoff=None,
            mismatches=0):
    """
    Analyse barcode frequencies

    Returns a dictionary with the following keys:

    - barcodes: list of barcodes (or reference barcodes,
      if mismatches > 0)
    - cutoff: the specified cutoff fraction
    - mismatches: the specified number of mismatches to
      allow
    - total_reads: the total number of reads for the
      specified lane (or all reads, if no lane was
      specified)
    - coverage: the number of reads after cutoffs have
      been applied
    - counts: dictionary with barcodes from the 'barcodes'
      list as keys; each key points to a dictionary with
      keys:
      * reads: number of reads associated with this
        barcode (or group, if mismatches > 0)
      * sample: name of the associated sample (if a
        sample sheet was supplied, otherwise 'None')
      * sequences: number of sequences in the group
        (always 1 if mismatches == 0)

    Arguments:
      lane (integer): lane to restrict analysis to (None
        analyses all lanes)
      sample_sheet (str): sample sheet file to compare
        barcodes against (None skips comparison)
      cutoff (float): if mismatches == 0 then barcodes must
        have at least this fraction of reads to be included;
        (if mismatches > 0 then this condition is applied to
        groups instead)
      mismatches (integer): number of mismatches to allow
        when grouping barcodes (default is 0, in which case
        no grouping is done and barcodes are reported
        individually)

    Returns:
      AttributeDictionary: the analysis results, with the
        keys described above.
    """
    if sample_sheet is not None:
        sample_sheet = SampleSheetBarcodes(sample_sheet)
        sample_sheet_barcodes = sample_sheet.barcodes(lane)
    else:
        sample_sheet_barcodes = None
    if not mismatches:
        # No grouping: report individual barcodes
        groups = None
        barcodes = self.filter_barcodes(cutoff=cutoff,lane=lane)
    else:
        # Group similar barcodes, seeded from the sample sheet
        groups = self.group(lane,mismatches=mismatches,
                            seed_barcodes=sample_sheet_barcodes,
                            cutoff=cutoff)
        barcodes = [grp.reference for grp in groups]
    analysis = AttributeDictionary(
        barcodes=barcodes,
        cutoff=cutoff,
        counts=dict(),
        total_reads=self.nreads(lane=lane),
        mismatches=mismatches
    )
    cum_reads = 0
    if groups:
        for group in groups:
            barcode = group.reference
            barcode_reads = group.counts
            cum_reads += barcode_reads
            try:
                # Exact match
                sample = sample_sheet.lookup_sample(barcode,lane)
            except KeyError:
                # Closest match(es)
                sample = []
                for seq in sample_sheet.barcodes(lane):
                    if group.match(seq,mismatches):
                        sample.append(
                            sample_sheet.lookup_sample(seq,lane))
                if sample:
                    sample = ','.join(sample)
                else:
                    sample = None
            except AttributeError:
                # No sample sheet
                sample = None
            analysis.counts[barcode] = AttributeDictionary(
                reads=barcode_reads,
                sample=sample,
                sequences=len(group)
            )
    else:
        for barcode in barcodes:
            barcode_reads = self.counts(barcode,lane)
            cum_reads += barcode_reads
            try:
                sample = sample_sheet.lookup_sample(barcode,lane)
            except (KeyError,AttributeError):
                # Barcode not in sample sheet, or no sample sheet
                sample = None
            analysis.counts[barcode] = AttributeDictionary(
                reads=barcode_reads,
                sample=sample,
                sequences=1
            )
    analysis['coverage'] = cum_reads
    return analysis
def __init__(self, settings_file=None):
    """
    Create new Settings instance

    If 'settings_file' is specified then this should be the
    full path to an appropriately formatted '.ini' file.

    Otherwise the class will attempt to locate an appropriate
    file to use: by default this will be a file called
    'auto_process.ini' which will exist somewhere in the search
    path defined by the 'locate_settings_file' function; if no
    file with this name can be found then the class will
    fallback to looking for a file with the older
    'settings.ini' file name.

    If no settings file can be located at all then the
    'auto_process.ini.sample' file from the config directory
    is read instead.

    Each group of settings is registered via 'add_section' and
    then populated from the config file. Settings from
    deprecated sections are merged into the current sections,
    with a warning logged when this happens.

    Arguments:
      settings_file (str): optional, path to the '.ini' file
        to load the settings from
    """
    # Initialise list of sections
    self._sections = []
    # Locate settings file
    if settings_file is None:
        # Look for default
        self.settings_file = locate_settings_file(name="auto_process.ini",
                                                  create_from_sample=False)
        if self.settings_file is None:
            # Fallback to old name
            self.settings_file = locate_settings_file(
                name="settings.ini",
                create_from_sample=False)
    else:
        self.settings_file = os.path.abspath(settings_file)
    # Import site-specific settings from local version
    config = Config()
    if self.settings_file:
        config.read(self.settings_file)
    else:
        # Look for sample settings file
        config.read(
            os.path.join(get_config_dir(), 'auto_process.ini.sample'))
    # General parameters
    self.add_section('general')
    # Raw value of the default runner setting; used further down
    # as the fallback when defining the job-specific runners
    default_runner = config.get('general', 'default_runner',
                                'SimpleJobRunner')
    self.general['default_runner'] = config.getrunner(
        'general', 'default_runner', 'SimpleJobRunner')
    self.general['max_concurrent_jobs'] = config.getint(
        'general', 'max_concurrent_jobs', 12)
    self.general['max_cores'] = config.getint('general', 'max_cores')
    self.general['max_batches'] = config.getint('general', 'max_batches')
    self.general['poll_interval'] = config.getfloat(
        'general', 'poll_interval', 5)
    # modulefiles
    self.add_section('modulefiles')
    self.modulefiles['make_fastqs'] = config.get('modulefiles',
                                                 'make_fastqs')
    self.modulefiles['bcl2fastq'] = config.get('modulefiles', 'bcl2fastq')
    self.modulefiles['bcl_convert'] = config.get('modulefiles',
                                                 'bcl_convert')
    self.modulefiles['cellranger_mkfastq'] = config.get(
        'modulefiles', 'cellranger_mkfastq')
    self.modulefiles['cellranger_atac_mkfastq'] = config.get(
        'modulefiles', 'cellranger_atac_mkfastq')
    self.modulefiles['cellranger_arc_mkfastq'] = config.get(
        'modulefiles', 'cellranger_arc_mkfastq')
    self.modulefiles['spaceranger_mkfastq'] = config.get(
        'modulefiles', 'spaceranger_mkfastq')
    self.modulefiles['run_qc'] = config.get('modulefiles', 'run_qc')
    self.modulefiles['publish_qc'] = config.get('modulefiles',
                                                'publish_qc')
    self.modulefiles['process_icell8'] = config.get(
        'modulefiles', 'process_icell8')
    self.modulefiles['fastqc'] = config.get('modulefiles', 'fastqc')
    self.modulefiles['fastq_screen'] = config.get('modulefiles',
                                                  'fastq_screen')
    self.modulefiles['fastq_strand'] = config.get('modulefiles',
                                                  'fastq_strand')
    self.modulefiles['cellranger'] = config.get('modulefiles',
                                                'cellranger')
    self.modulefiles['report_qc'] = config.get('modulefiles', 'report_qc')
    self.modulefiles['cutadapt'] = config.get('modulefiles', 'cutadapt')
    # Handle legacy 'illumina_qc' modulefile
    # (only used to fill in 'fastqc' and 'fastq_screen' when those
    # haven't been set explicitly)
    legacy_illumina_qc_modulefiles = config.get('modulefiles',
                                                'illumina_qc')
    if legacy_illumina_qc_modulefiles:
        if not self.modulefiles['fastqc']:
            logger.warning("Setting 'fastqc' modulefile parameter "
                           "using deprecated 'illumina_qc' parameter")
            self.modulefiles['fastqc'] = legacy_illumina_qc_modulefiles
        if not self.modulefiles['fastq_screen']:
            logger.warning("Setting 'fastq_screen' modulefile parameter "
                           "using deprecated 'illumina_qc' parameter")
            self.modulefiles['fastq_screen'] = \
                legacy_illumina_qc_modulefiles
    # conda
    self.add_section('conda')
    self.conda['enable_conda'] = config.getboolean('conda',
                                                   'enable_conda', False)
    self.conda['env_dir'] = config.get('conda', 'env_dir', None)
    if self.conda['env_dir']:
        # Expand any environment variables in the path
        self.conda['env_dir'] = os.path.expandvars(self.conda.env_dir)
    # bcl_conversion
    self.add_section('bcl_conversion')
    # Add settings from legacy bcl2fastq section first
    self.bcl_conversion = self.get_bcl_converter_config(
        'bcl2fastq', config)
    # Update with settings from bcl_conversion section
    # (the return value is discarded here, so this relies on the
    # existing dictionary being passed in and updated)
    self.get_bcl_converter_config('bcl_conversion', config,
                                  self.bcl_conversion)
    # qc
    self.add_section('qc')
    self.qc['nprocessors'] = config.getint('qc', 'nprocessors', None)
    self.qc['fastq_screens'] = config.get('qc', 'fastq_screens', None)
    self.qc['fastq_screen_subset'] = config.getint('qc',
                                                   'fastq_screen_subset',
                                                   100000)
    self.qc['use_legacy_screen_names'] = \
        config.getboolean(
            'qc', 'use_legacy_screen_names', False)
    # Fastq screens
    # (one entry per 'screen:NAME' section, keyed by NAME)
    self.add_section('screens')
    for section in filter(lambda x: x.startswith('screen:'),
                          config.sections()):
        screen = section.split(':')[1]
        self.screens[screen] = AttributeDictionary(conf_file=None)
        self.screens[screen]['conf_file'] = config.get(
            section, 'conf_file', None)
    # Organisms
    # (one entry per 'organism:NAME' section, keyed by NAME)
    self.add_section('organisms')
    for section in filter(lambda x: x.startswith('organism:'),
                          config.sections()):
        organism = section.split(':')[1]
        self.organisms[organism] = self.get_organism_config(
            section, config)
    # Handle legacy STAR index specifications (fastq_strand_indexes)
    try:
        for organism, index_file in config.items('fastq_strand_indexes'):
            if organism not in self.organisms:
                self.organisms[organism] = self.get_organism_config()
            self['organisms'][organism]['star_index'] = index_file
        logger.warning("Added STAR index information from "
                       "deprecated 'fastq_strand_indexes' section (use "
                       "'organism:ORGANISM' sections instead)")
    except NoSectionError:
        pass
    # Legacy 10xgenomics transcriptome references
    try:
        for organism, reference in config.items(
                '10xgenomics_transcriptomes'):
            if organism not in self.organisms:
                self.organisms[organism] = self.get_organism_config()
            self['organisms'][organism]['cellranger_reference'] = reference
        logger.warning("Added cellranger references from deprecated "
                       "'10xgenomics_transcriptomes' section (use "
                       "'organism:ORGANISM' sections instead)")
    except NoSectionError:
        pass
    # Legacy 10xgenomics snRNA-seq pre-mRNA references
    try:
        for organism, reference in config.items(
                '10xgenomics_premrna_references'):
            if organism not in self.organisms:
                self.organisms[organism] = self.get_organism_config()
            self['organisms'][organism][
                'cellranger_premrna_reference'] = reference
        logger.warning("Added cellranger pre-mRNA references from "
                       "deprecated '10xgenomics_premrna_references' "
                       "section (use 'organism:ORGANISM' sections "
                       "instead)")
    except NoSectionError:
        pass
    # Legacy 10xgenomics scATAC-seq genome references
    try:
        for organism, reference in config.items(
                '10xgenomics_atac_genome_references'):
            if organism not in self.organisms:
                self.organisms[organism] = self.get_organism_config()
            self['organisms'][organism][
                'cellranger_atac_reference'] = reference
        logger.warning("Added cellranger-atac references from deprecated "
                       "'10xgenomics_atac_genome_references' section "
                       "(use 'organism:ORGANISM' sections instead)")
    except NoSectionError:
        pass
    # Legacy 10xGenomics cellranger ARC single cell multiome references
    try:
        for organism, reference in config.items(
                '10xgenomics_multiome_references'):
            if organism not in self.organisms:
                self.organisms[organism] = self.get_organism_config()
            self['organisms'][organism][
                'cellranger_arc_reference'] = reference
        logger.warning("Added cellranger-arc references from deprecated "
                       "'10xgenomics_multiome_references' section "
                       "(use 'organism:ORGANISM' sections instead)")
    except NoSectionError:
        pass
    # Sequencers
    # (one entry per 'sequencer:INSTRUMENT' section)
    self.add_section('sequencers')
    for section in filter(lambda x: x.startswith('sequencer:'),
                          config.sections()):
        instrument = section.split(':')[1]
        self.sequencers[instrument] = self.get_sequencer_config(
            section, config)
    # Add any settings legacy 'sequencers' section
    # (these override the platform for instruments already defined)
    try:
        for instrument, platform in config.items('sequencers'):
            if instrument not in self.sequencers:
                self['sequencers'][instrument] = \
                    AttributeDictionary(platform=None, model=None)
            self['sequencers'][instrument]['platform'] = platform
        logger.warning("Added sequencer information from "
                       "deprecated 'sequencers' section (use "
                       "'sequencer:INSTRUMENT' sections "
                       "instead)")
    except NoSectionError:
        pass
    # Sequencing platform-specific defaults
    self.add_section('platform')
    for section in filter(lambda x: x.startswith('platform:'),
                          config.sections()):
        platform = section.split(':')[1]
        self.platform[platform] = self.get_bcl_converter_config(
            section, config)
    # Handle deprecated bcl2fastq settings
    for platform in ('hiseq', 'miseq', 'nextseq'):
        if config.has_option('bcl2fastq', platform):
            logger.warning("Deprecated setting in [bcl2fastq]: '%s'"
                           % platform)
        try:
            # A 'bcl2fastq' value already stored for the platform
            # takes precedence over the deprecated setting
            bcl2fastq = self.platform[platform]['bcl2fastq']
        except KeyError:
            bcl2fastq = config.get('bcl2fastq', platform)
            if bcl2fastq is None:
                continue
            logger.warning(
                "Setting 'bcl2fastq' in '[platform:%s]' to '%s'"
                % (platform, bcl2fastq))
            if platform not in self.platform:
                self.platform[platform] = AttributeDictionary()
            self.platform[platform]['bcl2fastq'] = bcl2fastq
    # Metadata defaults
    self.add_section('metadata')
    self.metadata['default_data_source'] = config.get(
        'metadata', 'default_data_source')
    # icell8
    self.add_section('icell8')
    self.icell8['aligner'] = config.get('icell8', 'aligner')
    self.icell8['batch_size'] = config.getint('icell8', 'batch_size',
                                              5000000)
    self.icell8['mammalian_conf_file'] = config.get(
        'icell8', 'mammalian_conf_file')
    self.icell8['contaminants_conf_file'] = config.get(
        'icell8', 'contaminants_conf_file')
    self.icell8['nprocessors_contaminant_filter'] = config.getint(
        'icell8', 'nprocessors_contaminant_filter', None)
    self.icell8['nprocessors_statistics'] = config.getint(
        'icell8', 'nprocessors_statistics', None)
    # 10xgenomics
    # (item access is used as the section name isn't a valid
    # Python identifier)
    self.add_section('10xgenomics')
    self['10xgenomics']['cellranger_jobmode'] = config.get(
        '10xgenomics', 'cellranger_jobmode', 'local')
    self['10xgenomics']['cellranger_maxjobs'] = config.getint(
        '10xgenomics', 'cellranger_maxjobs', 24)
    self['10xgenomics']['cellranger_mempercore'] = config.getint(
        '10xgenomics', 'cellranger_mempercore', 5)
    self['10xgenomics']['cellranger_jobinterval'] = config.getint(
        '10xgenomics', 'cellranger_jobinterval', 100)
    self['10xgenomics']['cellranger_localmem'] = config.getint(
        '10xgenomics', 'cellranger_localmem', 5)
    self['10xgenomics']['cellranger_localcores'] = config.getint(
        '10xgenomics', 'cellranger_localcores', None)
    # fastq_stats
    self.add_section('fastq_stats')
    self.fastq_stats['nprocessors'] = config.getint(
        'fastq_stats', 'nprocessors', None)
    # Define runners for specific jobs
    self.add_section('runners')
    for name in (
            'bcl2fastq',
            'bcl_convert',
            'qc',
            'star',
            'stats',
            'rsync',
            'icell8',
            'icell8_contaminant_filter',
            'icell8_statistics',
            'icell8_report',
            'cellranger',
    ):
        self.runners[name] = config.getrunner('runners', name,
                                              default_runner)
    # Handle new runners that default to the 'qc' runner
    # NOTE: 'star' is also in the list above, so the value set
    # here replaces the one assigned in the previous loop
    for name in (
            'fastqc',
            'fastq_screen',
            'star',
    ):
        self.runners[name] = config.getrunner('runners', name,
                                              self.runners.qc)
    # Information for archiving analyses
    # dirn should be a directory in the form [[user@]host:]path]
    self.add_section('archive')
    self.archive['dirn'] = config.get('archive', 'dirn', None)
    self.archive['log'] = config.get('archive', 'log', None)
    self.archive['group'] = config.get('archive', 'group', None)
    self.archive['chmod'] = config.get('archive', 'chmod', None)
    # Information for uploading QC reports
    # dirn should be a directory in the form [[user@]host:]path]
    self.add_section('qc_web_server')
    self.qc_web_server['dirn'] = config.get('qc_web_server', 'dirn', None)
    self.qc_web_server['url'] = config.get('qc_web_server', 'url', None)
    self.qc_web_server['use_hierarchy'] = config.getboolean(
        'qc_web_server', 'use_hierarchy')
    self.qc_web_server['exclude_zip_files'] = config.getboolean(
        'qc_web_server', 'exclude_zip_files')
    # Templates for reporting project data
    self.add_section('reporting_templates')
    try:
        for template, fields in config.items('reporting_templates'):
            self['reporting_templates'][template] = fields
    except NoSectionError:
        logger.debug("No reporting templates defined")
    # Destinations for data transfer
    # (one entry per 'destination:NAME' section, keyed by NAME)
    self.add_section('destination')
    for section in filter(lambda x: x.startswith('destination:'),
                          config.sections()):
        dest = section.split(':')[1]
        self.destination[dest] = self.get_destination_config(
            section, config)
def get_bcl_converter_config(self, section, config, attr_dict=None):
    """
    Retrieve BCL conversion configuration options from .ini file

    Given the name of a section (e.g. 'bcl_conversion',
    'platform:miseq'), fetch the BCL converter settings and
    return in an AttributeDictionary object.

    The options that can be extracted are:

    - bcl_converter
    - nprocessors
    - no_lane_splitting
    - create_empty_fastqs

    There are also some legacy options:

    - default_version
    - bcl2fastq

    If an existing dictionary is supplied via 'attr_dict' then
    it is updated in place and returned: a value already present
    is only replaced when the section supplies a new one, and
    options that are absent from both are set to None. (For the
    deprecated 'bcl2fastq' section, 'bcl_converter' is only
    set when 'default_version' is defined.)

    Arguments:
      section (str): name of the section to retrieve the
        settings from
      config (Config): Config object with settings loaded
      attr_dict (AttributeDictionary): optional, existing
        AttributeDictionary which will be added to (may be
        empty)

    Returns:
      AttributeDictionary: dictionary of option:value pairs
        (this is 'attr_dict' itself, if that was supplied).
    """
    # Test against None rather than truthiness: an empty
    # dictionary supplied by the caller must still be the one
    # that gets populated (callers may discard the return value
    # and rely on the in-place update)
    if attr_dict is not None:
        values = attr_dict
    else:
        values = AttributeDictionary()
    if section == 'bcl2fastq':
        # Deprecated [bcl2fastq] section
        value = config.get(section, 'default_version', None)
        if value:
            values['bcl_converter'] = "bcl2fastq%s" % value
    else:
        # [bcl_conversion] and [platform:...] sections
        bcl2fastq = config.get(section, 'bcl2fastq', None)
        value = config.get(section, 'bcl_converter', None)
        if value:
            values['bcl_converter'] = value
        elif bcl2fastq is not None:
            # Legacy 'bcl2fastq' option is only used when no
            # 'bcl_converter' is given
            values['bcl_converter'] = "bcl2fastq%s" % bcl2fastq
        elif 'bcl_converter' not in values:
            values['bcl_converter'] = None
    # Common settings
    value = config.getint(section, 'nprocessors', None)
    # NOTE: truthiness test, so a value of 0 doesn't replace
    # an existing setting
    if value or 'nprocessors' not in values:
        values['nprocessors'] = value
    value = config.getboolean(section, 'no_lane_splitting', None)
    if value is not None or 'no_lane_splitting' not in values:
        values['no_lane_splitting'] = value
    value = config.getboolean(section, 'create_empty_fastqs', None)
    if value is not None or 'create_empty_fastqs' not in values:
        values['create_empty_fastqs'] = value
    return values
def analyse(self, lane=None, sample_sheet=None, cutoff=None,
            mismatches=0, minimum_read_fraction=0.000001):
    """
    Analyse barcode frequencies

    Returns a dictionary with the following keys:

    - barcodes: list of barcodes (or reference barcodes,
      if mismatches > 0)
    - cutoff: the specified cutoff fraction
    - mismatches: the specified number of mismatches to
      allow
    - total_reads: the total number of reads for the
      specified lane (or all reads, if no lane was
      specified)
    - coverage: the number of reads after cutoffs have
      been applied
    - counts: dictionary with barcodes from the 'barcodes'
      list as keys; each key points to a dictionary with
      keys:
      * reads: number of reads associated with this
        barcode (or group, if mismatches > 0)
      * sample: name of the associated sample (if a
        sample sheet was supplied, otherwise 'None')
      * sequences: number of sequences in the group
        (always 1 if mismatches == 0)

    Arguments:
      lane (integer): lane to restrict analysis to (None
        analyses all lanes)
      sample_sheet (str): sample sheet file to compare
        barcodes against (None skips comparison)
      cutoff (float): if mismatches == 0 then barcodes must
        have at least this fraction of reads to be included;
        (if mismatches > 0 then this condition is applied to
        groups instead)
      mismatches (integer): maximum number of mismatched bases
        allowed when matching barcodes (default is 0 i.e.
        exact matches only)
      minimum_read_fraction (float): speed-up parameter,
        excludes barcodes with less than this fraction of
        associated reads (speeds up the grouping calculation
        at the cost of some precision); only passed on when
        mismatches > 0

    Returns:
      AttributeDictionary: the analysis results, with the
        keys described above.
    """
    if sample_sheet is not None:
        sample_sheet = SampleSheetBarcodes(sample_sheet)
        sample_sheet_barcodes = sample_sheet.barcodes(lane)
    else:
        sample_sheet_barcodes = None
    if not mismatches:
        # No grouping: report individual barcodes
        groups = None
        barcodes = self.filter_barcodes(cutoff=cutoff, lane=lane)
    else:
        # Group similar barcodes, seeded from the sample sheet
        groups = self.group(lane, mismatches=mismatches,
                            seed_barcodes=sample_sheet_barcodes,
                            cutoff=cutoff,
                            minimum_read_fraction=minimum_read_fraction)
        barcodes = [grp.reference for grp in groups]
    analysis = AttributeDictionary(barcodes=barcodes,
                                   cutoff=cutoff,
                                   counts=dict(),
                                   total_reads=self.nreads(lane=lane),
                                   mismatches=mismatches)
    cum_reads = 0
    if groups:
        for group in groups:
            barcode = group.reference
            barcode_reads = group.counts
            cum_reads += barcode_reads
            try:
                # Exact match
                sample = sample_sheet.lookup_sample(barcode, lane)
            except KeyError:
                # Closest match(es)
                sample = []
                for seq in sample_sheet.barcodes(lane):
                    if group.match(seq, mismatches):
                        sample.append(sample_sheet.lookup_sample(
                            seq, lane))
                if sample:
                    sample = ','.join(sample)
                else:
                    sample = None
            except AttributeError:
                # No sample sheet
                sample = None
            analysis.counts[barcode] = AttributeDictionary(
                reads=barcode_reads,
                sample=sample,
                sequences=len(group))
    else:
        for barcode in barcodes:
            barcode_reads = self.counts(barcode, lane)
            cum_reads += barcode_reads
            try:
                sample = sample_sheet.lookup_sample(barcode, lane)
            except (KeyError, AttributeError):
                # Barcode not in sample sheet, or no sample sheet
                sample = None
            analysis.counts[barcode] = AttributeDictionary(
                reads=barcode_reads,
                sample=sample,
                sequences=1)
    analysis['coverage'] = cum_reads
    return analysis
def verify(self,fastqs,qc_protocol,fastq_screens=None,
           cellranger_version=None,cellranger_refdata=None,
           cellranger_use_multi_config=None):
    """
    Check the QC outputs for a set of Fastqs against a protocol

    Each QC module in the protocol definition is verified in
    turn (using the supplied parameters, overridden by any
    parameters given in the module specification), and a
    summary of the parameters and the status of each module is
    printed.

    Arguments:
      fastqs (list): list of Fastqs to verify outputs for
      qc_protocol (str): QC protocol to verify against
      fastq_screens (list): list of panel names to verify
        FastqScreen outputs against
      cellranger_version (str): specific version of 10x
        package to check for
      cellranger_refdata (str): specific 10x reference
        dataset to check for
      cellranger_use_multi_config (bool): if True then
        cellranger count verification will attempt to
        use data (GEX samples and reference dataset)
        from the '10x_multi_config.csv' file

    Returns:
      Boolean: True if all expected outputs are present,
        False otherwise.
    """
    def report_parameter(parameter,value):
        # Print a single 'name: value' line of the parameter table
        print("{parameter:21s}: {value}".format(parameter=parameter,
                                                value=value))

    def report_parameter_list(parameter,items):
        # Print a multi-valued parameter, one value per line, with
        # the name only appearing against the first value
        if not items:
            report_parameter(parameter,'')
            return
        report_parameter(parameter,items[0])
        for item in items[1:]:
            report_parameter('',item)

    def report_module(name,passed,params):
        # Print a single 'name: status params' line
        print("{name:21s}: {status:4s}{params}".format(
            name=name,
            status=('PASS' if passed else 'FAIL'),
            params=params))

    # Look up protocol definition
    reads,qc_modules = fetch_protocol_definition(qc_protocol)
    # Unique sample names, in sorted order
    samples = sorted({self.fastq_attrs(fq).sample_name
                      for fq in fastqs})
    # Default parameters for verification
    default_params = dict(
        fastqs=fastqs,
        samples=samples,
        seq_data_reads=reads.seq_data,
        qc_reads=reads.qc,
        fastq_screens=fastq_screens,
        cellranger_version=cellranger_version,
        cellranger_refdata=cellranger_refdata,
        cellranger_use_multi_config=cellranger_use_multi_config
    )
    # Verify each QC module in turn
    verified = dict()
    params_for_module = dict()
    for module_spec in qc_modules:
        module_name,module_params = parse_qc_module_spec(module_spec)
        # Keep a copy of the module-specific parameters for reporting
        params_for_module[module_name] = dict(**module_params)
        # Start from the defaults, then apply the overrides from
        # the module specification
        params = AttributeDictionary(**default_params)
        for key in module_params:
            params[key] = module_params[key]
        verified[module_name] = self.verify_qc_module(module_name,
                                                      **params)
    # Report parameters and status of checks
    header_rule = "-"*(10+len(self.qc_dir))
    table_rule = "-"*27
    print(header_rule)
    print("QC dir  : %s" % self.qc_dir)
    print("Protocol: %s" % qc_protocol)
    print(header_rule)
    print("Parameters:")
    for p in default_params:
        value = default_params[p]
        if p == 'fastqs':
            # Only show the file names, not the full paths
            report_parameter_list(
                p,
                ['.../%s' % os.path.basename(fq) for fq in value])
        elif p == 'samples':
            report_parameter_list(p,value)
        elif p == 'cellranger_refdata':
            if value:
                report_parameter(p,'.../%s' % os.path.basename(value))
            else:
                report_parameter(p,value)
        else:
            report_parameter(p,value)
    print(table_rule)
    for name in verified:
        extra = params_for_module[name]
        report_module(name,
                      verified[name],
                      (" %s" % extra if extra else ''))
    # Overall status requires every module to have passed
    status = all(verified.values())
    print(table_rule)
    report_module("QC STATUS",status,'')
    print(table_rule)
    # Return verification status
    return status
if 'Nreads_contaminant_filtered' in cols: contaminant_filtered = True else: logging.warning("No stats on contaminant filtering") contaminant_filtered = False # Rename the '#Barcodes' and '%reads_poly_g' columns df.rename(columns={ '#Barcode': 'Barcode', '%reads_poly_g': 'percent_poly_g' }, inplace=True) print df.head() # Gather the data data = AttributeDictionary() # Total reads data['total_reads'] = df['Nreads'].sum() # Total assigned reads df = df.drop(df[df['Barcode'] == 'Unassigned'].index) data['total_assigned_reads'] = df['Nreads'].sum() # Mean and median reads per barcode data['median_read_count'] = df['Nreads'].median() data['mean_read_count'] = df['Nreads'].mean() data['std_read_count'] = df['Nreads'].std() # Number of barcodes (total and assigned) data['total_barcodes'] = len(df) data['assigned_barcodes'] = len(df[df['Nreads'] > 0])
def __init__(self,settings_file=None):
    """
    Set up a new Settings instance from a '.ini' file

    When 'settings_file' is given it should be the full path to
    an appropriately formatted '.ini' file; otherwise the
    'locate_settings_file' function is used to look for one.
    If no settings file is found then the
    'settings.ini.sample' file from the config directory is
    read instead.

    Each group of settings is registered via 'add_section' and
    populated from the config file.

    Arguments:
      settings_file (str): optional, path to the '.ini' file
        to load the settings from
    """
    # Initialise list of sections
    self._sections = []
    # Work out which settings file to use
    if settings_file is not None:
        self.settings_file = os.path.abspath(settings_file)
    else:
        self.settings_file = locate_settings_file(create_from_sample=False)
    # Load the settings, falling back to the sample file
    config = Config()
    ini_file = self.settings_file
    if not ini_file:
        ini_file = os.path.join(get_config_dir(),'settings.ini.sample')
    config.read(ini_file)
    # General parameters
    self.add_section('general')
    default_runner = config.get('general','default_runner',
                                'SimpleJobRunner')
    self.general['default_runner'] = config.getrunner(
        'general','default_runner','SimpleJobRunner')
    self.general['max_concurrent_jobs'] = config.getint(
        'general','max_concurrent_jobs',12)
    # modulefiles
    self.add_section('modulefiles')
    for option in ('make_fastqs','run_qc','process_icell8',):
        self.modulefiles[option] = config.get('modulefiles',option)
    # bcl2fastq
    self.add_section('bcl2fastq')
    self.bcl2fastq = self.get_bcl2fastq_config('bcl2fastq',config)
    # qc
    self.add_section('qc')
    self.qc['nprocessors'] = config.getint('qc','nprocessors',1)
    self.qc['fastq_screen_subset'] = config.getint(
        'qc','fastq_screen_subset',100000)
    # Sequencing platform-specific defaults
    self.add_section('platform')
    for section in config.sections():
        if not section.startswith('platform:'):
            continue
        name = section.split(':')[1]
        self.platform[name] = self.get_bcl2fastq_config(section,config)
    # Handle deprecated bcl2fastq settings
    for platform in ('hiseq','miseq','nextseq',):
        if config.has_option('bcl2fastq',platform):
            logging.warning("Deprecated setting in [bcl2fastq]: '%s'"
                            % platform)
        try:
            # Nothing more to do if the platform already has a
            # 'bcl2fastq' setting
            bcl2fastq = self.platform[platform]['bcl2fastq']
        except KeyError:
            bcl2fastq = config.get('bcl2fastq',platform)
            if bcl2fastq is None:
                continue
            logging.warning("Setting 'bcl2fastq' in '[platform:%s]' "
                            "to '%s'" % (platform,bcl2fastq))
            if platform not in self.platform:
                self.platform[platform] = AttributeDictionary()
            self.platform[platform]['bcl2fastq'] = bcl2fastq
    # icell8
    self.add_section('icell8')
    self.icell8['aligner'] = config.get('icell8','aligner')
    self.icell8['batch_size'] = config.getint('icell8','batch_size',
                                              5000000)
    for option in ('mammalian_conf_file','contaminants_conf_file',):
        self.icell8[option] = config.get('icell8',option)
    for option in ('nprocessors_contaminant_filter',
                   'nprocessors_statistics',):
        self.icell8[option] = config.getint('icell8',option,1)
    # 10xgenomics
    self.add_section('10xgenomics')
    tenx = self['10xgenomics']
    tenx['cellranger_jobmode'] = config.get(
        '10xgenomics','cellranger_jobmode','sge')
    tenx['cellranger_mempercore'] = config.getint(
        '10xgenomics','cellranger_mempercore',5)
    tenx['cellranger_jobinterval'] = config.getint(
        '10xgenomics','cellranger_jobinterval',100)
    # fastq_stats
    self.add_section('fastq_stats')
    self.fastq_stats['nprocessors'] = config.getint(
        'fastq_stats','nprocessors',1)
    # Define runners for specific jobs
    self.add_section('runners')
    runner_names = ('bcl2fastq',
                    'qc',
                    'stats',
                    'rsync',
                    'icell8',
                    'icell8_contaminant_filter',
                    'icell8_statistics',)
    for name in runner_names:
        self.runners[name] = config.getrunner('runners',name,
                                              default_runner)
    # Information for archiving analyses
    # dirn should be a directory in the form [[user@]host:]path]
    self.add_section('archive')
    for option in ('dirn','log','group','chmod',):
        self.archive[option] = config.get('archive',option,None)
    # Information for uploading QC reports
    # dirn should be a directory in the form [[user@]host:]path]
    self.add_section('qc_web_server')
    for option in ('dirn','url',):
        self.qc_web_server[option] = config.get('qc_web_server',
                                                option,None)
    for option in ('use_hierarchy','exclude_zip_files',):
        self.qc_web_server[option] = config.getboolean('qc_web_server',
                                                       option)
def args(self):
    """
    Return the parameters that were supplied to the instance

    Returns:
      AttributeDictionary: a new dictionary built from the
        stored call arguments.
    """
    supplied = self._callargs
    return AttributeDictionary(**supplied)