def __init__(self, config, barcode_map=None):
    VariantSeqLib.__init__(self, config)
    BarcodeSeqLib.__init__(self, config, barcodevariant=True)
    try:
        if 'map file' in config['barcodes']:
            self.barcode_map = BarcodeMap(config['barcodes']['map file'])
        else:
            self.barcode_map = None
        self.set_filters(config['filters'], {'min quality': 0,
                                             'avg quality': 0,
                                             'chastity': False,
                                             'max mutations': len(self.wt_dna)})
    except KeyError as key:
        raise EnrichError("Missing required config value %s" % key, self.name)

    if self.barcode_map is None:      # not in local config
        if barcode_map is None:       # not provided on object creation
            raise EnrichError("Barcode map not specified", self.name)
        else:
            self.barcode_map = barcode_map

    self.counts['barcodes_unmapped'] = None
    self.filter_unmapped = True
def __init__(self, config):
    VariantSeqLib.__init__(self, config)
    try:
        self.forward = config['fastq']['forward']
        self.reverse = config['fastq']['reverse']
        self.fwd_start = int(config['overlap']['forward start'])
        self.rev_start = int(config['overlap']['reverse start'])
        self.overlap_length = int(config['overlap']['length'])
        self.trim = config['overlap']['overlap only']
        self.max_overlap_mismatches = int(config['overlap']['max mismatches'])

        if 'merge failure' in config['filters']:
            raise EnrichError("'merge failure' is not user-configurable",
                              self.name)
        self.set_filters(config['filters'], {'remove unresolvable': False,
                                             'min quality': 0,
                                             'avg quality': 0,
                                             'max mutations': len(self.wt_dna),
                                             'chastity': False,
                                             'merge failure': True})
    except KeyError as key:
        raise EnrichError("Missing required config value %s" % key, self.name)
    except ValueError as value:
        raise EnrichError("Invalid parameter value %s" % value, self.name)

    try:
        check_fastq(self.forward)
        check_fastq(self.reverse)
    except IOError as fqerr:
        raise EnrichError("FASTQ file error: %s" % fqerr, self.name)
def dump_data(self):
    """
    Save the :py:class:`pandas.DataFrame` objects as tab-separated files
    and set the data to ``None`` to save memory. The file names are
    stored for use by :py:meth:`restore_data`.
    """
    for key in self.df_dict.keys():
        try:
            output_dir = os.path.join(self.output_base, "dump",
                                      fix_filename(self.name))
        except AttributeError:
            raise EnrichError("No output directory specified for object",
                              self.name)
        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except OSError:
            raise EnrichError("Failed to create dump directory", self.name)
        fname = os.path.join(output_dir, fix_filename(key + ".tsv"))
        self.df_dict[key].to_csv(fname, sep="\t", na_rep="NaN",
                                 float_format="%.4g",
                                 index_label="sequence")
        self.df_file[key] = fname
        self.df_dict[key] = None
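# A minimal, self-contained sketch of the dump/restore round-trip that
# dump_data() relies on: write a frame to a tab-separated file, drop the
# in-memory copy, then read it back with the index restored from the
# "sequence" column. The directory name and frame contents are illustrative,
# not part of the Enrich API.
import os
import pandas as pd

output_dir = "dump_example"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

df = pd.DataFrame({'count': [10, 3]}, index=['AAAA', 'CCCC'])
fname = os.path.join(output_dir, "barcodes.tsv")
df.to_csv(fname, sep="\t", na_rep="NaN", float_format="%.4g",
          index_label="sequence")
df = None  # free the memory, as dump_data() does

# later, restore the frame from disk (the counterpart of restore_data())
df = pd.read_csv(fname, sep="\t", index_col="sequence")
print(df)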
def __init__(self, config): DataContainer.__init__(self, config) try: self.timepoint = int(config['timepoint']) except KeyError as key: raise EnrichError("Missing required config value '%s'" % key, self.name) except ValueError as value: raise EnrichError("Invalid parameter value %s" % value, self.name) if 'align variants' in config: if config['align variants']: self.aligner = Aligner() else: self.aligner = None else: self.aligner = None if 'report filtered reads' in config: self.report_filtered_reads = config['report filtered reads'] else: self.report_filtered_reads = self.verbose # initialize data self.counts = dict() # pandas dataframes self.counts_file = dict() # paths to saved counts self.filters = None # dictionary self.filter_stats = None # dictionary
def report_filter_stats(self):
    try:
        output_dir = os.path.join(self.output_base, fix_filename(self.name))
    except AttributeError:
        raise EnrichError("Invalid output directory specified for object",
                          self.name)
    try:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except OSError:
        raise EnrichError("Failed to create output directory", self.name)

    with open(os.path.join(output_dir, "filter_stats.txt"), "w") as handle:
        for key in sorted(self.filter_stats,
                          key=self.filter_stats.__getitem__, reverse=True):
            if key != 'total':
                print(DataContainer._filter_messages[key],
                      self.filter_stats[key], sep="\t", file=handle)
        print('total', self.filter_stats['total'], sep="\t", file=handle)

    logging.info("Wrote filtering statistics [{name}]".format(name=self.name))
def write_data(self, directory=None, keys=None):
    """
    Save the :py:class:`pandas.DataFrame` objects as tab-separated files
    in a new subdirectory of *directory* with the same name as the
    object. If *directory* is ``None``, files will be saved to the
    object's default output directory.

    The optional *keys* parameter is a list of types of counts to be
    saved. By default, all counts are saved.
    """
    if keys is None:
        keys = self.df_dict.keys()
    for key in keys:
        try:
            output_dir = os.path.join(self.output_base,
                                      fix_filename(self.name))
        except AttributeError:
            raise EnrichError("Invalid output directory specified for object",
                              self.name)
        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except OSError:
            raise EnrichError("Failed to create output directory", self.name)
        fname = os.path.join(output_dir, fix_filename(key + ".tsv"))
        self.df_dict[key].to_csv(fname, sep="\t", na_rep="NaN",
                                 float_format="%.4g",
                                 index_label="sequence")
def __init__(self, config):
    VariantSeqLib.__init__(self, config)
    try:
        if 'forward' in config['fastq'] and 'reverse' in config['fastq']:
            raise EnrichError("Multiple FASTQ files specified", self.name)
        elif 'forward' in config['fastq']:
            self.reads = config['fastq']['forward']
            self.revcomp_reads = False
        elif 'reverse' in config['fastq']:
            self.reads = config['fastq']['reverse']
            self.revcomp_reads = True
        else:
            raise KeyError("'forward' or 'reverse'")
        self.set_filters(config['filters'], {'min quality': 0,
                                             'avg quality': 0,
                                             'chastity': False,
                                             'max mutations': len(self.wt_dna)})
    except KeyError as key:
        raise EnrichError("missing required config value: %s" % key, self.name)

    try:
        check_fastq(self.reads)
    except IOError as fqerr:
        raise EnrichError("FASTQ file error: %s" % fqerr, self.name)
def __init__(self, config, parent=True):
    if parent:
        SeqLib.__init__(self, config)
    self.wt_dna = None
    self.wt_protein = None
    self.aligner = None
    self.aligner_cache = None

    try:
        self.set_wt(config['wild type']['sequence'],
                    coding=config['wild type']['coding'])
        if 'align variants' in config:
            if config['align variants']:
                self.aligner = Aligner()
                self.aligner_cache = dict()
    except KeyError as key:
        raise EnrichError("Missing required config value '{key}'".format(key=key),
                          self.name)

    if 'reference offset' in config['wild type']:
        try:
            self.reference_offset = int(config['wild type']['reference offset'])
        except ValueError:
            raise EnrichError("Invalid reference offset value", self.name)
    else:
        self.reference_offset = 0

    self.df_dict['variants'] = None
def __init__(self, config):
    DataContainer.__init__(self, config)
    self.conditions = dict()
    self.control = None
    self.use_scores = True
    self.normalize_wt = False

    try:
        if 'normalize wt' in config:
            if config['normalize wt'] is True:
                self.normalize_wt = True
        for cnd in config['conditions']:
            if not cnd['label'].isalnum():
                raise EnrichError("Alphanumeric label required for condition "
                                  "'{label}'".format(label=cnd['label']),
                                  self.name)
            for sel_config in cnd['selections']:
                # assign output base if not present
                if 'output directory' not in sel_config:
                    sel_config['output directory'] = self.output_base
            if cnd['label'] not in self.conditions:
                self.conditions[cnd['label']] = [selection.Selection(x)
                                                 for x in cnd['selections']]
            else:
                raise EnrichError("Non-unique condition label '{label}'".format(
                                  label=cnd['label']), self.name)
            if 'control' in cnd:
                if cnd['control']:
                    if self.control is None:
                        self.control = self.conditions[cnd['label']]
                    else:
                        raise EnrichError("Multiple control conditions",
                                          self.name)
    except KeyError as key:
        raise EnrichError("Missing required config value {key}".format(key=key),
                          self.name)

    all_selections = list()
    for key in self.conditions:
        all_selections.extend(self.conditions[key])
    for dtype in all_selections[0].df_dict:
        if all(dtype in x.df_dict for x in all_selections):
            self.df_dict[dtype] = True
    if len(self.df_dict.keys()) == 0:
        raise EnrichError("No enrichment data present across all selections",
                          self.name)

    # ensure consistency for score usage
    if not all(x.use_scores for x in all_selections):
        self.use_scores = False

    # ensure consistency for wild type normalization
    for sel in all_selections:
        sel.normalize_wt = self.normalize_wt
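# An illustrative sketch (not the authoritative schema) of the 'conditions'
# block the Experiment constructor above expects: each condition carries an
# alphanumeric 'label', an optional 'control' flag, and a list of selection
# configs. Labels, paths, and the empty selection lists are placeholders.
example_experiment_config = {
    'name': 'ExampleExperiment',
    'output directory': '/path/to/output',
    'normalize wt': False,
    'conditions': [
        {'label': 'selected', 'control': False,
         'selections': []},   # Selection config dicts go here
        {'label': 'unselected', 'control': True,
         'selections': []},   # see the Selection constructor for their layout
    ],
}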
def __init__(self, mapfile): self.name = "barcodemap_{fname}".format( fname=os.path.basename(mapfile)) self.filename = mapfile self.variants = dict() self.bc_variant_strings = dict() try: ext = os.path.splitext(mapfile)[-1].lower() if ext in (".bz2"): handle = bz2.BZ2File(mapfile, "rU") elif ext in (".gz"): handle = gzip.GzipFile(mapfile, "rU") else: handle = open(mapfile, "rU") except IOError: raise EnrichError( "Could not open barcode map file '{fname}'".format( fname=mapfile), self.name) for line in handle: # skip comments and whitespace-only lines if len(line.strip()) == 0 or line[0] == '#': continue try: barcode, variant = line.strip().split() except ValueError: raise EnrichError("Unexpected barcode-variant line format", self.name) if not re.match("^[ACGTacgt]+$", barcode): raise EnrichError( "Barcode DNA sequence contains unexpected " "characters", self.name) if not re.match("^[ACGTNacgtn]+$", variant): raise EnrichError( "Variant DNA sequence contains unexpected " "characters", self.name) barcode = barcode.upper() variant = variant.upper() if barcode in self: if self[barcode] != variant: raise EnrichError( "Barcode '{bc}' assigned to multiple unique variants". format(bc=barcode), self.name) else: self[barcode] = variant handle.close()
def __init__(self, mapfile): self.name = "mapfile_%s" % mapfile try: handle = open(mapfile, "U") except IOError: raise EnrichError("Could not open barcode map file '%s'" \ % mapfile, self.name) self.filename = mapfile for line in handle: # skip comments and whitespace-only lines if len(line.strip()) == 0 or line[0] == '#': continue try: barcode, variant = line.strip().split() except ValueError: raise EnrichError("Unexpected barcode-variant line format", self.name) if not re.match("^[ACGTacgt]+$", barcode): raise EnrichError( "Barcode DNA sequence contains unexpected " "characters", self.name) if not re.match("^[ACGTNacgtn]+$", variant): raise EnrichError( "Variant DNA sequence contains unexpected " "characters", self.name) barcode = barcode.upper() variant = variant.upper() if barcode in self: if self[barcode] != variant: raise EnrichError( "Barcode '%s' assigned to multiple " "unique variants" % barcode, self.name) else: self[barcode] = variant handle.close() # build the variants dictionary self.variants = dict() for bc in self.keys(): if self[bc] not in self.variants: self.variants[self[bc]] = list() self.variants[self[bc]].append(bc) logging.info("Assigned %d barcodes to %d variants [%s]" % \ (len(self.keys()), len(self.variants.keys()), self.name))
def __init__(self, config, barcodevariant=False):
    self.barcodevariant = barcodevariant
    if not self.barcodevariant:
        SeqLib.__init__(self, config)
    try:
        if 'forward' in config['fastq'] and 'reverse' in config['fastq']:
            raise EnrichError("Multiple FASTQ files specified", self.name)
        elif 'forward' in config['fastq']:
            self.reads = config['fastq']['forward']
            self.revcomp_reads = False
        elif 'reverse' in config['fastq']:
            self.reads = config['fastq']['reverse']
            self.revcomp_reads = True
        else:
            raise KeyError("'forward' or 'reverse'")

        if 'start' in config['fastq']:
            self.bc_start = config['fastq']['start']
        else:
            self.bc_start = 1
        if 'length' in config['fastq']:
            self.bc_length = config['fastq']['length']
        else:
            self.bc_length = 2147483647  # longer than any read... for now

        if 'min count' in config['barcodes']:
            self.min_count = config['barcodes']['min count']
        else:
            self.min_count = 0

        self.set_filters(config['filters'], {'min quality': 0,
                                             'avg quality': 0,
                                             'chastity': False})
    except KeyError as key:
        raise EnrichError("Missing required config value {key}".format(key=key),
                          self.name)

    try:
        check_fastq(self.reads)
    except IOError as fqerr:
        raise EnrichError("FASTQ file error: {error}".format(error=fqerr),
                          self.name)

    self.df_dict['barcodes'] = None
    if self.min_count > 0:
        self.df_dict['barcodes_low_abundance'] = None
def __init__(self, config): DataContainer.__init__(self, config) try: self.timepoint = int(config['timepoint']) except KeyError as key: raise EnrichError("Missing required config value '{key}'".format(key=key), self.name) except ValueError as value: raise EnrichError("Invalid parameter value {value}".format(value=value), self.name) if 'report filtered reads' in config: self.report_filtered = config['report filtered reads'] else: self.report_filtered = False
def calculate(self): """ Reads the forward or reverse FASTQ file (reverse reads are reverse-complemented), performs quality-based filtering, and counts the variants. """ self.df_dict['variants'] = dict() filter_flags = dict() for key in self.filters: filter_flags[key] = False logging.info("Counting variants [{name}]".format(name=self.name)) for fq in read_fastq(self.reads): if self.revcomp_reads: fq.revcomp() for key in filter_flags: filter_flags[key] = False # filter the read based on specified quality settings if self.filters['chastity']: if not fq.is_chaste(): self.filter_stats['chastity'] += 1 filter_flags['chastity'] = True if self.filters['min quality'] > 0: if fq.min_quality() < self.filters['min quality']: self.filter_stats['min quality'] += 1 filter_flags['min quality'] = True if self.filters['avg quality'] > 0: if fq.mean_quality() < self.filters['avg quality']: self.filter_stats['avg quality'] += 1 filter_flags['avg quality'] = True if not any(filter_flags.values()): # passed quality filtering mutations = self.count_variant(fq.sequence) if mutations is None: # read has too many mutations self.filter_stats['max mutations'] += 1 filter_flags['max mutations'] = True if any(filter_flags.values()): self.filter_stats['total'] += 1 if self.report_filtered: self.report_filtered_read(fq, filter_flags) self.df_dict['variants'] = \ pd.DataFrame.from_dict(self.df_dict['variants'], orient="index", dtype="int32") if len(self.df_dict['variants']) == 0: raise EnrichError("Failed to count variants", self.name) self.df_dict['variants'].columns = ['count'] self.df_dict['variants'].sort('count', ascending=False, inplace=True) logging.info("Counted {n} variants ({u} unique) [{name}]".format( n=self.df_dict['variants']['count'].sum(), u=len(self.df_dict['variants'].index), name=self.name)) if self.aligner is not None: logging.info("Aligned {n} variants [{name}]".format( n=self.aligner.calls, name=self.name)) self.aligner_cache = None self.report_filter_stats()
def calculate(self): """ Counts the barcodes using :py:meth:`BarcodeSeqLib.count` and combines them into variant counts using the :py:class:`BarcodeMap`. """ BarcodeSeqLib.calculate(self) # count the barcodes self.df_dict['variants'] = dict() logging.info( "Converting barcodes to variants [{name}]".format(name=self.name)) if self.filter_unmapped: map_mask = self.df_dict['barcodes'].index.isin(self.barcode_map) self.df_dict['barcodes_unmapped'] = self.df_dict['barcodes'][ -map_mask] self.df_dict['barcodes'] = self.df_dict['barcodes'][map_mask] del map_mask logging.info( "Writing counts for {n} unique unmapped barcodes to disk [{name}]" .format(n=len(self.df_dict['barcodes_unmapped']), name=self.name)) self.dump_data(keys=['barcodes_unmapped']) # save memory # count variants associated with the barcodes for bc, count in self.df_dict['barcodes'].iterrows(): count = count['count'] variant = self.barcode_map[bc] mutations = self.count_variant(variant, copies=count) if mutations is None: # variant has too many mutations self.filter_stats['max mutations'] += count self.filter_stats['total'] += count if self.report_filtered: self.report_filtered_variant(variant, count) if bc not in self.barcode_map.bc_variant_strings: self.barcode_map.bc_variant_strings[bc] = FILTERED_VARIANT else: if mutations not in self.barcode_map.variants: self.barcode_map.variants[mutations] = set() self.barcode_map.variants[mutations].update([bc]) self.barcode_map.bc_variant_strings[bc] = mutations self.df_dict['variants'] = \ pd.DataFrame.from_dict(self.df_dict['variants'], orient="index", dtype="int32") if len(self.df_dict['variants']) == 0: raise EnrichError("Failed to count variants", self.name) self.df_dict['variants'].columns = ['count'] self.df_dict['variants'].sort('count', ascending=False, inplace=True) logging.info( "Retained counts for {n} variants ({u} unique) [{name}]".format( n=self.df_dict['variants']['count'].sum(), u=len(self.df_dict['variants'].index), name=self.name)) if self.aligner is not None: logging.info("Aligned {n} variants [{name}]".format( n=self.aligner.calls, name=self.name)) self.aligner_cache = None self.report_filter_stats()
def calculate(self): """ Reads the forward or reverse FASTQ file (reverse reads are reverse-complemented), performs quality-based filtering, and counts the barcodes. """ self.counts['barcodes'] = dict() # flags for verbose output of filtered reads filter_flags = dict() for key in self.filters: filter_flags[key] = False # count all the barcodes for fq in read_fastq(self.reads): fq.trim_length(self.bc_length, start=self.bc_start) if self.revcomp_reads: fq.revcomp() for key in filter_flags: filter_flags[key] = False # filter the barcode based on specified quality settings if self.filters['chastity']: if not fq.is_chaste(): self.filter_stats['chastity'] += 1 filter_flags['chastity'] = True if self.filters['min quality'] > 0: if fq.min_quality() < self.filters['min quality']: self.filter_stats['min quality'] += 1 filter_flags['min quality'] = True if self.filters['avg quality'] > 0: if fq.mean_quality() < self.filters['avg quality']: self.filter_stats['avg quality'] += 1 filter_flags['avg quality'] = True if any(filter_flags.values()): # failed quality filtering self.filter_stats['total'] += 1 if self.verbose: self.report_filtered_read(fq, filter_flags) else: # passed quality filtering try: self.counts['barcodes'][fq.sequence.upper()] += 1 except KeyError: self.counts['barcodes'][fq.sequence.upper()] = 1 self.counts['barcodes'] = \ pd.DataFrame.from_dict(self.counts['barcodes'], orient="index", dtype="int32") if len(self.counts['barcodes']) == 0: raise EnrichError("Failed to count barcodes", self.name) self.counts['barcodes'].columns = ['count'] self.counts['barcodes'] = \ self.counts['barcodes'][self.counts['barcodes']['count'] \ > self.min_count] logging.info("Counted %d barcodes (%d unique) [%s]" % \ (self.counts['barcodes']['count'].sum(), len(self.counts['barcodes'].index), self.name)) if not self.barcodevariant: self.report_filter_stats()
def __init__(self, config):
    DataContainer.__init__(self, config)
    self.conditions = dict()
    self.control = None
    self.use_scores = True

    try:
        for cnd in config['conditions']:
            if not cnd['label'].isalnum():
                raise EnrichError("Alphanumeric label required for condition "
                                  "'%s'" % cnd['label'], self.name)
            for sel_config in cnd['selections']:
                # assign output base if not present
                if 'output directory' not in sel_config:
                    sel_config['output directory'] = self.output_base
            self.conditions[cnd['label']] = [selection.Selection(x)
                                             for x in cnd['selections']]
            if cnd['control']:
                if self.control is None:
                    self.control = self.conditions[cnd['label']]
                else:
                    raise EnrichError("Multiple control conditions", self.name)
    except KeyError as key:
        raise EnrichError("Missing required config value %s" % key, self.name)

    all_selections = list()
    for key in self.conditions:
        all_selections.extend(self.conditions[key])
    for dtype in all_selections[0].df_dict:
        if all(dtype in x.df_dict for x in all_selections):
            self.df_dict[dtype] = True
    if len(self.df_dict.keys()) == 0:
        raise EnrichError("No enrichment data present across all selections",
                          self.name)

    for key in self.conditions:
        if any(len(x.timepoints) == 2 for x in self.conditions[key]):
            self.use_scores = False
def set_output_base(self, dirname):
    """
    Sets the object's base output directory (used for :py:meth:`dump_data`
    and other class-specific methods) to *dirname* and creates the
    directory if it doesn't exist.
    """
    try:
        if not os.path.exists(dirname):
            os.makedirs(dirname)
    except OSError:
        raise EnrichError("Failed to create output directory", self.name)
    self.output_base = dirname
def set_wt(self, sequence, coding=True):
    """
    Set the wild type DNA *sequence*. The *sequence* is translated if
    *coding* is ``True``. The *sequence* may only contain ``ACGT``, but
    may contain whitespace (which will be removed). If *coding*,
    *sequence* must be in-frame.
    """
    sequence = "".join(sequence.split())  # remove whitespace
    if not re.match("^[ACGTacgt]+$", sequence):
        raise EnrichError("WT DNA sequence contains unexpected "
                          "characters", self.name)
    if len(sequence) % 3 != 0 and coding:
        raise EnrichError("WT DNA sequence contains incomplete codons",
                          self.name)

    self.wt_dna = sequence.upper()
    if coding:
        self.wt_protein = ""
        for i in xrange(0, len(self.wt_dna), 3):
            self.wt_protein += codon_table[self.wt_dna[i:i + 3]]
    else:
        self.wt_protein = None
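# A minimal, self-contained sketch of the codon-by-codon translation that
# set_wt() performs when coding=True. The three-entry codon table below is
# an illustrative subset, not the module's full codon_table.
example_codon_table = {'ATG': 'M', 'GGA': 'G', 'TTT': 'F'}

wt_dna = "".join("ATG GGA TTT".split()).upper()  # whitespace removed, as in set_wt
assert len(wt_dna) % 3 == 0                      # in-frame check

wt_protein = ""
for i in range(0, len(wt_dna), 3):
    wt_protein += example_codon_table[wt_dna[i:i + 3]]
print(wt_protein)  # "MGF"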
def __init__(self, config): self.name = "Unnamed" + self.__class__.__name__ self.df_dict = dict() self.df_files = dict() self.filters = None self.filter_stats = None self.output_base = None try: self.name = config['name'] except KeyError as key: raise EnrichError( "Missing required config value {key}".format(key=key), self.name) if 'output directory' in config: self.set_output_base(config['output directory'])
def write_variants(self, fname):
    """
    Write a list of barcodes for each variant to the file *fname*.
    """
    try:
        handle = open(fname, "w")
    except IOError:
        raise EnrichError("Could not open variant barcode map file '%s' "
                          "for writing" % fname, self.name)
    for variant, barcodes in sorted(self.variants.items(),
                                    key=lambda x: x[1]):
        print(variant, ", ".join(barcodes), sep="\t", file=handle)
    handle.close()
    logging.info('Wrote BarcodeMap variants file "%s" [%s]' %
                 (fname, self.name))
def calculate(self): """ Counts the barcodes using :py:meth:`BarcodeSeqLib.count` and combines them into variant counts using the :py:class:`BarcodeMap`. """ BarcodeSeqLib.calculate(self) # count the barcodes self.counts['variants'] = dict() if self.filter_unmapped: map_mask = self.counts['barcodes'].index.isin(self.barcode_map) self.counts['barcodes_unmapped'] = self.counts['barcodes'][ -map_mask] self.counts['barcodes'] = self.counts['barcodes'][map_mask] del map_mask # count variants associated with the barcodes for bc, count in self.counts['barcodes'].iterrows(): count = count['count'] variant = self.barcode_map[bc] mutations = self.count_variant(variant, copies=count) if mutations is None: # variant has too many mutations self.filter_stats['max mutations'] += count self.filter_stats['total'] += count if self.verbose: self.report_filtered_variant(variant, count) else: if mutations not in self.barcode_map.variants: self.barcode_map.variants[mutations] = list() if bc not in self.barcode_map.variants[mutations]: self.barcode_map.variants[mutations].append(bc) self.counts['variants'] = \ pd.DataFrame.from_dict(self.counts['variants'], orient="index", dtype="int32") if len(self.counts['variants']) == 0: raise EnrichError("Failed to count variants", self.name) self.counts['variants'].columns = ['count'] logging.info("Counted %d variants (%d unique) [%s]" % \ (self.counts['variants']['count'].sum(), len(self.counts['variants'].index), self.name)) if self.aligner is not None: logging.info("Aligned %d variants [%s]" % (self.aligner.calls, self.name)) self.report_filter_stats()
def write_data(self, subdirectory=None, keys=None):
    """
    Save the :py:class:`pandas.DataFrame` objects as tab-separated files
    with the same name as the object. The optional *keys* parameter is a
    list of types of data to be saved (variant, barcode, etc.). By
    default, all data are saved.

    Returns a dictionary with *keys* as the keys and corresponding
    filenames for the ``.tsv`` files as the values. This dictionary is
    required by :py:meth:`dump_data`.
    """
    fname_dict = dict()

    if subdirectory is not None:
        directory = os.path.join(self.output_base,
                                 fix_filename(subdirectory),
                                 fix_filename(self.name))
    else:
        directory = os.path.join(self.output_base, fix_filename(self.name))

    if keys is None:
        keys = self.df_dict.keys()
    keys = [k for k in keys if self.df_dict[k] is not None]
    for key in keys:
        try:
            if not os.path.exists(directory):
                os.makedirs(directory)
        except OSError:
            raise EnrichError("Failed to create output directory", self.name)
        fname = os.path.join(directory, fix_filename(key + ".tsv"))
        self.df_dict[key].to_csv(fname, sep="\t", na_rep="NaN",
                                 float_format="%.4g",
                                 index_label="sequence")
        fname_dict[key] = fname

    logging.info("Successfully wrote data frames ({keys}) [{name}]".format(
                 name=self.name, keys=", ".join(keys)))
    return fname_dict
def __init__(self, config):
    DataContainer.__init__(self, config)
    self.libraries = dict()
    self.timepoints = list()

    try:
        if 'barcodes' in config:
            if 'map file' in config['barcodes']:
                self.barcode_map = BarcodeMap(config['barcodes']['map file'])
            else:
                self.barcode_map = None
        else:
            self.barcode_map = None

        libnames = list()
        for lib in config['libraries']:
            if 'output directory' not in lib:
                lib['output directory'] = self.output_base
            libtype = seqlib_type(lib)
            if libtype is None:
                raise EnrichError("Unrecognized SeqLib config", self.name)
            elif libtype == "BarcodeVariantSeqLib":
                new = BarcodeVariantSeqLib(lib, barcode_map=self.barcode_map)
            else:
                new = globals()[libtype](lib)
            if new.output_base is None:
                new.set_output_base(self.output_base)
            if new.timepoint not in self.libraries:
                self.libraries[new.timepoint] = list()
            self.libraries[new.timepoint].append(new)
            libnames.append(new.name)
        self.timepoints = sorted(self.libraries.keys())

        if len(set(libnames)) != len(libnames):
            raise EnrichError("Non-unique library names", self.name)

        self.set_filters(config['filters'], {'min count': 0,
                                             'min input count': 0,
                                             'min rsquared': 0.0,
                                             'max barcode variation': None})

        if 'carryover correction' in config:
            if config['carryover correction']['method'] == "nonsense":
                self.ns_carryover_fn = nonsense_ns_carryover_apply_fn
                self.ns_carryover_kwargs = {'position':
                        int(config['carryover correction']['position'])}
            # add additional methods here using "elif" blocks
            else:
                raise EnrichError("Unrecognized nonspecific carryover correction",
                                  self.name)
        else:
            self.ns_carryover_fn = None
            self.ns_carryover_kwargs = None
    except KeyError as key:
        raise EnrichError("Missing required config value %s" % key, self.name)
    except ValueError as value:
        raise EnrichError("Invalid parameter value %s" % value, self.name)

    if len(self.libraries.keys()) < 2:
        raise EnrichError("Insufficient number of timepoints", self.name)
    if 0 not in self.timepoints:
        raise EnrichError("Missing timepoint 0", self.name)
    if self.timepoints[0] != 0:
        raise EnrichError("Invalid negative timepoint", self.name)

    # identify what kind of counts data is present in all timepoints
    dtype_counts = list()
    for tp in self.timepoints:
        for lib in self.libraries[tp]:
            dtype_counts.extend(lib.counts.keys())
    dtype_counts = Counter(dtype_counts)
    for dtype in dtype_counts:
        if dtype_counts[dtype] == len(config['libraries']):
            self.df_dict[dtype] = True
    if 'barcodes_unmapped' in self.df_dict.keys():
        # special case for BarcodeVariantSeqLib
        del self.df_dict['barcodes_unmapped']
    if len(self.df_dict.keys()) == 0:
        raise EnrichError("No count data present across all timepoints",
                          self.name)

    try:
        if 'correction' in config:
            if config['correction']['method'] == "stop":
                if not self.libraries[0].is_coding():
                    raise EnrichError("Invalid correction method for "
                                      "noncoding sequences", self.name)
                else:
                    config['correction']['length percentile']  # must exist
                    self.correction = config['correction']
        else:
            self.correction = None
    except KeyError as key:
        raise EnrichError("Missing required config value %s" % key, self.name)
def calculate(self): """ Reads the forward or reverse FASTQ file (reverse reads are reverse-complemented), performs quality-based filtering, and counts the barcodes. """ self.df_dict['barcodes'] = dict() filter_flags = dict() for key in self.filters: filter_flags[key] = False # count all the barcodes logging.info("Counting barcodes [{name}]".format(name=self.name)) for fq in read_fastq(self.reads): fq.trim_length(self.bc_length, start=self.bc_start) if self.revcomp_reads: fq.revcomp() for key in filter_flags: filter_flags[key] = False # filter the barcode based on specified quality settings if self.filters['chastity']: if not fq.is_chaste(): self.filter_stats['chastity'] += 1 filter_flags['chastity'] = True if self.filters['min quality'] > 0: if fq.min_quality() < self.filters['min quality']: self.filter_stats['min quality'] += 1 filter_flags['min quality'] = True if self.filters['avg quality'] > 0: if fq.mean_quality() < self.filters['avg quality']: self.filter_stats['avg quality'] += 1 filter_flags['avg quality'] = True if any(filter_flags.values()): # failed quality filtering self.filter_stats['total'] += 1 if self.report_filtered: self.report_filtered_read(fq, filter_flags) else: # passed quality filtering try: self.df_dict['barcodes'][fq.sequence.upper()] += 1 except KeyError: self.df_dict['barcodes'][fq.sequence.upper()] = 1 self.df_dict['barcodes'] = \ pd.DataFrame.from_dict(self.df_dict['barcodes'], orient="index", dtype="int32") if len(self.df_dict['barcodes']) == 0: raise EnrichError("Failed to count barcodes", self.name) self.df_dict['barcodes'].columns = ['count'] self.df_dict['barcodes'].sort('count', ascending=False, inplace=True) if 'barcodes_low_abundance' in self.df_dict: # min count is set self.df_dict['barcodes_low_abundance'] = self.df_dict['barcodes'][ self.df_dict['barcodes']['count'] < self.min_count] logging.info( "Writing counts for {n} unique low-abundance barcodes to disk [{name}]" .format(n=len(self.df_dict['barcodes_low_abundance']), name=self.name)) self.dump_data(keys=['barcodes_low_abundance']) self.df_dict['barcodes'] = self.df_dict['barcodes'][ self.df_dict['barcodes']['count'] >= self.min_count] logging.info( "Retained counts for {n} barcodes ({u} unique) [{name}]".format( n=self.df_dict['barcodes']['count'].sum(), u=len(self.df_dict['barcodes'].index), name=self.name)) if not self.barcodevariant: self.report_filter_stats()
action="store_true", default=False, dest="report_filtered", help="output filtered reads to log file") parser.add_argument("--no-plots", help="don't make plots", dest="plots", action="store_false", default=True) args = parser.parse_args() if args.report_filtered: log_level = logging.DEBUG if args.log is None: raise EnrichError( "Cannot report filtered reads without a log file", _DRIVER_NAME) else: log_level = logging.INFO if args.log: logging.basicConfig(filename=args.log, level=log_level) try: config = json.load(open(args.config, "U")) except IOError: raise EnrichError('Failed to open "%s"' % args.config, _DRIVER_NAME) except ValueError: raise EnrichError("Improperly formatted .json file", _DRIVER_NAME) if config_check.is_experiment(config):
def calculate(self): """ Reads the forward and reverse reads, merges them, performs quality-based filtering, and counts the variants. """ self.counts['variants'] = dict() # flags for verbose output of filtered reads filter_flags = dict() for key in self.filters: filter_flags[key] = False for fwd, rev in read_fastq_multi([self.forward, self.reverse]): for key in filter_flags: filter_flags[key] = False # filter the read based on specified quality settings if self.filters['chastity']: if not fwd.is_chaste(): filter_flags['chastity'] = True if self.verbose: self.report_filtered_read(fwd, filter_flags) if not rev.is_chaste(): filter_flags['chastity'] = True if self.verbose: self.report_filtered_read(rev, filter_flags) if filter_flags['chastity']: self.filter_stats['chastity'] += 1 self.filter_stats['total'] += 1 continue merge = self.merge_reads(fwd, rev) if merge is None: # merge failed self.filter_stats['merge failure'] += 1 self.filter_stats['total'] += 1 filter_flags['merge failure'] = True if self.verbose: self.report_filtered_read(fwd, filter_flags) self.report_filtered_read(rev, filter_flags) else: if self.filters['remove unresolvable']: if 'X' in merge.sequence: self.filter_stats['remove unresolvable'] += 1 filter_flags['remove unresolvable'] = True if self.filters['min quality'] > 0: if merge.min_quality() < self.filters['min quality']: self.filter_stats['min quality'] += 1 filter_flags['min quality'] = True if self.filters['avg quality'] > 0: if merge.mean_quality() < self.filters['avg quality']: self.filter_stats['avg quality'] += 1 filter_flags['avg quality'] = True if not any(filter_flags.values()): # passed quality filtering mutations = self.count_variant(merge.sequence) if mutations is None: # merge read has too many mutations self.filter_stats['max mutations'] += 1 filter_flags['max mutations'] = True if any(filter_flags.values()): self.filter_stats['total'] += 1 if self.verbose: self.report_filtered_read(merge, filter_flags) self.counts['variants'] = \ pd.DataFrame.from_dict(self.counts['variants'], orient="index", dtype="int32") if len(self.counts['variants']) == 0: raise EnrichError("Failed to count variants", self.name) self.counts['variants'].columns = ['count'] logging.info("Counted %d variants (%d unique) [%s]" % \ (self.counts['variants']['count'].sum(), len(self.counts['variants'].index), self.name)) if self.aligner is not None: logging.info("Aligned %d variants [%s]" % (self.aligner.calls, self.name)) self.report_filter_stats()
def count_variant(self, variant_dna, copies=1, include_indels=True):
    """
    Identifies mutations and counts the *variant_dna* sequence.
    The algorithm attempts to call variants by comparing base-by-base.
    If the *variant_dna* and wild type DNA are different lengths, or if
    there is an excess of mismatches (indicating a possible indel), local
    alignment is performed using :py:meth:`align_variant` if this option
    has been selected in the configuration.

    Each variant is stored as a comma-delimited string of mutations in
    HGVS format. Returns the variant string, or ``WILD_TYPE_VARIANT`` if
    the variant matches wild type. Returns ``None`` if the variant was
    discarded due to excess mismatches.
    """
    if not re.match("^[ACGTNXacgtnx]+$", variant_dna):
        raise EnrichError("Variant DNA sequence contains unexpected "
                          "characters", self.name)
    variant_dna = variant_dna.upper()

    if len(variant_dna) != len(self.wt_dna):
        if self.aligner is not None:
            mutations = self.align_variant(variant_dna)
        else:
            return None
    else:
        mutations = list()
        for i in xrange(len(variant_dna)):
            if variant_dna[i] != self.wt_dna[i]:
                mutations.append((i, "{pre}>{post}".format(
                                  pre=self.wt_dna[i], post=variant_dna[i])))
                if len(mutations) > self.filters['max mutations']:
                    if self.aligner is not None:
                        mutations = self.align_variant(variant_dna)
                        if len(mutations) > self.filters['max mutations']:
                            # too many mutations post-alignment
                            return None
                        else:
                            # stop looping over this variant
                            break
                    else:
                        # too many mutations and not using aligner
                        return None

    mutation_strings = list()
    if self.is_coding():
        variant_protein = ""
        for i in xrange(0, len(variant_dna), 3):
            try:
                variant_protein += codon_table[variant_dna[i:i + 3]]
            except KeyError:  # garbage codon due to indel
                variant_protein += '?'

        for pos, change in mutations:
            ref_dna_pos = pos + self.reference_offset + 1
            ref_pro_pos = (pos + self.reference_offset) / 3 + 1
            mut = "c.{pos}{change}".format(pos=ref_dna_pos, change=change)
            if has_indel(change):
                mut += " (p.{pre}{pos}fs)".format(
                       pre=aa_codes[self.wt_protein[pos / 3]], pos=ref_pro_pos)
            elif variant_protein[pos / 3] == self.wt_protein[pos / 3]:
                mut += " (p.=)"
            else:
                mut += " (p.{pre}{pos}{post})".format(
                       pre=aa_codes[self.wt_protein[pos / 3]],
                       pos=ref_pro_pos,
                       post=aa_codes[variant_protein[pos / 3]])
            mutation_strings.append(mut)
    else:
        for pos, change in mutations:
            ref_dna_pos = pos + self.reference_offset + 1
            mut = "n.{pos}{change}".format(pos=ref_dna_pos, change=change)
            mutation_strings.append(mut)

    if len(mutation_strings) > 0:
        variant_string = ', '.join(mutation_strings)
    else:
        variant_string = WILD_TYPE_VARIANT
    try:
        self.df_dict['variants'][variant_string] += copies
    except KeyError:
        self.df_dict['variants'][variant_string] = copies
    return variant_string
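# A minimal, self-contained sketch of the base-by-base comparison that
# count_variant() performs for equal-length sequences, using the same
# "c.<position><ref>><alt>" layout for each mismatch (reference offset
# assumed to be 0; protein-level annotation omitted). The sequences are
# illustrative.
wt_dna = "ATGGGATTT"
variant_dna = "ATGGGGTTT"  # single mismatch at 0-based position 5

mutations = []
for i in range(len(variant_dna)):
    if variant_dna[i] != wt_dna[i]:
        mutations.append("c.{pos}{pre}>{post}".format(
            pos=i + 1, pre=wt_dna[i], post=variant_dna[i]))

variant_string = ', '.join(mutations) if mutations else "wild type"
print(variant_string)  # "c.6A>G"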