Ejemplo n.º 1
0
    def __init__(self, config, barcode_map=None):
        """
        Initialize the library from *config* by running both the
        ``VariantSeqLib`` and ``BarcodeSeqLib`` initializers.

        The barcode map is built from ``config['barcodes']['map file']``
        when that entry exists; otherwise the *barcode_map* argument is
        used. An ``EnrichError`` is raised if a required config entry is
        missing or if no barcode map is available from either source.
        """
        VariantSeqLib.__init__(self, config)
        BarcodeSeqLib.__init__(self, config, barcodevariant=True)
        try:
            barcode_config = config['barcodes']
            self.barcode_map = (BarcodeMap(barcode_config['map file'])
                                if 'map file' in barcode_config else None)

            user_filters = config['filters']
            default_filters = {
                'min quality': 0,
                'avg quality': 0,
                'chastity': False,
                'max mutations': len(self.wt_dna)
            }
            self.set_filters(user_filters, default_filters)
        except KeyError as key:
            raise EnrichError("Missing required config value %s" % key,
                              self.name)

        if self.barcode_map is None:  # nothing in the local config
            if barcode_map is None:  # nothing passed by the caller either
                raise EnrichError("Barcode map not specified", self.name)
            self.barcode_map = barcode_map

        self.counts['barcodes_unmapped'] = None
        self.filter_unmapped = True
Ejemplo n.º 2
0
    def __init__(self, config):
        """
        Initialize a paired-read library from *config*.

        Reads the forward/reverse FASTQ paths and the overlap settings,
        installs the read filters and then checks both FASTQ files with
        ``check_fastq``. The ``'merge failure'`` filter is always enabled
        and may not appear in the user's filter configuration.

        Raises ``EnrichError`` for missing config entries, non-integer
        overlap values, a user-supplied ``'merge failure'`` filter, or an
        ``IOError`` reported while checking the FASTQ files.
        """
        VariantSeqLib.__init__(self, config)
        try:
            fastq_config = config['fastq']
            self.forward = fastq_config['forward']
            self.reverse = fastq_config['reverse']

            overlap_config = config['overlap']
            self.fwd_start = int(overlap_config['forward start'])
            self.rev_start = int(overlap_config['reverse start'])
            self.overlap_length = int(overlap_config['length'])
            self.trim = overlap_config['overlap only']
            self.max_overlap_mismatches = int(
                overlap_config['max mismatches'])

            user_filters = config['filters']
            if 'merge failure' in user_filters:
                raise EnrichError("'merge failure' is not user-configurable",
                                  self.name)
            self.set_filters(user_filters, {
                'remove unresolvable': False,
                'min quality': 0,
                'avg quality': 0,
                'max mutations': len(self.wt_dna),
                'chastity': False,
                'merge failure': True
            })
        except KeyError as key:
            raise EnrichError("Missing required config value %s" % key,
                              self.name)
        except ValueError as value:
            raise EnrichError("Invalid parameter value %s" % value, self.name)

        try:
            for fastq_path in (self.forward, self.reverse):
                check_fastq(fastq_path)
        except IOError as fqerr:
            raise EnrichError("FASTQ file error: %s" % fqerr, self.name)
Ejemplo n.º 3
0
 def dump_data(self):
     """
     Write every entry of ``df_dict`` to a tab-separated file inside the
     ``dump`` subdirectory of the output base, then replace the entry
     with ``None`` to free memory. The path of each written file is
     recorded in ``df_file`` under the same key.

     Raises ``EnrichError`` if the dump directory path cannot be built
     or the directory cannot be created.
     """
     for df_name in list(self.df_dict.keys()):
         try:
             dump_dir = os.path.join(self.output_base, "dump",
                                     fix_filename(self.name))
         except AttributeError:
             raise EnrichError("No output directory specified for object",
                               self.name)

         try:
             if not os.path.exists(dump_dir):
                 os.makedirs(dump_dir)
         except OSError:
             raise EnrichError("Failed to create dump directory", self.name)

         tsv_path = os.path.join(dump_dir, fix_filename(df_name + ".tsv"))
         frame = self.df_dict[df_name]
         frame.to_csv(tsv_path,
                      sep="\t",
                      na_rep="NaN",
                      float_format="%.4g",
                      index_label="sequence")

         # remember where the data went, then drop the in-memory copy
         self.df_file[df_name] = tsv_path
         self.df_dict[df_name] = None
Ejemplo n.º 4
0
    def __init__(self, config):
        """
        Initialize the sequencing library from *config*.

        ``config['timepoint']`` is required and must convert to ``int``.
        An ``Aligner`` is created only when ``'align variants'`` is present
        and truthy. ``'report filtered reads'`` falls back to
        ``self.verbose`` when it is not configured.

        Raises ``EnrichError`` if the timepoint is missing or invalid.
        """
        DataContainer.__init__(self, config)

        try:
            self.timepoint = int(config['timepoint'])
        except KeyError as key:
            raise EnrichError("Missing required config value '%s'" % key,
                              self.name)
        except ValueError as value:
            raise EnrichError("Invalid parameter value %s" % value, self.name)

        wants_alignment = ('align variants' in config
                           and config['align variants'])
        self.aligner = Aligner() if wants_alignment else None

        self.report_filtered_reads = (
            config['report filtered reads']
            if 'report filtered reads' in config else self.verbose)

        # containers filled in later by the counting code
        self.counts = {}  # pandas dataframes
        self.counts_file = {}  # paths to saved counts
        self.filters = None  # dictionary
        self.filter_stats = None  # dictionary
Ejemplo n.º 5
0
 def report_filter_stats(self):
     """
     Write the filter statistics to ``filter_stats.txt`` in the object's
     output directory (``output_base`` joined with the object's name).

     One tab-separated line is written per filter, ordered by count from
     largest to smallest and labelled with the corresponding entry of
     ``DataContainer._filter_messages``. The ``'total'`` count is always
     written last.

     Raises ``EnrichError`` if the output directory path cannot be built
     or the directory cannot be created.
     """
     try:
         output_dir = os.path.join(self.output_base,
                                   fix_filename(self.name))
     except AttributeError:
         raise EnrichError("Invalid output directory specified for object",
                           self.name)
     try:
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
     except OSError:
         raise EnrichError("Failed to create output directory", self.name)

     with open(os.path.join(output_dir, "filter_stats.txt"), "w") as handle:
         for key in sorted(self.filter_stats,
                           key=self.filter_stats.__getitem__,
                           reverse=True):
             if key != 'total':  # the total is reported on its own line
                 print(DataContainer._filter_messages[key],
                       self.filter_stats[key],
                       sep="\t",
                       file=handle)
         print('total', self.filter_stats['total'], sep="\t", file=handle)

     # log only once the file has been closed
     logging.info(
         "Wrote filtering statistics [{name}]".format(name=self.name))
Ejemplo n.º 6
0
    def write_data(self, directory=None, keys=None):
        """
        Save the :py:class:`pandas.DataFrame` objects as tab-separated files
        in a subdirectory of *directory* named after the object. If
        *directory* is ``None``, the object's default output directory
        (``output_base``) is used instead.

        The optional *keys* parameter is a list of types of counts to be
        saved. By default, all counts are saved.

        Raises ``EnrichError`` if the output directory path cannot be built
        or the directory cannot be created.
        """
        if keys is None:
            keys = self.df_dict.keys()
        for key in keys:
            try:
                # honour the caller's directory; fall back to the default
                base_dir = (self.output_base
                            if directory is None else directory)
                output_dir = os.path.join(base_dir, fix_filename(self.name))
            except AttributeError:
                raise EnrichError(
                    "Invalid output directory specified for object", self.name)
            try:
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
            except OSError:
                raise EnrichError("Failed to create output directory",
                                  self.name)
            fname = os.path.join(output_dir, fix_filename(key + ".tsv"))
            self.df_dict[key].to_csv(fname,
                                     sep="\t",
                                     na_rep="NaN",
                                     float_format="%.4g",
                                     index_label="sequence")
Ejemplo n.º 7
0
    def __init__(self, config):
        """
        Initialize a single-FASTQ variant library from *config*.

        Exactly one of ``config['fastq']['forward']`` and
        ``config['fastq']['reverse']`` must be given; reads from a reverse
        file are flagged for reverse-complementing. The read filters are
        installed and the FASTQ file is checked with ``check_fastq``.

        Raises ``EnrichError`` if both or neither file is specified, if a
        required config entry is missing, or if the FASTQ check reports an
        ``IOError``.
        """
        VariantSeqLib.__init__(self, config)
        try:
            fastq_config = config['fastq']
            has_forward = 'forward' in fastq_config
            has_reverse = 'reverse' in fastq_config

            if has_forward and has_reverse:
                raise EnrichError("Multiple FASTQ files specified", self.name)
            if not (has_forward or has_reverse):
                raise KeyError("'forward' or 'reverse'")

            direction = 'reverse' if has_reverse else 'forward'
            self.reads = fastq_config[direction]
            self.revcomp_reads = has_reverse

            self.set_filters(
                config['filters'], {
                    'min quality': 0,
                    'avg quality': 0,
                    'chastity': False,
                    'max mutations': len(self.wt_dna)
                })
        except KeyError as key:
            raise EnrichError("missing required config value: %s" % key,
                              self.name)

        try:
            check_fastq(self.reads)
        except IOError as fqerr:
            raise EnrichError("FASTQ file error: %s" % fqerr, self.name)
Ejemplo n.º 8
0
    def __init__(self, config, parent=True):
        """
        Initialize the variant-related state from *config*.

        ``SeqLib.__init__`` is run first unless *parent* is false. The wild
        type sequence is set from ``config['wild type']`` (``'sequence'``
        and ``'coding'`` are required). An ``Aligner`` and its cache are
        created only when ``'align variants'`` is present and truthy. The
        optional ``'reference offset'`` entry must convert to ``int`` and
        defaults to 0.

        Raises ``EnrichError`` if a required config entry is missing or
        the reference offset is not a valid integer.
        """
        if parent:
            SeqLib.__init__(self, config)
        self.wt_dna = None
        self.wt_protein = None
        self.aligner = None
        self.aligner_cache = None

        try:
            self.set_wt(config['wild type']['sequence'],
                        coding=config['wild type']['coding'])
            if 'align variants' in config:
                if config['align variants']:
                    self.aligner = Aligner()
                    self.aligner_cache = dict()

        except KeyError as key:
            # the placeholder is named, so the value must be passed by
            # keyword; a positional argument would raise a second KeyError
            raise EnrichError(
                "Missing required config value '{key}'".format(key=key),
                self.name)

        if 'reference offset' in config['wild type']:
            try:
                self.reference_offset = int(
                    config['wild type']['reference offset'])
            except ValueError:
                raise EnrichError("Invalid reference offset value", self.name)
        else:
            self.reference_offset = 0

        self.df_dict['variants'] = None
Ejemplo n.º 9
0
    def __init__(self, config):
        """
        Build the experiment from *config*.

        Each entry of ``config['conditions']`` needs a unique alphanumeric
        ``'label'`` and a list of ``'selections'``; a ``Selection`` object
        is created for every selection config, which inherits this object's
        output base when it has no ``'output directory'`` of its own. At
        most one condition may be flagged as ``'control'``.

        Only data types present in every selection are registered in
        ``df_dict``. Scores are used only if every selection uses them,
        and the ``'normalize wt'`` setting is pushed down to all
        selections.

        Raises ``EnrichError`` for invalid or duplicate labels, multiple
        controls, missing config entries, or when the selections share no
        data type.
        """
        DataContainer.__init__(self, config)
        self.conditions = dict()
        self.control = None
        self.use_scores = True
        self.normalize_wt = False

        try:
            if 'normalize wt' in config and config['normalize wt'] is True:
                self.normalize_wt = True

            for cnd in config['conditions']:
                label = cnd['label']
                if not label.isalnum():
                    raise EnrichError(
                        "Alphanumeric label required for condition '{label}'".
                        format(label=label), self.name)

                # assign output base if not present
                for sel_config in cnd['selections']:
                    if 'output directory' not in sel_config:
                        sel_config['output directory'] = self.output_base

                if label in self.conditions:
                    raise EnrichError(
                        "Non-unique condition label '{label}'".format(
                            label=label), self.name)
                self.conditions[label] = [
                    selection.Selection(x) for x in cnd['selections']
                ]

                if 'control' in cnd and cnd['control']:
                    if self.control is not None:
                        raise EnrichError("Multiple control conditions",
                                          self.name)
                    self.control = self.conditions[label]
        except KeyError as key:
            raise EnrichError(
                "Missing required config value {key}".format(key=key),
                self.name)

        all_selections = [
            sel for label in self.conditions
            for sel in self.conditions[label]
        ]

        # keep only the data types that every selection provides
        for dtype in all_selections[0].df_dict:
            if all(dtype in sel.df_dict for sel in all_selections):
                self.df_dict[dtype] = True
        if len(self.df_dict) == 0:
            raise EnrichError(
                "No enrichment data present across all selections", self.name)

        # ensure consistency for score usage
        if any(not sel.use_scores for sel in all_selections):
            self.use_scores = False

        # ensure consistency for wild type normalization
        for sel in all_selections:
            sel.normalize_wt = self.normalize_wt
Ejemplo n.º 10
0
    def __init__(self, mapfile):
        """
        Load a barcode-to-variant map from the file *mapfile*.

        Files ending in ``.bz2`` or ``.gz`` (case-insensitive) are opened
        with the matching decompressor; anything else is opened as plain
        text. Each non-empty line that does not start with ``#`` must hold
        exactly two whitespace-separated fields: a barcode (``ACGT`` only)
        and a variant sequence (``ACGTN`` only). Both are stored in upper
        case.

        Raises ``EnrichError`` if the file cannot be opened, a line is
        malformed, a sequence contains unexpected characters, or a barcode
        is assigned to two different variants.
        """
        self.name = "barcodemap_{fname}".format(
            fname=os.path.basename(mapfile))
        self.filename = mapfile
        self.variants = dict()
        self.bc_variant_strings = dict()

        try:
            ext = os.path.splitext(mapfile)[-1].lower()
            # compare for equality: ``ext in (".bz2")`` is a substring test
            # on a plain string, which also matches an empty extension
            if ext == ".bz2":
                handle = bz2.BZ2File(mapfile, "rU")
            elif ext == ".gz":
                handle = gzip.GzipFile(mapfile, "rU")
            else:
                handle = open(mapfile, "rU")
        except IOError:
            raise EnrichError(
                "Could not open barcode map file '{fname}'".format(
                    fname=mapfile), self.name)

        # close the file even when a malformed line aborts the parse
        try:
            for line in handle:
                # skip comments and whitespace-only lines
                if len(line.strip()) == 0 or line[0] == '#':
                    continue

                try:
                    barcode, variant = line.strip().split()
                except ValueError:
                    raise EnrichError(
                        "Unexpected barcode-variant line format", self.name)

                if not re.match("^[ACGTacgt]+$", barcode):
                    raise EnrichError(
                        "Barcode DNA sequence contains unexpected "
                        "characters", self.name)
                if not re.match("^[ACGTNacgtn]+$", variant):
                    raise EnrichError(
                        "Variant DNA sequence contains unexpected "
                        "characters", self.name)

                barcode = barcode.upper()
                variant = variant.upper()
                if barcode in self:
                    if self[barcode] != variant:
                        raise EnrichError(
                            "Barcode '{bc}' assigned to multiple unique "
                            "variants".format(bc=barcode), self.name)
                else:
                    self[barcode] = variant
        finally:
            handle.close()
Ejemplo n.º 11
0
    def __init__(self, mapfile):
        """
        Load a barcode-to-variant map from the text file *mapfile*.

        Each non-empty line that does not start with ``#`` must hold
        exactly two whitespace-separated fields: a barcode (``ACGT`` only)
        and a variant sequence (``ACGTN`` only). Both are stored in upper
        case. After loading, ``self.variants`` maps each variant sequence
        to the list of barcodes assigned to it.

        Raises ``EnrichError`` if the file cannot be opened, a line is
        malformed, a sequence contains unexpected characters, or a barcode
        is assigned to two different variants.
        """
        self.name = "mapfile_%s" % mapfile
        try:
            handle = open(mapfile, "U")
        except IOError:
            raise EnrichError("Could not open barcode map file '%s'" \
                    % mapfile, self.name)

        self.filename = mapfile
        # close the file even when a malformed line aborts the parse
        try:
            for line in handle:
                # skip comments and whitespace-only lines
                if len(line.strip()) == 0 or line[0] == '#':
                    continue

                try:
                    barcode, variant = line.strip().split()
                except ValueError:
                    raise EnrichError(
                        "Unexpected barcode-variant line format", self.name)

                if not re.match("^[ACGTacgt]+$", barcode):
                    raise EnrichError(
                        "Barcode DNA sequence contains unexpected "
                        "characters", self.name)
                if not re.match("^[ACGTNacgtn]+$", variant):
                    raise EnrichError(
                        "Variant DNA sequence contains unexpected "
                        "characters", self.name)

                barcode = barcode.upper()
                variant = variant.upper()
                if barcode in self:
                    if self[barcode] != variant:
                        raise EnrichError(
                            "Barcode '%s' assigned to multiple "
                            "unique variants" % barcode, self.name)
                else:
                    self[barcode] = variant
        finally:
            handle.close()

        # build the variants dictionary (variant -> list of barcodes)
        self.variants = dict()
        for bc in self.keys():
            self.variants.setdefault(self[bc], list()).append(bc)

        logging.info("Assigned %d barcodes to %d variants [%s]" % \
                     (len(self.keys()), len(self.variants.keys()), self.name))
Ejemplo n.º 12
0
    def __init__(self, config, barcodevariant=False):
        """
        Initialize a barcode-counting library from *config*.

        ``SeqLib.__init__`` is skipped when *barcodevariant* is true.
        Exactly one of ``config['fastq']['forward']`` and
        ``config['fastq']['reverse']`` must be given. The optional
        ``'start'`` and ``'length'`` entries of ``config['fastq']`` and the
        optional ``'min count'`` entry of ``config['barcodes']`` default to
        1, 2147483647 and 0 respectively.

        Raises ``EnrichError`` if both or neither FASTQ file is specified,
        if a required config entry is missing, or if the FASTQ check
        reports an ``IOError``.
        """
        self.barcodevariant = barcodevariant
        if not self.barcodevariant:
            SeqLib.__init__(self, config)
        try:
            fastq_config = config['fastq']
            has_forward = 'forward' in fastq_config
            has_reverse = 'reverse' in fastq_config

            if has_forward and has_reverse:
                raise EnrichError("Multiple FASTQ files specified", self.name)
            if not (has_forward or has_reverse):
                raise KeyError("'forward' or 'reverse'")

            direction = 'reverse' if has_reverse else 'forward'
            self.reads = fastq_config[direction]
            self.revcomp_reads = has_reverse

            self.bc_start = (fastq_config['start']
                             if 'start' in fastq_config else 1)
            # default is longer than any read... for now
            self.bc_length = (fastq_config['length']
                              if 'length' in fastq_config else 2147483647)

            barcode_config = config['barcodes']
            self.min_count = (barcode_config['min count']
                              if 'min count' in barcode_config else 0)

            self.set_filters(config['filters'], {
                'min quality': 0,
                'avg quality': 0,
                'chastity': False
            })
        except KeyError as key:
            raise EnrichError(
                "Missing required config value {key}".format(key=key),
                self.name)

        try:
            check_fastq(self.reads)
        except IOError as fqerr:
            raise EnrichError("FASTQ file error: {error}".format(error=fqerr),
                              self.name)

        self.df_dict['barcodes'] = None
        if self.min_count > 0:
            self.df_dict['barcodes_low_abundance'] = None
Ejemplo n.º 13
0
    def __init__(self, config):
        """
        Initialize the sequencing library from *config*.

        ``config['timepoint']`` is required and must convert to ``int``.
        ``report_filtered`` is taken from ``'report filtered reads'`` and
        is ``False`` when that entry is absent.

        Raises ``EnrichError`` if the timepoint is missing or invalid.
        """
        DataContainer.__init__(self, config)

        try:
            self.timepoint = int(config['timepoint'])
        except KeyError as key:
            message = "Missing required config value '{key}'".format(key=key)
            raise EnrichError(message, self.name)
        except ValueError as value:
            message = "Invalid parameter value {value}".format(value=value)
            raise EnrichError(message, self.name)

        self.report_filtered = (config['report filtered reads']
                                if 'report filtered reads' in config
                                else False)
Ejemplo n.º 14
0
    def calculate(self):
        """
        Read the FASTQ file in ``self.reads`` (reverse-complementing each
        read when ``revcomp_reads`` is set), apply the configured quality
        filters and count the variants of the reads that pass.

        The counts end up in ``df_dict['variants']`` as a data frame with a
        single ``count`` column sorted in descending order. Filter
        statistics are updated along the way and written out at the end.

        Raises ``EnrichError`` if no variants were counted.
        """
        self.df_dict['variants'] = dict()

        filter_flags = dict.fromkeys(self.filters, False)

        logging.info("Counting variants [{name}]".format(name=self.name))
        for fq in read_fastq(self.reads):
            if self.revcomp_reads:
                fq.revcomp()

            # start every read with a clean set of flags
            for flag in filter_flags:
                filter_flags[flag] = False

            # filter the read based on specified quality settings
            if self.filters['chastity'] and not fq.is_chaste():
                self.filter_stats['chastity'] += 1
                filter_flags['chastity'] = True
            if (self.filters['min quality'] > 0
                    and fq.min_quality() < self.filters['min quality']):
                self.filter_stats['min quality'] += 1
                filter_flags['min quality'] = True
            if (self.filters['avg quality'] > 0
                    and fq.mean_quality() < self.filters['avg quality']):
                self.filter_stats['avg quality'] += 1
                filter_flags['avg quality'] = True

            if not any(filter_flags.values()):  # passed quality filtering
                mutations = self.count_variant(fq.sequence)
                if mutations is None:  # read has too many mutations
                    self.filter_stats['max mutations'] += 1
                    filter_flags['max mutations'] = True

            if any(filter_flags.values()):
                self.filter_stats['total'] += 1
                if self.report_filtered:
                    self.report_filtered_read(fq, filter_flags)

        variant_counts = pd.DataFrame.from_dict(self.df_dict['variants'],
                                                orient="index",
                                                dtype="int32")
        self.df_dict['variants'] = variant_counts
        if len(variant_counts) == 0:
            raise EnrichError("Failed to count variants", self.name)
        variant_counts.columns = ['count']
        variant_counts.sort('count', ascending=False, inplace=True)

        logging.info("Counted {n} variants ({u} unique) [{name}]".format(
            n=variant_counts['count'].sum(),
            u=len(variant_counts.index),
            name=self.name))
        if self.aligner is not None:
            logging.info("Aligned {n} variants [{name}]".format(
                n=self.aligner.calls, name=self.name))
            self.aligner_cache = None
        self.report_filter_stats()
Ejemplo n.º 15
0
    def calculate(self):
        """
        Count the barcodes with ``BarcodeSeqLib.calculate`` and combine
        them into variant counts using the barcode map.

        When ``filter_unmapped`` is set, barcodes missing from the map are
        moved to ``df_dict['barcodes_unmapped']`` and dumped to disk.
        Variants rejected by ``count_variant`` (it returns ``None``) are
        added to the ``'max mutations'`` filter statistics; for the rest
        the barcode is recorded in the barcode map under its variant
        string. The result in ``df_dict['variants']`` is a data frame with
        a single ``count`` column sorted in descending order.

        Raises ``EnrichError`` if no variants were counted.
        """
        BarcodeSeqLib.calculate(self)  # count the barcodes
        self.df_dict['variants'] = dict()

        logging.info(
            "Converting barcodes to variants [{name}]".format(name=self.name))
        if self.filter_unmapped:
            map_mask = self.df_dict['barcodes'].index.isin(self.barcode_map)
            # invert with ``~``: unary minus on a boolean array is rejected
            # by current NumPy releases
            self.df_dict['barcodes_unmapped'] = self.df_dict['barcodes'][
                ~map_mask]
            self.df_dict['barcodes'] = self.df_dict['barcodes'][map_mask]
            del map_mask
            logging.info(
                "Writing counts for {n} unique unmapped barcodes to disk [{name}]"
                .format(n=len(self.df_dict['barcodes_unmapped']),
                        name=self.name))
            self.dump_data(keys=['barcodes_unmapped'])  # save memory

        # count variants associated with the barcodes
        for bc, count in self.df_dict['barcodes'].iterrows():
            count = count['count']
            variant = self.barcode_map[bc]
            mutations = self.count_variant(variant, copies=count)
            if mutations is None:  # variant has too many mutations
                self.filter_stats['max mutations'] += count
                self.filter_stats['total'] += count
                if self.report_filtered:
                    self.report_filtered_variant(variant, count)
                if bc not in self.barcode_map.bc_variant_strings:
                    self.barcode_map.bc_variant_strings[bc] = FILTERED_VARIANT
            else:
                if mutations not in self.barcode_map.variants:
                    self.barcode_map.variants[mutations] = set()
                self.barcode_map.variants[mutations].update([bc])
                self.barcode_map.bc_variant_strings[bc] = mutations

        self.df_dict['variants'] = \
                pd.DataFrame.from_dict(self.df_dict['variants'],
                                       orient="index", dtype="int32")
        if len(self.df_dict['variants']) == 0:
            raise EnrichError("Failed to count variants", self.name)
        self.df_dict['variants'].columns = ['count']
        self.df_dict['variants'].sort('count', ascending=False, inplace=True)

        logging.info(
            "Retained counts for {n} variants ({u} unique) [{name}]".format(
                n=self.df_dict['variants']['count'].sum(),
                u=len(self.df_dict['variants'].index),
                name=self.name))
        if self.aligner is not None:
            logging.info("Aligned {n} variants [{name}]".format(
                n=self.aligner.calls, name=self.name))
            self.aligner_cache = None
        self.report_filter_stats()
Ejemplo n.º 16
0
    def calculate(self):
        """
        Read the FASTQ file in ``self.reads``, trim every read to the
        barcode region (reverse-complementing it when ``revcomp_reads`` is
        set), apply the configured quality filters and count the barcodes
        of the reads that pass.

        The counts end up in ``counts['barcodes']`` as a data frame with a
        single ``count`` column, restricted to barcodes whose count is
        greater than ``min_count``. Filter statistics are reported unless
        the object was created as part of a barcode-variant library.

        Raises ``EnrichError`` if no barcodes were counted.
        """
        barcode_counts = dict()
        self.counts['barcodes'] = barcode_counts

        # flags for verbose output of filtered reads
        filter_flags = dict.fromkeys(self.filters, False)

        # count all the barcodes
        for fq in read_fastq(self.reads):
            fq.trim_length(self.bc_length, start=self.bc_start)
            if self.revcomp_reads:
                fq.revcomp()

            # start every read with a clean set of flags
            for flag in filter_flags:
                filter_flags[flag] = False

            # filter the barcode based on specified quality settings
            if self.filters['chastity'] and not fq.is_chaste():
                self.filter_stats['chastity'] += 1
                filter_flags['chastity'] = True
            if (self.filters['min quality'] > 0
                    and fq.min_quality() < self.filters['min quality']):
                self.filter_stats['min quality'] += 1
                filter_flags['min quality'] = True
            if (self.filters['avg quality'] > 0
                    and fq.mean_quality() < self.filters['avg quality']):
                self.filter_stats['avg quality'] += 1
                filter_flags['avg quality'] = True

            if any(filter_flags.values()):  # failed quality filtering
                self.filter_stats['total'] += 1
                if self.verbose:
                    self.report_filtered_read(fq, filter_flags)
            else:  # passed quality filtering
                barcode = fq.sequence.upper()
                barcode_counts[barcode] = barcode_counts.get(barcode, 0) + 1

        counts_frame = pd.DataFrame.from_dict(barcode_counts,
                                              orient="index",
                                              dtype="int32")
        self.counts['barcodes'] = counts_frame
        if len(counts_frame) == 0:
            raise EnrichError("Failed to count barcodes", self.name)
        counts_frame.columns = ['count']
        abundant = counts_frame['count'] > self.min_count
        self.counts['barcodes'] = counts_frame[abundant]

        kept = self.counts['barcodes']
        logging.info("Counted %d barcodes (%d unique) [%s]" %
                     (kept['count'].sum(), len(kept.index), self.name))
        if not self.barcodevariant:
            self.report_filter_stats()
Ejemplo n.º 17
0
    def __init__(self, config):
        """
        Build the experiment from *config*.

        Each entry of ``config['conditions']`` needs an alphanumeric
        ``'label'``, a list of ``'selections'`` and a ``'control'`` flag.
        A ``Selection`` object is created for every selection config, which
        inherits this object's output base when it has no
        ``'output directory'`` of its own. At most one condition may be the
        control.

        Only data types present in every selection are registered in
        ``df_dict``. Scores are disabled as soon as any selection has
        exactly two timepoints.

        Raises ``EnrichError`` for invalid labels, multiple controls,
        missing config entries, or when the selections share no data type.
        """
        DataContainer.__init__(self, config)
        self.conditions = dict()
        self.control = None
        self.use_scores = True

        try:
            for cnd in config['conditions']:
                label = cnd['label']
                if not label.isalnum():
                    raise EnrichError(
                        "Alphanumeric label required for condition '%s'" %
                        label, self.name)

                # assign output base if not present
                for sel_config in cnd['selections']:
                    if 'output directory' not in sel_config:
                        sel_config['output directory'] = self.output_base

                self.conditions[label] = [
                    selection.Selection(x) for x in cnd['selections']
                ]

                if cnd['control']:
                    if self.control is not None:
                        raise EnrichError("Multiple control conditions",
                                          self.name)
                    self.control = self.conditions[label]
        except KeyError as key:
            raise EnrichError("Missing required config value %s" % key,
                              self.name)

        all_selections = [
            sel for label in self.conditions
            for sel in self.conditions[label]
        ]

        # keep only the data types that every selection provides
        for dtype in all_selections[0].df_dict:
            if all(dtype in sel.df_dict for sel in all_selections):
                self.df_dict[dtype] = True
        if len(self.df_dict) == 0:
            raise EnrichError(
                "No enrichment data present across all selections", self.name)

        for label in self.conditions:
            if any(len(sel.timepoints) == 2
                   for sel in self.conditions[label]):
                self.use_scores = False
Ejemplo n.º 18
0
 def set_output_base(self, dirname):
     """
     Make *dirname* the object's base output directory, creating the
     directory first when it does not exist yet.

     Raises ``EnrichError`` if the directory cannot be created; in that
     case ``output_base`` is left unchanged.
     """
     try:
         already_present = os.path.exists(dirname)
         if not already_present:
             os.makedirs(dirname)
     except OSError:
         raise EnrichError("Failed to create output directory", self.name)
     else:
         self.output_base = dirname
Ejemplo n.º 19
0
    def set_wt(self, sequence, coding=True):
        """
        Set the wild type DNA *sequence*, stored in upper case as
        ``wt_dna``. Whitespace in *sequence* is removed first; what remains
        may only contain ``ACGT`` in either case and must not be empty.

        If *coding* is true, *sequence* must be in-frame (its length a
        multiple of three) and its translation, looked up codon by codon
        in ``codon_table``, is stored as ``wt_protein``. Otherwise
        ``wt_protein`` is set to ``None``.

        Raises ``EnrichError`` if the sequence contains unexpected
        characters or, for coding sequences, incomplete codons.
        """
        sequence = "".join(sequence.split())  # remove whitespace

        if not re.match("^[ACGTacgt]+$", sequence):
            raise EnrichError(
                "WT DNA sequence contains unexpected "
                "characters", self.name)
        if len(sequence) % 3 != 0 and coding:
            raise EnrichError("WT DNA sequence contains incomplete codons",
                              self.name)

        self.wt_dna = sequence.upper()
        if coding:
            # translate one codon at a time and join once at the end
            self.wt_protein = "".join(
                codon_table[self.wt_dna[i:i + 3]]
                for i in range(0, len(self.wt_dna), 3))
        else:
            self.wt_protein = None
Ejemplo n.º 20
0
    def __init__(self, config):
        """
        Initialize the container's bookkeeping attributes and read the
        object's name from *config*.

        ``config['name']`` is required. When ``'output directory'`` is
        present it is passed to ``set_output_base``; otherwise
        ``output_base`` stays ``None``.

        Raises ``EnrichError`` if the name is missing.
        """
        # placeholder name, used in the error message if none is configured
        self.name = "Unnamed" + self.__class__.__name__
        self.df_dict = {}
        self.df_files = {}
        self.filters = None
        self.filter_stats = None
        self.output_base = None

        try:
            configured_name = config['name']
        except KeyError as key:
            raise EnrichError(
                "Missing required config value {key}".format(key=key),
                self.name)
        else:
            self.name = configured_name

        if 'output directory' in config:
            self.set_output_base(config['output directory'])
Ejemplo n.º 21
0
    def write_variants(self, fname):
        """
        Write a list of barcodes for each variant to the file *fname*.

        Each line holds the variant, a tab, and the variant's barcodes
        joined with ``", "``. Lines are ordered by the barcode collections
        (the values of ``self.variants``).

        Raises ``EnrichError`` if *fname* cannot be opened for writing.
        """
        try:
            handle = open(fname, "w")
        except IOError:
            raise EnrichError(
                "Could not open variant barcode map file '%s' "
                "for writing" % fname, self.name)

        # close the file even if writing one of the lines fails
        try:
            for variant, barcodes in \
                    sorted(self.variants.items(), key=lambda x: x[1]):
                print(variant, ", ".join(barcodes), sep="\t", file=handle)
        finally:
            handle.close()

        logging.info('Wrote BarcodeMap variants file "%s" [%s]' %
                     (fname, self.name))
Ejemplo n.º 22
0
    def calculate(self):
        """
        Count the barcodes with ``BarcodeSeqLib.calculate`` and combine
        them into variant counts using the barcode map.

        When ``filter_unmapped`` is set, barcodes missing from the map are
        moved to ``counts['barcodes_unmapped']``. Variants rejected by
        ``count_variant`` (it returns ``None``) are added to the
        ``'max mutations'`` filter statistics; for the rest the barcode is
        appended to the barcode map's list for that variant string. The
        result in ``counts['variants']`` is a data frame with a single
        ``count`` column.

        Raises ``EnrichError`` if no variants were counted.
        """
        BarcodeSeqLib.calculate(self)  # count the barcodes
        self.counts['variants'] = dict()

        if self.filter_unmapped:
            map_mask = self.counts['barcodes'].index.isin(self.barcode_map)
            # invert with ``~``: unary minus on a boolean array is rejected
            # by current NumPy releases
            self.counts['barcodes_unmapped'] = self.counts['barcodes'][
                ~map_mask]
            self.counts['barcodes'] = self.counts['barcodes'][map_mask]
            del map_mask

        # count variants associated with the barcodes
        for bc, count in self.counts['barcodes'].iterrows():
            count = count['count']
            variant = self.barcode_map[bc]
            mutations = self.count_variant(variant, copies=count)
            if mutations is None:  # variant has too many mutations
                self.filter_stats['max mutations'] += count
                self.filter_stats['total'] += count
                if self.verbose:
                    self.report_filtered_variant(variant, count)
            else:
                if mutations not in self.barcode_map.variants:
                    self.barcode_map.variants[mutations] = list()
                if bc not in self.barcode_map.variants[mutations]:
                    self.barcode_map.variants[mutations].append(bc)

        self.counts['variants'] = \
                pd.DataFrame.from_dict(self.counts['variants'],
                                       orient="index", dtype="int32")
        if len(self.counts['variants']) == 0:
            raise EnrichError("Failed to count variants", self.name)
        self.counts['variants'].columns = ['count']

        logging.info("Counted %d variants (%d unique) [%s]" % \
                (self.counts['variants']['count'].sum(), len(self.counts['variants'].index), self.name))
        if self.aligner is not None:
            logging.info("Aligned %d variants [%s]" %
                         (self.aligner.calls, self.name))
        self.report_filter_stats()
Ejemplo n.º 23
0
    def write_data(self, subdirectory=None, keys=None):
        """
        Write the stored :py:class:`pandas.DataFrame` objects to
        tab-separated ``.tsv`` files, one file per key of ``self.df_dict``.

        Files are placed in a directory named after this object inside
        ``self.output_base``, nested under *subdirectory* when one is given.
        The directory is created on demand.

        *keys* optionally restricts which entries of ``self.df_dict`` are
        written (variant, barcode, etc.); by default all of them are.
        Entries whose value is ``None`` are skipped.

        Returns a dictionary mapping each written key to the path of its
        ``.tsv`` file. This dictionary is required by :py:meth:`dump_data`.

        Raises :py:class:`EnrichError` if the output directory cannot be
        created.
        """
        path_parts = [self.output_base]
        if subdirectory is not None:
            path_parts.append(fix_filename(subdirectory))
        path_parts.append(fix_filename(self.name))
        directory = os.path.join(*path_parts)

        candidates = self.df_dict.keys() if keys is None else keys
        keys = [k for k in candidates if self.df_dict[k] is not None]

        fname_dict = {}
        for key in keys:
            # Checked per key, so nothing is created when there is no data
            # to write.
            try:
                if not os.path.exists(directory):
                    os.makedirs(directory)
            except OSError:
                raise EnrichError("Failed to create output directory",
                                  self.name)
            target = os.path.join(directory, fix_filename(key + ".tsv"))
            frame = self.df_dict[key]
            frame.to_csv(target, sep="\t", na_rep="NaN",
                         float_format="%.4g", index_label="sequence")
            fname_dict[key] = target

        written = ", ".join(keys)
        logging.info("Successfully wrote data frames ({keys}) [{name}]".format(
            name=self.name, keys=written))
        return fname_dict
Ejemplo n.º 24
0
    def __init__(self, config):
        """
        Build the object and its sequencing libraries from *config*.

        Entries of *config* read here:

        * ``'barcodes'`` / ``'map file'`` (optional): a shared
          :py:class:`BarcodeMap` handed to every ``BarcodeVariantSeqLib``.
        * ``'libraries'`` (required): one config per library. Libraries are
          grouped by their ``timepoint`` and must have unique names.
        * ``'filters'`` (required): passed to ``set_filters`` with defaults.
        * ``'carryover correction'`` (optional): only the ``"nonsense"``
          method is recognised.
        * ``'correction'`` (optional): the ``"stop"`` method requires coding
          libraries and a ``'length percentile'`` entry.

        Raises :py:class:`EnrichError` for missing or invalid config values,
        unrecognised library configs, duplicate library names, fewer than
        two timepoints, a missing timepoint 0, negative timepoints, or when
        no count data type is shared by all libraries.
        """
        DataContainer.__init__(self, config)
        self.libraries = dict()
        self.timepoints = list()

        try:
            # a shared barcode map is optional at this level
            if 'barcodes' in config and 'map file' in config['barcodes']:
                self.barcode_map = BarcodeMap(config['barcodes']['map file'])
            else:
                self.barcode_map = None

            libnames = list()
            for lib in config['libraries']:
                if 'output directory' not in lib:
                    lib['output directory'] = self.output_base
                libtype = seqlib_type(lib)
                if libtype is None:
                    raise EnrichError("Unrecognized SeqLib config", self.name)
                elif libtype == "BarcodeVariantSeqLib":
                    new = BarcodeVariantSeqLib(lib,
                                               barcode_map=self.barcode_map)
                else:
                    # look up the library class by the name seqlib_type gave
                    new = globals()[libtype](lib)

                if new.output_base is None:
                    new.set_output_base(self.output_base)

                if new.timepoint not in self.libraries:
                    self.libraries[new.timepoint] = list()
                self.libraries[new.timepoint].append(new)
                libnames.append(new.name)
            self.timepoints = sorted(self.libraries.keys())

            if len(set(libnames)) != len(libnames):
                raise EnrichError("Non-unique library names", self.name)

            self.set_filters(
                config['filters'], {
                    'min count': 0,
                    'min input count': 0,
                    'min rsquared': 0.0,
                    'max barcode variation': None
                })

            if 'carryover correction' in config:
                # The key must match the one tested above; a misspelled key
                # here would make every carryover config fail.
                if config['carryover correction']['method'] == "nonsense":
                    self.ns_carryover_fn = nonsense_ns_carryover_apply_fn
                    self.ns_carryover_kwargs = {
                        'position':
                        int(config['carryover correction']['position'])
                    }
                # add additional methods here using "elif" blocks
                else:
                    raise EnrichError(
                        "Unrecognized nonspecific carryover correction",
                        self.name)
            else:
                self.ns_carryover_fn = None
                self.ns_carryover_kwargs = None

        except KeyError as key:
            raise EnrichError("Missing required config value %s" % key,
                              self.name)
        except ValueError as value:
            raise EnrichError("Invalid parameter value %s" % value, self.name)

        if len(self.libraries.keys()) < 2:
            raise EnrichError("Insufficient number of timepoints", self.name)

        if 0 not in self.timepoints:
            raise EnrichError("Missing timepoint 0", self.name)
        # timepoints are sorted, so a first element other than 0 is negative
        if self.timepoints[0] != 0:
            raise EnrichError("Invalid negative timepoint", self.name)

        # identify what kind of counts data is present in all timepoints
        dtype_counts = list()
        for tp in self.timepoints:
            for lib in self.libraries[tp]:
                dtype_counts.extend(lib.counts.keys())
        dtype_counts = Counter(dtype_counts)
        for dtype in dtype_counts:
            if dtype_counts[dtype] == len(config['libraries']):
                self.df_dict[dtype] = True
        if 'barcodes_unmapped' in self.df_dict:
            # special case for BarcodeVariantSeqLib
            del self.df_dict['barcodes_unmapped']
        if len(self.df_dict) == 0:
            raise EnrichError("No count data present across all timepoints",
                              self.name)

        # Default to "no correction" so the attribute always exists, even
        # when the configured method is not one handled below.
        self.correction = None
        try:
            if 'correction' in config:
                if config['correction']['method'] == "stop":
                    # self.libraries[0] is the list of libraries at
                    # timepoint 0, so each library is checked individually.
                    if not all(lib.is_coding()
                               for lib in self.libraries[0]):
                        raise EnrichError(
                            "Invalid correction method for "
                            "noncoding sequences", self.name)
                    config['correction']['length percentile']  # must exist
                    self.correction = config['correction']
        except KeyError as key:
            raise EnrichError("Missing required config value %s" % key,
                              self.name)
Ejemplo n.º 25
0
    def calculate(self):
        """
        Reads the forward or reverse FASTQ file (reverse reads are 
        reverse-complemented), performs quality-based filtering, and counts 
        the barcodes.

        The counts are stored in ``self.df_dict['barcodes']`` as a
        :py:class:`pandas.DataFrame` with a single ``'count'`` column,
        sorted by descending count. When ``'barcodes_low_abundance'`` is a
        key of ``self.df_dict``, barcodes with a count below
        ``self.min_count`` are moved into that entry, handed to
        ``dump_data``, and removed from the retained counts.

        Raises :py:class:`EnrichError` if no barcode passed the filters.
        """
        barcode_counts = self.df_dict['barcodes'] = dict()

        # one flag per filter, reset for every read
        filter_flags = dict()
        for key in self.filters:
            filter_flags[key] = False

        # count all the barcodes
        logging.info("Counting barcodes [{name}]".format(name=self.name))
        for fq in read_fastq(self.reads):
            fq.trim_length(self.bc_length, start=self.bc_start)
            if self.revcomp_reads:
                fq.revcomp()

            for key in filter_flags:
                filter_flags[key] = False

            # filter the barcode based on specified quality settings
            if self.filters['chastity']:
                if not fq.is_chaste():
                    self.filter_stats['chastity'] += 1
                    filter_flags['chastity'] = True
            if self.filters['min quality'] > 0:
                if fq.min_quality() < self.filters['min quality']:
                    self.filter_stats['min quality'] += 1
                    filter_flags['min quality'] = True
            if self.filters['avg quality'] > 0:
                if fq.mean_quality() < self.filters['avg quality']:
                    self.filter_stats['avg quality'] += 1
                    filter_flags['avg quality'] = True
            if any(filter_flags.values()):  # failed quality filtering
                self.filter_stats['total'] += 1
                if self.report_filtered:
                    self.report_filtered_read(fq, filter_flags)
            else:  # passed quality filtering
                barcode = fq.sequence.upper()
                barcode_counts[barcode] = barcode_counts.get(barcode, 0) + 1

        self.df_dict['barcodes'] = \
                pd.DataFrame.from_dict(barcode_counts,
                                       orient="index", dtype="int32")
        if len(self.df_dict['barcodes']) == 0:
            raise EnrichError("Failed to count barcodes", self.name)
        self.df_dict['barcodes'].columns = ['count']
        # DataFrame.sort() has been removed from pandas; sort_values() is
        # its replacement (requires pandas >= 0.17).
        self.df_dict['barcodes'].sort_values('count', ascending=False,
                                             inplace=True)
        if 'barcodes_low_abundance' in self.df_dict:  # min count is set
            is_low = self.df_dict['barcodes']['count'] < self.min_count
            self.df_dict['barcodes_low_abundance'] = \
                    self.df_dict['barcodes'][is_low]
            logging.info(
                "Writing counts for {n} unique low-abundance barcodes to disk [{name}]"
                .format(n=len(self.df_dict['barcodes_low_abundance']),
                        name=self.name))
            self.dump_data(keys=['barcodes_low_abundance'])
            self.df_dict['barcodes'] = self.df_dict['barcodes'][
                self.df_dict['barcodes']['count'] >= self.min_count]

        logging.info(
            "Retained counts for {n} barcodes ({u} unique) [{name}]".format(
                n=self.df_dict['barcodes']['count'].sum(),
                u=len(self.df_dict['barcodes'].index),
                name=self.name))
        # when self.barcodevariant is set, the filter statistics are not
        # reported from here
        if not self.barcodevariant:
            self.report_filter_stats()
Ejemplo n.º 26
0
                        action="store_true",
                        default=False,
                        dest="report_filtered",
                        help="output filtered reads to log file")
    parser.add_argument("--no-plots",
                        help="don't make plots",
                        dest="plots",
                        action="store_false",
                        default=True)
    args = parser.parse_args()

    if args.report_filtered:
        log_level = logging.DEBUG
        if args.log is None:
            raise EnrichError(
                "Cannot report filtered reads without a log file",
                _DRIVER_NAME)
    else:
        log_level = logging.INFO

    if args.log:
        logging.basicConfig(filename=args.log, level=log_level)

    try:
        config = json.load(open(args.config, "U"))
    except IOError:
        raise EnrichError('Failed to open "%s"' % args.config, _DRIVER_NAME)
    except ValueError:
        raise EnrichError("Improperly formatted .json file", _DRIVER_NAME)

    if config_check.is_experiment(config):
Ejemplo n.º 27
0
    def calculate(self):
        """
        Merge every forward/reverse read pair, apply the configured
        quality filters to the result, and count the variants.

        Pairs rejected by the chastity filter or by a failed merge are
        tallied in ``self.filter_stats`` and skipped. Merged reads are then
        checked against the remaining filters and, if they pass, handed to
        :py:meth:`count_variant`. The final counts are stored in
        ``self.counts['variants']`` as a :py:class:`pandas.DataFrame` with a
        single ``'count'`` column.

        Raises :py:class:`EnrichError` if no variants were counted.
        """
        self.counts['variants'] = dict()

        # flags for verbose output of filtered reads; the same dictionary is
        # reused (and reset) for every read pair
        filter_flags = dict.fromkeys(self.filters, False)

        for fwd, rev in read_fastq_multi([self.forward, self.reverse]):
            filter_flags.update(dict.fromkeys(filter_flags, False))

            # chastity is checked on the individual reads, before merging
            if self.filters['chastity']:
                for read in (fwd, rev):
                    if not read.is_chaste():
                        filter_flags['chastity'] = True
                        if self.verbose:
                            self.report_filtered_read(read, filter_flags)
                if filter_flags['chastity']:
                    self.filter_stats['chastity'] += 1
                    self.filter_stats['total'] += 1
                    continue

            merge = self.merge_reads(fwd, rev)
            if merge is None:  # merge failed
                self.filter_stats['merge failure'] += 1
                self.filter_stats['total'] += 1
                filter_flags['merge failure'] = True
                if self.verbose:
                    self.report_filtered_read(fwd, filter_flags)
                    self.report_filtered_read(rev, filter_flags)
                continue

            # remaining filters are applied to the merged read
            if self.filters['remove unresolvable'] and 'X' in merge.sequence:
                self.filter_stats['remove unresolvable'] += 1
                filter_flags['remove unresolvable'] = True

            min_quality = self.filters['min quality']
            if min_quality > 0 and merge.min_quality() < min_quality:
                self.filter_stats['min quality'] += 1
                filter_flags['min quality'] = True

            avg_quality = self.filters['avg quality']
            if avg_quality > 0 and merge.mean_quality() < avg_quality:
                self.filter_stats['avg quality'] += 1
                filter_flags['avg quality'] = True

            rejected = any(filter_flags.values())
            if not rejected:  # passed quality filtering
                if self.count_variant(merge.sequence) is None:
                    # merged read has too many mutations
                    self.filter_stats['max mutations'] += 1
                    filter_flags['max mutations'] = True
                    rejected = True

            if rejected:
                self.filter_stats['total'] += 1
                if self.verbose:
                    self.report_filtered_read(merge, filter_flags)

        variant_frame = pd.DataFrame.from_dict(self.counts['variants'],
                                               orient="index", dtype="int32")
        self.counts['variants'] = variant_frame
        if len(variant_frame) == 0:
            raise EnrichError("Failed to count variants", self.name)
        variant_frame.columns = ['count']

        summary = (variant_frame['count'].sum(), len(variant_frame.index),
                   self.name)
        logging.info("Counted %d variants (%d unique) [%s]" % summary)
        if self.aligner is not None:
            logging.info("Aligned %d variants [%s]" %
                         (self.aligner.calls, self.name))
        self.report_filter_stats()
Ejemplo n.º 28
0
    def count_variant(self, variant_dna, copies=1, include_indels=True):
        """
        Identifies mutations and counts the *variant_dna* sequence.

        The algorithm attempts to call variants by comparing base-by-base.
        If the *variant_dna* and wild type DNA are different lengths, or if
        there are an excess of mismatches (indicating a possible indel),
        local alignment is performed using :py:meth:`align_variant` if this
        option has been selected in the configuration.

        Each mutation is written as an HGVS-style string (``c.`` with the
        protein consequence for coding sequences, ``n.`` otherwise) and the
        variant is stored as these strings joined by ``", "``. The count of
        that variant in ``self.df_dict['variants']`` is increased by
        *copies*.

        *include_indels* is accepted but not used by this method.

        Returns the variant string, or ``WILD_TYPE_VARIANT`` if no mutations
        were found. Returns ``None`` (and counts nothing) if the variant was
        discarded: either it has more than ``'max mutations'`` mismatches,
        or its length differs from the wild type and no aligner is
        configured.

        Raises :py:class:`EnrichError` if *variant_dna* contains characters
        other than ``ACGTNX`` (in either case).
        """
        if not re.match("^[ACGTNXacgtnx]+$", variant_dna):
            raise EnrichError(
                "Variant DNA sequence contains unexpected "
                "characters", self.name)

        variant_dna = variant_dna.upper()

        if len(variant_dna) != len(self.wt_dna):
            if self.aligner is not None:
                mutations = self.align_variant(variant_dna)
            else:
                return None
        else:
            mutations = list()
            paired_bases = zip(self.wt_dna, variant_dna)
            for i, (wt_base, variant_base) in enumerate(paired_bases):
                if variant_base == wt_base:
                    continue
                mutations.append(
                    (i, "{pre}>{post}".format(pre=wt_base,
                                              post=variant_base)))
                if len(mutations) > self.filters['max mutations']:
                    if self.aligner is None:
                        # too many mutations and not using aligner
                        return None
                    mutations = self.align_variant(variant_dna)
                    if len(mutations) > self.filters['max mutations']:
                        # too many mutations post-alignment
                        return None
                    # the alignment replaces the base-by-base calls, so
                    # stop looping over this variant
                    break

        mutation_strings = list()
        if self.is_coding():
            residues = list()
            for i in range(0, len(variant_dna), 3):
                try:
                    residues.append(codon_table[variant_dna[i:i + 3]])
                except KeyError:  # garbage codon due to indel
                    residues.append('?')
            variant_protein = "".join(residues)

            for pos, change in mutations:
                # ``//`` keeps these integers regardless of whether true
                # division is in effect; they are used as string indices.
                codon_index = pos // 3
                ref_dna_pos = pos + self.reference_offset + 1
                ref_pro_pos = (pos + self.reference_offset) // 3 + 1
                mut = "c.{pos}{change}".format(pos=ref_dna_pos, change=change)
                if has_indel(change):
                    mut += " (p.{pre}{pos}fs)".format(
                        pre=aa_codes[self.wt_protein[codon_index]],
                        pos=ref_pro_pos)
                elif variant_protein[codon_index] == \
                        self.wt_protein[codon_index]:
                    mut += " (p.=)"
                else:
                    mut += " (p.{pre}{pos}{post})".format(
                        pre=aa_codes[self.wt_protein[codon_index]],
                        pos=ref_pro_pos,
                        post=aa_codes[variant_protein[codon_index]])
                mutation_strings.append(mut)
        else:
            for pos, change in mutations:
                ref_dna_pos = pos + self.reference_offset + 1
                mut = "n.{pos}{change}".format(pos=ref_dna_pos, change=change)
                mutation_strings.append(mut)

        if len(mutation_strings) > 0:
            variant_string = ', '.join(mutation_strings)
        else:
            variant_string = WILD_TYPE_VARIANT
        try:
            self.df_dict['variants'][variant_string] += copies
        except KeyError:
            self.df_dict['variants'][variant_string] = copies
        return variant_string