Esempio n. 1
0
def trim_fastq(outdir, files, start, end, length):
    """
    Trim every read of each FASTQ file and write the result into *outdir*.

    Each output file is named after its input file with ``.trim`` inserted
    before the extension. When *length* is given, reads are trimmed with
    ``trim_length(length, start)``; otherwise with ``trim(start, end)``.
    A missing *start* is always replaced by ``1``.

    Args:
        outdir: directory the trimmed files are written to.
        files: paths of the FASTQ files to trim.
        start: start position of the region to keep, or ``None``.
        end: end position of the region to keep, or ``None``.
        length: length of the region to keep, or ``None``. If *end* is also
            given it must satisfy ``end - start + 1 == length`` (with a
            missing *start* counted as 1).

    Returns:
        ``None``. If no files are given, or *start*/*end*/*length* do not
        agree, an error is printed to stderr and nothing is written.
    """
    if not files:
        print("Error: no files provided", file=stderr)
        return

    length_mode = length is not None
    if length_mode and end is not None:
        if start is not None:
            if (end - start + 1) != length:
                print("Error: start/end/length do not agree", file=stderr)
                return
        elif length != end:  # start is None, so length should equal end
            print("Error: length/end do not agree", file=stderr)
            return

    # A missing start always means "from the first position". This also
    # covers the length + end case, which previously left start as None and
    # passed it on to trim_length().
    if start is None:
        start = 1

    for path in files:
        name, ext = os.path.splitext(os.path.basename(path))
        outfile_name = os.path.join(outdir, name + ".trim" + ext)
        with open(outfile_name, "w") as handle:
            for read in read_fastq(path):
                if length_mode:
                    read.trim_length(length, start)
                else:
                    read.trim(start, end)
                print(read, file=handle)
Esempio n. 2
0
def trim_fastq(outdir, files, start, end, length):
    """
    Write trimmed copies of the given FASTQ files into *outdir*.

    The output name is the input's base name with ``.trim`` inserted before
    the extension. Reads are trimmed by length (``trim_length``) when
    *length* is given, and by position (``trim``) otherwise.

    Args:
        outdir: directory that receives the trimmed files.
        files: sequence of FASTQ file paths.
        start: start position, or ``None`` to use 1.
        end: end position, or ``None``.
        length: number of positions to keep, or ``None``.

    Returns:
        ``None``. An error is printed to stderr, and no file is written,
        when *files* is empty or *start*/*end*/*length* are inconsistent.
    """
    if len(files) == 0:
        print("Error: no files provided", file=stderr)
        return

    elif length is not None:
        length_mode = True
        if start is not None and end is not None:
            if (end - start + 1) != length:
                print("Error: start/end/length do not agree", file=stderr)
                return
        elif end is not None: # start is None, so length should equal end
            if length != end:
                print("Error: length/end do not agree", file=stderr)
                return
            # length == end means the region starts at 1; set it explicitly
            # so trim_length() is not handed start=None
            start = 1
        elif start is None:
            start = 1
    else:
        length_mode = False
        if start is None:
            start = 1

    for f in files:
        name, ext = os.path.splitext(os.path.basename(f))
        outfile_name = name + ".trim" + ext
        outfile_name = os.path.join(outdir, outfile_name)
        with open(outfile_name, "w") as handle:
            for read in read_fastq(f):
                if length_mode:
                    read.trim_length(length, start)
                else:
                    read.trim(start, end)
                print(read, file=handle)
Esempio n. 3
0
    def calculate(self):
        """
        Count the barcodes found in this object's FASTQ file.

        Each read is trimmed to the barcode region, reverse-complemented if
        ``revcomp_reads`` is set, and checked against the enabled quality
        filters. Reads that pass are tallied by upper-cased sequence. The
        tallies end up in ``self.counts['barcodes']`` as a data frame with a
        single ``count`` column, restricted to counts above
        ``self.min_count``.
        """
        tallies = dict()
        self.counts['barcodes'] = tallies

        # one flag per filter, used for verbose output of filtered reads
        failed = dict.fromkeys(self.filters, False)

        for read in read_fastq(self.reads):
            read.trim_length(self.bc_length, start=self.bc_start)
            if self.revcomp_reads:
                read.revcomp()

            # clear the flags left over from the previous read
            for name in failed:
                failed[name] = False

            # apply each enabled quality filter
            if self.filters['chastity'] and not read.is_chaste():
                self.filter_stats['chastity'] += 1
                failed['chastity'] = True
            if self.filters['min quality'] > 0 and \
                    read.min_quality() < self.filters['min quality']:
                self.filter_stats['min quality'] += 1
                failed['min quality'] = True
            if self.filters['avg quality'] > 0 and \
                    read.mean_quality() < self.filters['avg quality']:
                self.filter_stats['avg quality'] += 1
                failed['avg quality'] = True

            if not any(failed.values()):
                # read passed every filter: tally its barcode
                barcode = read.sequence.upper()
                tallies[barcode] = tallies.get(barcode, 0) + 1
                continue

            self.filter_stats['total'] += 1
            if self.verbose:
                self.report_filtered_read(read, failed)

        table = pd.DataFrame.from_dict(tallies, orient="index",
                                       dtype="int32")
        self.counts['barcodes'] = table
        if len(table) == 0:
            raise EnrichError("Failed to count barcodes", self.name)
        table.columns = ['count']
        table = table[table['count'] > self.min_count]
        self.counts['barcodes'] = table

        logging.info("Counted %d barcodes (%d unique) [%s]" %
                     (table['count'].sum(), len(table.index), self.name))
        if not self.barcodevariant:
            self.report_filter_stats()
Esempio n. 4
0
    def calculate(self):
        """
        Tally barcodes from the FASTQ file named by ``self.reads``.

        Reads are trimmed to ``bc_length`` starting at ``bc_start`` and
        reverse-complemented when ``revcomp_reads`` is true. A read failing
        any enabled filter (chastity, minimum quality, average quality) is
        added to the filter statistics; all other reads are counted by
        upper-cased sequence. The result replaces ``self.counts['barcodes']``
        with a one-column (``count``) data frame containing only the rows
        whose count is greater than ``self.min_count``.
        """
        self.counts['barcodes'] = dict()
        seen = self.counts['barcodes']

        # flags for verbose output of filtered reads
        flags = dict()
        for filter_name in self.filters:
            flags[filter_name] = False

        # count all the barcodes
        for record in read_fastq(self.reads):
            record.trim_length(self.bc_length, start=self.bc_start)
            if self.revcomp_reads:
                record.revcomp()

            for filter_name in flags:
                flags[filter_name] = False

            # filter the barcode based on specified quality settings
            if self.filters['chastity']:
                if not record.is_chaste():
                    self.filter_stats['chastity'] += 1
                    flags['chastity'] = True
            for setting, measure in (('min quality', record.min_quality),
                                     ('avg quality', record.mean_quality)):
                if self.filters[setting] > 0:
                    if measure() < self.filters[setting]:
                        self.filter_stats[setting] += 1
                        flags[setting] = True

            rejected = any(flags.values())
            if rejected:
                self.filter_stats['total'] += 1
                if self.verbose:
                    self.report_filtered_read(record, flags)
            else:
                sequence = record.sequence.upper()
                seen.setdefault(sequence, 0)
                seen[sequence] += 1

        self.counts['barcodes'] = pd.DataFrame.from_dict(
            seen, orient="index", dtype="int32")
        if len(self.counts['barcodes']) == 0:
            raise EnrichError("Failed to count barcodes", self.name)
        self.counts['barcodes'].columns = ['count']
        above_minimum = self.counts['barcodes']['count'] > self.min_count
        self.counts['barcodes'] = self.counts['barcodes'][above_minimum]

        total = self.counts['barcodes']['count'].sum()
        unique = len(self.counts['barcodes'].index)
        logging.info("Counted %d barcodes (%d unique) [%s]" %
                     (total, unique, self.name))
        if not self.barcodevariant:
            self.report_filter_stats()
Esempio n. 5
0
    def calculate(self):
        """
        Reads the forward or reverse FASTQ file (reverse reads are
        reverse-complemented), performs quality-based filtering, and counts
        the variants.

        Reads that pass the chastity, minimum quality and average quality
        filters are handed to ``count_variant``; a ``None`` result marks the
        read as having too many mutations. Afterwards
        ``self.df_dict['variants']`` is converted to a data frame with a
        single ``count`` column, sorted by count in descending order.

        Raises:
            EnrichError: if the resulting data frame is empty.
        """
        # presumably filled in by count_variant() -- not visible in this block
        self.df_dict['variants'] = dict()

        # one flag per filter, handed to report_filtered_read()
        filter_flags = dict()
        for key in self.filters:
            filter_flags[key] = False

        logging.info("Counting variants [{name}]".format(name=self.name))
        for fq in read_fastq(self.reads):
            if self.revcomp_reads:
                fq.revcomp()

            # reset the flags left over from the previous read
            for key in filter_flags:
                filter_flags[key] = False

            # filter the read based on specified quality settings
            if self.filters['chastity']:
                if not fq.is_chaste():
                    self.filter_stats['chastity'] += 1
                    filter_flags['chastity'] = True
            if self.filters['min quality'] > 0:
                if fq.min_quality() < self.filters['min quality']:
                    self.filter_stats['min quality'] += 1
                    filter_flags['min quality'] = True
            if self.filters['avg quality'] > 0:
                if fq.mean_quality() < self.filters['avg quality']:
                    self.filter_stats['avg quality'] += 1
                    filter_flags['avg quality'] = True
            if not any(filter_flags.values()):  # passed quality filtering
                mutations = self.count_variant(fq.sequence)
                if mutations is None:  # read has too many mutations
                    self.filter_stats['max mutations'] += 1
                    filter_flags['max mutations'] = True
            if any(filter_flags.values()):
                self.filter_stats['total'] += 1
                if self.report_filtered:
                    self.report_filtered_read(fq, filter_flags)

        self.df_dict['variants'] = \
                pd.DataFrame.from_dict(self.df_dict['variants'],
                                       orient="index", dtype="int32")
        if len(self.df_dict['variants']) == 0:
            raise EnrichError("Failed to count variants", self.name)
        self.df_dict['variants'].columns = ['count']
        # DataFrame.sort() was removed from pandas; sort_values() is the
        # replacement for sorting by column
        self.df_dict['variants'].sort_values('count', ascending=False,
                                             inplace=True)

        logging.info("Counted {n} variants ({u} unique) [{name}]".format(
            n=self.df_dict['variants']['count'].sum(),
            u=len(self.df_dict['variants'].index),
            name=self.name))
        if self.aligner is not None:
            logging.info("Aligned {n} variants [{name}]".format(
                n=self.aligner.calls, name=self.name))
            self.aligner_cache = None
        self.report_filter_stats()
Esempio n. 6
0
    def calculate(self):
        """
        Reads the forward or reverse FASTQ file (reverse reads are
        reverse-complemented), performs quality-based filtering, and counts
        the variants.

        The counts are stored in ``self.df_dict['variants']`` as a data frame
        with one ``count`` column, ordered from most to least abundant.
        Filtered reads are tallied in ``self.filter_stats`` and, when
        ``self.report_filtered`` is set, passed to ``report_filtered_read``.

        Raises:
            EnrichError: if no variants were counted.
        """
        self.df_dict['variants'] = dict()

        filter_flags = dict.fromkeys(self.filters, False)

        logging.info("Counting variants [{name}]".format(name=self.name))
        for fq in read_fastq(self.reads):
            if self.revcomp_reads:
                fq.revcomp()

            # start every read with all flags cleared
            for key in filter_flags:
                filter_flags[key] = False

            # filter the read based on specified quality settings
            if self.filters['chastity'] and not fq.is_chaste():
                self.filter_stats['chastity'] += 1
                filter_flags['chastity'] = True
            if self.filters['min quality'] > 0 and \
                    fq.min_quality() < self.filters['min quality']:
                self.filter_stats['min quality'] += 1
                filter_flags['min quality'] = True
            if self.filters['avg quality'] > 0 and \
                    fq.mean_quality() < self.filters['avg quality']:
                self.filter_stats['avg quality'] += 1
                filter_flags['avg quality'] = True

            if not any(filter_flags.values()): # passed quality filtering
                # count_variant() returning None means too many mutations
                if self.count_variant(fq.sequence) is None:
                    self.filter_stats['max mutations'] += 1
                    filter_flags['max mutations'] = True
            if any(filter_flags.values()):
                self.filter_stats['total'] += 1
                if self.report_filtered:
                    self.report_filtered_read(fq, filter_flags)

        self.df_dict['variants'] = \
                pd.DataFrame.from_dict(self.df_dict['variants'],
                                       orient="index", dtype="int32")
        if len(self.df_dict['variants']) == 0:
            raise EnrichError("Failed to count variants", self.name)
        self.df_dict['variants'].columns = ['count']
        # sort_values() replaces DataFrame.sort(), which current pandas
        # releases no longer provide
        self.df_dict['variants'].sort_values('count', ascending=False,
                                             inplace=True)

        logging.info("Counted {n} variants ({u} unique) [{name}]".format(
                n=self.df_dict['variants']['count'].sum(),
                u=len(self.df_dict['variants'].index),
                name=self.name))
        if self.aligner is not None:
            logging.info("Aligned {n} variants [{name}]".format(
                    n=self.aligner.calls, name=self.name))
            self.aligner_cache = None
        self.report_filter_stats()
Esempio n. 7
0
    def count(self):
        """
        Count barcodes in the FASTQ file referenced by ``self.reads``.

        Reads are trimmed to the barcode region, optionally
        reverse-complemented, and screened with the enabled quality filters.
        Passing reads are counted by upper-cased sequence. On return
        ``self.counts["barcodes"]`` is a data frame with one ``count`` column
        holding only the barcodes counted more than ``self.min_count`` times.
        """
        observed = dict()
        self.counts["barcodes"] = observed

        # flags for verbose output of filtered reads
        flags = dict.fromkeys(self.filters, False)

        for read in read_fastq(self.reads):
            read.trim_length(self.bc_length, start=self.bc_start)
            if self.revcomp_reads:
                read.revcomp()

            for name in flags:
                flags[name] = False

            # quality filters, applied in a fixed order
            if self.filters["chastity"] and not read.is_chaste():
                self.filter_stats["chastity"] += 1
                flags["chastity"] = True
            if self.filters["min quality"] > 0 and \
                    read.min_quality() < self.filters["min quality"]:
                self.filter_stats["min quality"] += 1
                flags["min quality"] = True
            if self.filters["avg quality"] > 0 and \
                    read.mean_quality() < self.filters["avg quality"]:
                self.filter_stats["avg quality"] += 1
                flags["avg quality"] = True

            if not any(flags.values()):
                # passed quality filtering
                barcode = read.sequence.upper()
                observed[barcode] = observed.get(barcode, 0) + 1
            else:
                # failed quality filtering
                self.filter_stats["total"] += 1
                if self.verbose:
                    self.report_filtered_read(self.log, read, flags)

        frame = pd.DataFrame.from_dict(observed, orient="index",
                                       dtype="int32")
        self.counts["barcodes"] = frame
        if len(frame) == 0:
            raise EnrichError("Failed to count barcodes", self.name)
        frame.columns = ["count"]
        self.counts["barcodes"] = frame[frame["count"] > self.min_count]
Esempio n. 8
0
    def count(self):
        """
        Count variants in the FASTQ file referenced by ``self.reads``.

        Reads are reverse-complemented when ``revcomp_reads`` is set and then
        screened with the enabled quality filters. Passing reads are given to
        ``count_variant``; a ``None`` result is recorded as a
        ``max mutations`` failure. Finally ``self.counts['variants']`` is
        converted into a data frame with a single ``count`` column.
        """
        self.counts['variants'] = dict()

        # flags for verbose output of filtered reads
        flags = dict.fromkeys(self.filters, False)

        for read in read_fastq(self.reads):
            if self.revcomp_reads:
                read.revcomp()

            for name in flags:
                flags[name] = False

            # filter the read based on specified quality settings
            if self.filters['chastity'] and not read.is_chaste():
                self.filter_stats['chastity'] += 1
                flags['chastity'] = True
            if self.filters['min quality'] > 0 and \
                    read.min_quality() < self.filters['min quality']:
                self.filter_stats['min quality'] += 1
                flags['min quality'] = True
            if self.filters['avg quality'] > 0 and \
                    read.mean_quality() < self.filters['avg quality']:
                self.filter_stats['avg quality'] += 1
                flags['avg quality'] = True

            # only reads that passed quality filtering are counted
            if not any(flags.values()):
                if self.count_variant(read.sequence) is None:
                    # read has too many mutations
                    self.filter_stats['max mutations'] += 1
                    flags['max mutations'] = True

            if any(flags.values()):
                self.filter_stats['total'] += 1
                if self.verbose:
                    self.report_filtered_read(self.log, read, flags)

        self.counts['variants'] = pd.DataFrame.from_dict(
            self.counts['variants'], orient="index", dtype="int32")
        if len(self.counts['variants']) == 0:
            raise EnrichError("Failed to count variants", self.name)
        self.counts['variants'].columns = ['count']
Esempio n. 9
0
def trim_fastq(outdir, files, start, end, length, compression):
    """
    Trim every read of each FASTQ file and write the result into *outdir*.

    The output name is built from the base name and extension returned by
    ``split_fastq_path`` with ``.trim`` inserted between them, and the file
    is opened with ``create_compressed_outfile``. When *length* is given,
    reads are trimmed with ``trim_length(length, start)``; otherwise with
    ``trim(start, end)``. A missing *start* is always replaced by ``1``.

    Args:
        outdir: directory the trimmed files are written to.
        files: paths of the FASTQ files to trim.
        start: start position of the region to keep, or ``None``.
        end: end position of the region to keep, or ``None``.
        length: length of the region to keep, or ``None``.
        compression: passed unchanged to ``create_compressed_outfile``.

    Returns:
        ``None``. If no files are given, or *start*/*end*/*length* do not
        agree, an error is printed to stderr and nothing is written.
    """
    if not files:
        print("Error: no files provided", file=stderr)
        return

    length_mode = length is not None
    if length_mode and end is not None:
        if start is not None:
            if (end - start + 1) != length:
                print("Error: start/end/length do not agree", file=stderr)
                return
        elif length != end: # start is None, so length should equal end
            print("Error: length/end do not agree", file=stderr)
            return

    # Default the start position in every mode. The length + end case used
    # to leave start as None and pass it on to trim_length().
    if start is None:
        start = 1

    for f in files:
        # open the output file
        _, base, ext, _ = split_fastq_path(f)
        outname = os.path.join(outdir, base + ".trim" + ext)
        handle = create_compressed_outfile(outname, compression)

        # trim the reads and write them; always close the handle, even if
        # reading or trimming fails part-way through
        try:
            for read in read_fastq(f):
                if length_mode:
                    read.trim_length(length, start)
                else:
                    read.trim(start, end)
                print(read, file=handle)
        finally:
            handle.close()
Esempio n. 10
0
    def calculate(self):
        """
        Reads the forward or reverse FASTQ file (reverse reads are
        reverse-complemented), performs quality-based filtering, and counts
        the barcodes.

        The counts are stored in ``self.df_dict['barcodes']`` as a data frame
        with one ``count`` column, sorted in descending order. If
        ``'barcodes_low_abundance'`` is already a key of ``self.df_dict``,
        barcodes counted fewer than ``self.min_count`` times are moved there
        and written out with ``dump_data``.

        Raises:
            EnrichError: if no barcodes were counted.
        """
        self.df_dict['barcodes'] = dict()

        # one flag per filter, handed to report_filtered_read()
        filter_flags = dict()
        for key in self.filters:
            filter_flags[key] = False

        # count all the barcodes
        logging.info("Counting barcodes [{name}]".format(name=self.name))
        for fq in read_fastq(self.reads):
            fq.trim_length(self.bc_length, start=self.bc_start)
            if self.revcomp_reads:
                fq.revcomp()

            # reset the flags left over from the previous read
            for key in filter_flags:
                filter_flags[key] = False

            # filter the barcode based on specified quality settings
            if self.filters['chastity']:
                if not fq.is_chaste():
                    self.filter_stats['chastity'] += 1
                    filter_flags['chastity'] = True
            if self.filters['min quality'] > 0:
                if fq.min_quality() < self.filters['min quality']:
                    self.filter_stats['min quality'] += 1
                    filter_flags['min quality'] = True
            if self.filters['avg quality'] > 0:
                if fq.mean_quality() < self.filters['avg quality']:
                    self.filter_stats['avg quality'] += 1
                    filter_flags['avg quality'] = True
            if any(filter_flags.values()):  # failed quality filtering
                self.filter_stats['total'] += 1
                if self.report_filtered:
                    self.report_filtered_read(fq, filter_flags)
            else:  # passed quality filtering
                try:
                    self.df_dict['barcodes'][fq.sequence.upper()] += 1
                except KeyError:
                    self.df_dict['barcodes'][fq.sequence.upper()] = 1

        self.df_dict['barcodes'] = \
                pd.DataFrame.from_dict(self.df_dict['barcodes'],
                                       orient="index", dtype="int32")
        if len(self.df_dict['barcodes']) == 0:
            raise EnrichError("Failed to count barcodes", self.name)
        self.df_dict['barcodes'].columns = ['count']
        # DataFrame.sort() was removed from pandas; sort_values() is the
        # replacement for sorting by column
        self.df_dict['barcodes'].sort_values('count', ascending=False,
                                             inplace=True)
        if 'barcodes_low_abundance' in self.df_dict:  # min count is set
            self.df_dict['barcodes_low_abundance'] = self.df_dict['barcodes'][
                self.df_dict['barcodes']['count'] < self.min_count]
            logging.info(
                "Writing counts for {n} unique low-abundance barcodes to disk [{name}]"
                .format(n=len(self.df_dict['barcodes_low_abundance']),
                        name=self.name))
            self.dump_data(keys=['barcodes_low_abundance'])
            self.df_dict['barcodes'] = self.df_dict['barcodes'][
                self.df_dict['barcodes']['count'] >= self.min_count]

        logging.info(
            "Retained counts for {n} barcodes ({u} unique) [{name}]".format(
                n=self.df_dict['barcodes']['count'].sum(),
                u=len(self.df_dict['barcodes'].index),
                name=self.name))
        if not self.barcodevariant:
            self.report_filter_stats()
Esempio n. 11
0
    def calculate(self):
        """
        Reads the forward or reverse FASTQ file (reverse reads are
        reverse-complemented), performs quality-based filtering, and counts
        the barcodes.

        On return ``self.df_dict['barcodes']`` is a data frame with a single
        ``count`` column, ordered from most to least abundant. When
        ``self.df_dict`` already has a ``'barcodes_low_abundance'`` entry,
        rows with a count below ``self.min_count`` are split off into that
        entry and written out via ``dump_data``.

        Raises:
            EnrichError: if no barcodes were counted.
        """
        self.df_dict['barcodes'] = dict()

        filter_flags = dict.fromkeys(self.filters, False)

        # count all the barcodes
        logging.info("Counting barcodes [{name}]".format(name=self.name))
        for fq in read_fastq(self.reads):
            fq.trim_length(self.bc_length, start=self.bc_start)
            if self.revcomp_reads:
                fq.revcomp()

            # start every read with all flags cleared
            for key in filter_flags:
                filter_flags[key] = False

            # filter the barcode based on specified quality settings
            if self.filters['chastity'] and not fq.is_chaste():
                self.filter_stats['chastity'] += 1
                filter_flags['chastity'] = True
            if self.filters['min quality'] > 0 and \
                    fq.min_quality() < self.filters['min quality']:
                self.filter_stats['min quality'] += 1
                filter_flags['min quality'] = True
            if self.filters['avg quality'] > 0 and \
                    fq.mean_quality() < self.filters['avg quality']:
                self.filter_stats['avg quality'] += 1
                filter_flags['avg quality'] = True

            if any(filter_flags.values()): # failed quality filtering
                self.filter_stats['total'] += 1
                if self.report_filtered:
                    self.report_filtered_read(fq, filter_flags)
            else: # passed quality filtering
                barcode = fq.sequence.upper()
                try:
                    self.df_dict['barcodes'][barcode] += 1
                except KeyError:
                    self.df_dict['barcodes'][barcode] = 1

        self.df_dict['barcodes'] = \
                pd.DataFrame.from_dict(self.df_dict['barcodes'],
                                       orient="index", dtype="int32")
        if len(self.df_dict['barcodes']) == 0:
            raise EnrichError("Failed to count barcodes", self.name)
        self.df_dict['barcodes'].columns = ['count']
        # sort_values() replaces DataFrame.sort(), which current pandas
        # releases no longer provide
        self.df_dict['barcodes'].sort_values('count', ascending=False,
                                             inplace=True)
        if 'barcodes_low_abundance' in self.df_dict: # min count is set
            is_low = self.df_dict['barcodes']['count'] < self.min_count
            self.df_dict['barcodes_low_abundance'] = \
                    self.df_dict['barcodes'][is_low]
            logging.info("Writing counts for {n} unique low-abundance barcodes to disk [{name}]".format(
                    n=len(self.df_dict['barcodes_low_abundance']),
                    name=self.name))
            self.dump_data(keys=['barcodes_low_abundance'])
            self.df_dict['barcodes'] = self.df_dict['barcodes'][
                    self.df_dict['barcodes']['count'] >= self.min_count]

        logging.info("Retained counts for {n} barcodes ({u} unique) [{name}]".format(
                n=self.df_dict['barcodes']['count'].sum(),
                u=len(self.df_dict['barcodes'].index),
                name=self.name))
        if not self.barcodevariant:
            self.report_filter_stats()