Beispiel #1
0
def readfx(fastx):
    if not file_exists(fastx):
        logger.critical("File Not Found: %s" % fastx)
        raise IOError(2, "No such file:", fastx)

    fx = ""
    try:
        fx = FastxFile(fastx)
        for f in fx:
            yield f.name, f.sequence, f.quality
    finally:
        if fx:
            fx.close()
Beispiel #2
0
def readfx(fastx):
    """FASTX file reader.

    Args:
        fastx (str): file path to fasta or fastq file; supports gzip compressed files

    Yields:
        tuple: The tuple consists of the read name, the sequence, and quality scores if the file
            is a fastq.

    Raises:
        IOError: If `fastx` file does not exist.

    """
    if not os.path.exists(fastx):
        raise IOError(2, "No such file:", fastx)
    fx = ""
    try:
        fx = FastxFile(fastx)
        for f in fx:
            yield f.name, f.sequence, f.quality
    finally:
        if fx:
            fx.close()
Beispiel #3
0
class FastA(object):
    """Class to handle FastA files. Cannot be compressed


    """
    def __init__(self, filename, verbose=False):
        if filename.endswith(".gz"):  #pragma: no cover
            raise ValueError("Must be decompressed.")
        self._fasta = FastxFile(filename)
        self.filename = filename
        self._N = None

    def __iter__(self):
        return self

    def __next__(self):  # python 3
        return self.next()

    def next(self):  # python 2
        # reads 4 lines
        try:
            d = next(self._fasta)
            return d
        except KeyboardInterrupt:  #pragma: no cover
            # This should allow developers to break a loop that takes too long
            # through the reads to run forever
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
        except:
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
            raise StopIteration

    def __len__(self):
        if self._N is None:
            logger.info("Reading input fasta file...please wait")
            self._N = len([x for x in FastxFile(self.filename)])
        return self._N

    def _get_names(self):
        return [this.name for this in self]

    names = property(_get_names)

    def _get_sequences(self):
        return [this.sequence for this in self]

    sequences = property(_get_sequences)

    def _get_comment(self):
        return [this.comment for this in self]

    comments = property(_get_comment)

    def _get_lengths(self):
        return [len(this.sequence) for this in self]

    lengths = property(_get_lengths)

    def get_lengths_as_dict(self):
        return dict(zip(self.names, self.lengths))

    def format_contigs_denovo(self, output_file, len_min=500):
        """Replace NODE with the project name and remove contigs with a length 
        lower than len_min.

        :param str output_file: output file name.
        :param int len_min: minimal length of contigs.

        Example:

            from sequana import FastA

            contigs = FastA("denovo_assembly.fasta")
            contigs.format_contigs_denovo("path/to/file.fasta", len_min=500)

        Results are stored in "path/to/file.fasta".
        """
        # catch basename of file without extension
        project = os.path.basename(output_file).split(".")[0]
        # check if directory exist
        output_dir = os.path.dirname(output_file)
        try:
            if not os.path.exists(output_dir):  #pragma: no cover
                os.makedirs(output_dir)
        except FileNotFoundError:  #pragma: no cover
            pass

        n = 1
        with open(output_file, "w") as fp:
            for contigs in self:
                if len(contigs.sequence) < len_min:
                    break
                name = ">{}_{} {}\n".format(project, n, contigs.name)
                sequence = "\n".join([
                    contigs.sequence[i:min(i + 80, len(contigs.sequence))]
                    for i in range(0, len(contigs.sequence), 80)
                ]) + "\n"
                fp.write(name + sequence)
                n += 1

    def filter(self,
               output_filename,
               names_to_keep=None,
               names_to_exclude=None):
        if names_to_exclude is None and names_to_keep is None:  #pragma: no cover
            logger.warning("No ids provided")
            return

        if names_to_exclude:
            with open(self.filename) as fin:
                with open(output_filename, "w") as fout:
                    skip = False
                    # do no use readlines. may be slower but may cause memory
                    # issue
                    for line in fin:
                        if line.startswith(">"):
                            if line[1:].split()[0] in names_to_exclude:
                                skip = True
                            else:
                                skip = False
                        if skip is False:
                            fout.write(line)
        elif names_to_keep:
            with open(self.filename) as fin:
                with open(output_filename, "w") as fout:
                    # do no use readlines. may be slower but may cause memory
                    # issue
                    skip = True
                    for line in fin:
                        if line.startswith(">"):
                            if line[1:].split()[0] in names_to_keep:
                                skip = False
                            else:
                                skip = True
                        if skip is False:
                            fout.write(line)

    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select
            should provide a number but a list can be used as well.
        :param str output_filename:
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        fasta = FastxFile(self.filename)
        pb = Progress(thisN)  # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(read.__str__() + "\n")
                else:
                    pass
                pb.animate(i + 1)
        return cherries

    def get_stats(self):
        from pylab import mean
        stats = {}
        stats["N"] = len(self.sequences)
        stats["mean_length"] = mean(self.lengths)
        stats["total_length"] = sum(self.lengths)
        from sequana.stats import N50, L50
        stats["N50"] = N50(self.lengths)
        stats["L50"] = L50(self.lengths)
        stats["min_length"] = min(self.lengths)
        stats["max_length"] = max(self.lengths)
        return stats

    def summary(self, max_contigs=-1):
        from pylab import mean, argmax
        # used by sequana summary fasta
        summary = {"number_of_contigs": len(self.sequences)}
        summary["total_contigs_length"] = sum(self.lengths)
        summary["mean_contig_length"] = mean(self.lengths)
        summary["max_contig_length"] = max(self.lengths)
        summary["min_contig_length"] = min(self.lengths)
        N = 0
        lengths = self.lengths[:]
        positions = list(range(len(lengths)))
        stats = self.get_stats()
        print("#sample_name: {}".format(self.filename))
        print("#total length: {}".format(stats['total_length']))
        print("#N50: {}".format(stats['N50']))
        print("#Ncontig: {}".format(stats['N']))
        print("#L50: {}".format(stats['L50']))
        print("#max_contig_length: {}".format(stats['max_length']))
        print("#min_contig_length: {}".format(stats['min_length']))
        print("#mean_contig_length: {}".format(stats['mean_length']))

        print("contig name,length,count A,C,G,T,N")
        if max_contigs == -1:
            max_contigs = len(lengths) + 1
        while lengths and N < max_contigs:
            N += 1
            index = argmax(lengths)
            length = lengths.pop(index)
            position = positions.pop(index)
            sequence = self.sequences[position]
            name = self.names[position]
            print("{},{},{},{},{},{},{}".format(name, length,
                                                sequence.count('A'),
                                                sequence.count('C'),
                                                sequence.count('G'),
                                                sequence.count('T'),
                                                sequence.count('N')))

    def GC_content_sequence(self, sequence):
        GC = sequence.count('G') + sequence.count('g')
        GC += sequence.count('C') + sequence.count('c')
        return GC / len(sequence) * 100

    def GC_content(self):
        lengths = sum(self.lengths)
        GC = 0
        for seq in self.sequences:
            GC += seq.count('G') + seq.count('g')
            GC += seq.count('C') + seq.count('c')
        return GC / lengths * 100

    def reverse_and_save(self, filename):
        with open(filename, "w") as fout:
            for read in self:
                fout.write(">{}\t{}\n{}\n".format(read.name, read.comment,
                                                  read.sequence[::-1]))

    def save_ctg_to_fasta(self, ctgname, outname, max_length=-1):
        index = self.names.index(ctgname)
        with open("{}.fa".format(outname), "w") as fout:

            if max_length == -1:
                fout.write(">{}\n{}".format(outname, self.sequences[index]))
            else:
                fout.write(">{}\n{}".format(
                    outname, self.sequences[index][0:max_length]))

    def to_fasta(self, outfile, width=80):
        """Save the input FastA file into a new file

        The interest of this method is to wrap the sequence into 80 characters.
        This is useful if the input file is not formatted correctly.

        """
        with open(outfile, "w") as fout:
            for name, comment, seq in zip(self.names, self.comments,
                                          self.sequences):
                import textwrap
                seq = "\n".join(textwrap.wrap(seq, width))
                if comment is None:
                    fout.write(">{}\n{}\n".format(name, seq))
                else:
                    fout.write(">{}\t{}\n{}\n".format(name, comment, seq))

    def to_igv_chrom_size(self, output):
        data = self.get_lengths_as_dict()
        with open(output, "w") as fout:
            for k, v in data.items():
                fout.write("{}\t{}\n".format(k, v))
Beispiel #4
0
class FastA(object):
    """Class to handle FastA files. Cannot be compressed


    """
    def __init__(self, filename, verbose=False):
        if filename.endswith(".gz"):
            raise ValueError("Must be decompressed.")
        self._fasta = FastxFile(filename)
        self.filename = filename
        logger.info("Reading input fasta file...please wait") 
        self._N = len([x for x in FastxFile(filename)])

    def __iter__(self):
        return self

    def __next__(self): # python 3
        return self.next()

    def next(self): # python 2
        # reads 4 lines
        try:
            d = next(self._fasta)
            return d
        except KeyboardInterrupt:
            # This should allow developers to break a loop that takes too long
            # through the reads to run forever
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
        except:
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
            raise StopIteration
        return d

    def __len__(self):
        return self._N

    def _get_names(self):
        return [this.name for this in self]
    names = property(_get_names)

    def _get_sequences(self):
        return [this.sequence for this in self]
    sequences = property(_get_sequences)

    def _get_comment(self):
        return [this.comment for this in self]
    comments = property(_get_comment)

    def _get_lengths(self):
        return [len(this.sequence) for this in self]
    lengths = property(_get_lengths)

    def get_lengths_as_dict(self):
        return dict(zip(self.names, self.lengths))

    def format_contigs_denovo(self, output_file, len_min=500):
        """Replace NODE with the project name and remove contigs with a length 
        lower than len_min.

        :param str output_file: output file name.
        :param int len_min: minimal length of contigs.

        Example:

            from sequana import FastA

            contigs = FastA("denovo_assembly.fasta")
            contigs.format_contigs_denovo("path/to/file.fasta", len_min=500)

        Results are stored in "path/to/file.fasta".
        """
        # catch basename of file without extension
        project = os.path.basename(output_file).split(".")[0]
        # check if directory exist
        output_dir = os.path.dirname(output_file)
        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except FileNotFoundError:
            pass

        n = 1
        with open(output_file, "w") as fp:
            for contigs in self:
                if len(contigs.sequence) < len_min:
                    break
                name = ">{}_{} {}\n".format(project, n, contigs.name)
                sequence = "\n".join([contigs.sequence[i:min(i+80, 
                    len(contigs.sequence))] for i in range(0, 
                    len(contigs.sequence), 80)]) + "\n"
                fp.write(name + sequence)
                n += 1

    def select_random_reads(self, N=None, output_filename="random.fasta"):
        """Select random reads and save in a file

        :param int N: number of random unique reads to select
            should provide a number but a list can be used as well.
        :param str output_filename:
        """
        import numpy as np
        thisN = len(self)
        if isinstance(N, int):
            if N > thisN:
                N = thisN
            # create random set of reads to pick up
            cherries = list(range(thisN))
            np.random.shuffle(cherries)
            # cast to set for efficient iteration
            cherries = set(cherries[0:N])
        elif isinstance(N, set):
            cherries = N
        elif isinstance(N, list):
            cherries = set(N)
        fasta = FastxFile(self.filename)
        pb = Progress(thisN) # since we scan the entire file
        with open(output_filename, "w") as fh:
            for i, read in enumerate(fasta):
                if i in cherries:
                    fh.write(read.__str__() + "\n")
                else:
                    pass
                pb.animate(i+1)
        return cherries

    def get_stats(self):
        stats = {}
        stats["N"] = 2
        stats["mean_length"] = mean(self.lengths)
        return stats
Beispiel #5
0
class FastA(object):
    """Class to handle FastA files. Cannot be compressed


    """
    def __init__(self, filename, verbose=False):
        if filename.endswith(".gz"):
            raise ValueError("Must be decompressed.")
        self._fasta = FastxFile(filename)
        self._N = len([x for x in FastxFile(filename)])

    def __iter__(self):
        return self

    def __next__(self):  # python 3
        return self.next()

    def next(self):  # python 2
        # reads 4 lines
        try:
            d = next(self._fasta)
            return d
        except KeyboardInterrupt:
            # This should allow developers to break a loop that takes too long
            # through the reads to run forever
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
        except:
            self._fasta.close()
            self._fasta = FastxFile(self._fasta.filename)
            raise StopIteration
        return d

    def __len__(self):
        return self._N

    def _get_names(self):
        return [this.name for this in self]

    names = property(_get_names)

    def _get_sequences(self):
        return [this.sequence for this in self]

    sequences = property(_get_sequences)

    def _get_comment(self):
        return [this.comment for this in self]

    comments = property(_get_comment)

    def _get_lengths(self):
        return [len(this.sequence) for this in self]

    lengths = property(_get_lengths)

    def format_contigs_denovo(self, output_file, len_min=500):
        """Replace NODE with the project name and remove contigs with a length 
        lower than len_min.

        :param str output_file: output file name.
        :param int len_min: minimal length of contigs.

        Example:

            from sequana import FastA

            contigs = FastA("denovo_assembly.fasta")
            contigs.format_contigs_denovo("path/to/file.fasta", len_min=500)

        Results are stored in "path/to/file.fasta".
        """
        # catch basename of file without extension
        project = os.path.basename(output_file).split(".")[0]
        # check if directory exist
        output_dir = os.path.dirname(output_file)
        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except FileNotFoundError:
            pass

        n = 1
        with open(output_file, "w") as fp:
            for contigs in self:
                if len(contigs.sequence) < len_min:
                    break
                name = ">{}_{} {}\n".format(project, n, contigs.name)
                sequence = "\n".join([
                    contigs.sequence[i:min(i + 80, len(contigs.sequence))]
                    for i in range(0, len(contigs.sequence), 80)
                ]) + "\n"
                fp.write(name + sequence)
                n += 1