Beispiel #1
0
    def __init__(self,
                 sequence,
                 complement_in=b"ACGT",
                 complement_out=b"TGCA",
                 letters="ACGT"):
        """.. rubric:: Constructor

        A sequence is just a string stored in the :attr:`sequence` attribute. It
        has properties related to the type of alphabet authorised.

        :param str sequence: either the sequence itself, or the name of a
            Fasta file (ending in ``.fa`` or ``.fasta``), in which case only
            the first sequence is used, converted to upper case.
        :param complement_in: bytes listing the letters to complement.
        :param complement_out: bytes giving, position by position, the
            complement of each letter of *complement_in*.
        :param letters: authorised letters. Used in :meth:`check` only.

        .. todo:: use counter only once as a property

        """
        if sequence.endswith((".fa", ".fasta")):
            fasta = FastA(sequence)
            sequence = fasta.next().sequence.upper()
        # otherwise, assume the string is a correct sequence

        self._data = sequence
        try:
            # Python 2 spelling: the string module of Python 3 has no
            # maketrans function any more
            self._translate = string.maketrans(complement_in, complement_out)
        except (AttributeError, NameError):
            # Python 3 (or string module not imported). Any other error,
            # e.g. arguments of different lengths, is no longer hidden
            # behind this fallback.
            self._translate = bytes.maketrans(complement_in, complement_out)
        self._letters = letters
Beispiel #2
0
    def __init__(self, sequence, complement_in=b"ACGT", complement_out=b"TGCA",
                 letters="ACGT"):
        """.. rubric:: Constructor

        A sequence is just a string stored in the :attr:`sequence` attribute. It
        has properties related to the type of alphabet authorised.

        :param str sequence: the sequence itself. If the string ends in
            ``.fa`` or ``.fasta``, it is taken as a Fasta file name instead
            and only the first sequence of that file is used (upper-cased).
        :param complement_in: letters to complement, as bytes.
        :param complement_out: complements of the letters found in
            *complement_in*, in the same order, as bytes.
        :param letters: authorised letters. Used in :meth:`check` only.

        .. todo:: use counter only once as a property

        """
        is_fasta_file = sequence.endswith((".fa", ".fasta"))
        if is_fasta_file:
            fasta = FastA(sequence)
            sequence = fasta.next().sequence.upper()

        # from here on, sequence is assumed to be a correct string sequence
        self._data = sequence
        try:
            # only available in Python 2
            self._translate = string.maketrans(complement_in, complement_out)
        except (AttributeError, NameError):
            # Python 3: string.maketrans does not exist (AttributeError);
            # NameError covers a module where string is not imported.
            # Genuine errors (e.g. mismatched lengths) now propagate.
            self._translate = bytes.maketrans(complement_in, complement_out)
        self._letters = letters
Beispiel #3
0
    def extract_fasta(self, fastafile, features=['rRNA']):
        """Return the sequences of the requested feature types as Fasta text.

        :param str fastafile: Fasta file with the contig/chromosome
            sequences that the genbank features refer to.
        :param list features: feature types to extract (the default list is
            never modified). Each type must be part of :meth:`get_types`.
        :return: a string made of one Fasta entry per selected feature. The
            header is ``>{contig}_{type}_{start}_{end}``, followed by a space
            and the product when the feature has a ``product`` entry.
        :raises ValueError: if one of the *features* is not a known type.

        Contigs found in the genbank but not in the Fasta file are skipped
        with a warning.
        """
        types = self.get_types()
        for feature in features:
            if feature not in types:
                raise ValueError("{} not found".format(feature))

        # fasta may have several contig/chromosome names
        # the gene bank should be compatible !!
        fasta = FastA(fastafile)

        # most of the times, the version is not in the gbk, so it is dropped
        # from the fasta names. Only the first position of a name is kept,
        # which is what list.index() would return.
        positions = {}
        for index, contig in enumerate(fasta.get_lengths_as_dict()):
            positions.setdefault(contig.split(".")[0], index)

        # then we read the features from the genbank
        records = self.genbank_features_parser()

        # FIXME FastA is not very efficient for eukaryotes but is enough for now

        entries = []
        for name in records.keys():
            if name not in positions:
                logger.warning(
                    "{} contig from genbank not found in fasta".format(name))
                continue
            sequence = fasta.sequences[positions[name]]

            for item in records[name]:
                if item['type'] not in features:
                    continue
                start, end = item['gene_start'], item['gene_end']
                header = ">{}_{}_{}_{}".format(name, item['type'], start, end)
                try:
                    # the product is optional; without it the header is
                    # limited to the name/type/positions part
                    header += " {}".format(item['product'])
                except KeyError:
                    pass
                # NOTE(review): gene_start/gene_end are used directly as
                # slice bounds (0-based, end excluded) -- confirm that this
                # matches the coordinates returned by the genbank parser.
                entries.append("{}\n{}\n".format(header, sequence[start:end]))
        return "".join(entries)
Beispiel #4
0
    def __init__(self, filename):
        """.. rubric:: Constructor

        :param filename: the input FASTA file name (a string). An
            :class:`AdapterReader` instance (its ``_data`` is re-used) or a
            list of items are accepted as well. In all cases, each item is
            converted with :meth:`_to_read`.
        :raises TypeError: if *filename* is none of the supported types.
        """
        if isinstance(filename, str):
            # these are not large files so we load in memory all
            # sequences/names and comments once for all. This has also the
            # advantage that data can now be changed on the fly
            records = FastA(filename)
        elif isinstance(filename, AdapterReader):
            records = filename._data
        elif isinstance(filename, list):
            records = filename
        else:
            # Fail here with an explicit message rather than leaving the
            # instance without its _data attribute.
            raise TypeError(
                "filename must be a FASTA file name (str), an AdapterReader "
                "or a list; got {}".format(type(filename).__name__))

        self._data = [self._to_read(this) for this in records]

        self._sanity_check()
Beispiel #5
0
    def __init__(self, filename_fasta, merge=False, name=None):
        """.. rubric:: Constructor

        Input must be a fasta file with valid DNA or RNA characters

        :param str filename_fasta: a Fasta file. Unless *name* is given,
            the name of its first sequence is used as header.
        :param bool merge: kept in the ``_do_merge`` attribute. Must be a
            boolean.
        :param str name: if provided, it is used as header instead of the
            name of the first sequence. To analyse all the sequences of the
            file, loop over the sequence names and set the header for each
            of them.
        :raises TypeError: if *merge* is not a boolean.

        .. note:: known problems. Header with a > character (e.g. in the
            comment) are left strip and only the comments is kept. Another issue
            is for multi-fasta where one sequence is ignored (last or first ?)

        """
        # used to check everything is fine with the header/name
        self._fasta = FastA(filename_fasta)
        self._filename_fasta = filename_fasta

        # All the other attributes start undefined. They must exist before
        # the header is assigned at the end of this constructor.
        self._threshold = self._previous_thr = None
        self._header = self._length = self._contig_names = None
        self._df_shustring = self._longest_shustring = None
        self._list_len_repeats = None
        self._begin_end_repeat_position = None
        self._begin_end_repeat_position_merge = None

        if not isinstance(merge, bool):
            raise TypeError("do_merge must be boolean")
        self._do_merge = merge

        # default header: name of the first sequence found in the Fasta file
        self.header = self._fasta.names[0] if name is None else name
Beispiel #6
0
class Sequence(object):
    """Abstract base class for other specialised sequences such as DNA.


    Sequence is the base class for other classes such as :class:`DNA` and
    :class:`RNA`.

    ::

        from sequana import Sequence
        s = Sequence("ACGT")
        s.stats()
        s.get_complement()

    .. note:: You may use a Fasta file as input (see constructor)


    """
    def __init__(self,
                 sequence,
                 complement_in=b"ACGT",
                 complement_out=b"TGCA",
                 letters="ACGT"):
        """.. rubric:: Constructor

        A sequence is just a string stored in the :attr:`sequence` attribute. It
        has properties related to the type of alphabet authorised.

        :param str sequence: either the sequence itself, or the name of a
            Fasta file (ending in ``.fa`` or ``.fasta``), in which case the
            first sequence is loaded, converted to upper case. The following
            ones can be reached by iterating over the instance.
        :param complement_in: bytes listing the letters to complement.
        :param complement_out: bytes giving, position by position, the
            complement of each letter of *complement_in*.
        :param letters: authorised letters. Used in :meth:`check` only.

        .. todo:: use counter only once as a property

        """
        # Always defined, so that iterating over a sequence built from a
        # plain string stops cleanly instead of raising AttributeError.
        self.fasta = None
        if sequence.endswith((".fa", ".fasta")):
            self.fasta = FastA(sequence)
            sequence = self.fasta.next().sequence.upper()
        # otherwise, assume the string is a correct sequence

        self._data = sequence
        try:
            # Python 2 spelling: the string module of Python 3 has no
            # maketrans function any more
            self._translate = string.maketrans(complement_in, complement_out)
        except (AttributeError, NameError):
            # Python 3 (or string module not imported); genuine errors such
            # as arguments of different lengths are not hidden any more
            self._translate = bytes.maketrans(complement_in, complement_out)
        self._letters = letters

    def __iter__(self):
        """Return the instance itself (see :meth:`__next__`)."""
        return self

    def __next__(self):
        """Load the next sequence of the Fasta file and return it.

        The current sequence is replaced by the new one (upper case). Since
        the constructor already consumed the first sequence of the file,
        the iteration starts with the second one.

        :raises StopIteration: if the instance was not built from a Fasta
            file. With a Fasta file, the end of the iteration is whatever
            the Fasta reader raises once exhausted (presumably
            StopIteration -- to be confirmed against FastA).
        """
        if self.fasta is None:
            raise StopIteration
        self._data = self.fasta.next().sequence.upper()
        return self._data

    def _get_sequence(self):
        return self._data

    sequence = property(_get_sequence, doc="The current sequence (read-only)")

    def get_complement(self):
        """Return complement """
        return self._data.translate(self._translate)

    def get_reverse_complement(self):
        """Return reverse complement """
        return self.get_complement()[::-1]

    def get_reverse(self):
        """Return reverse sequence"""
        return self._data[::-1]

    def complement(self):
        """Replace the sequence, in place, by its complement.

        Unlike :meth:`get_complement`, nothing is returned.
        """
        self._data = self.get_complement()

    def reverse(self):
        """Replace the sequence, in place, by its reverse.

        Unlike :meth:`get_reverse`, nothing is returned.
        """
        self._data = self.get_reverse()

    def reverse_complement(self):
        """Replace the sequence, in place, by its reverse complement.

        Unlike :meth:`get_reverse_complement`, nothing is returned.
        """
        self._data = self.get_reverse_complement()

    def check(self):
        """Check that all letters are valid

        :raises ValueError: on the first letter of the sequence that is not
            part of the authorised letters given to the constructor.
        """
        # a Counter keeps the letters in order of first appearance, so the
        # letter reported is the first unexpected one of the sequence
        for key in Counter(self._data):
            if key not in self._letters:
                raise ValueError(
                    "Found unexpected letter in the sequence (%s)" % key)

    def __len__(self):
        return len(self._data)

    def gc_content(self):
        """Return mean GC content

        This is the number of G and C letters divided by the length of the
        sequence. An empty sequence raises ZeroDivisionError.
        """
        c = Counter(self._data)
        # float() keeps a true division under Python 2 as well
        return (c['G'] + c['C']) / float(len(self._data))

    def stats(self):
        """Return basic stats about the number of letters

        :return: a Counter with the number of occurrences of each letter
        """
        return Counter(self.sequence)

    def get_occurences(self, pattern, overlap=False):
        """Return position of the input pattern in the sequence

        :param str pattern: the pattern to search for. It is handed to
            :func:`re.finditer`, hence interpreted as a regular expression.
        :param overlap: if true, overlapping occurrences are reported as
            well. Any truthy/falsy value is accepted.
        :return: list of start positions (0-based)

        ::

            >>> from sequana import Sequence
            >>> s = Sequence('ACGTTTTACGT')
            >>> s.get_occurences("ACGT")
            [0, 7]

        """
        if overlap:
            # a zero-width lookahead does not consume the match, so that the
            # search resumes one letter further
            pattern = '(?=%s)' % pattern
        return [m.start() for m in re.finditer(pattern, self.sequence)]
Beispiel #7
0
def summary(**kwargs):
    """Create a HTML report for various type of NGS formats.

    \b
    * bamqc
    * fastq

    This will process all files in the given pattern (in back quotes)
    sequentially and produce one HTML file per input file.


    Other modules all work in the same way. For example, for FastQ files::

        sequana summary one_input.fastq
        sequana summary `ls *fastq`


    """
    names = kwargs['name']
    module = kwargs['module']

    if module is None:
        # No module requested: guess it from the extension of the first
        # input file. The first matching entry wins.
        first_name = names[0]
        suffixes_to_module = (
            (('fastq.gz', '.fastq'), "fastq"),
            (('.bam',), "bam"),
            (('.gff', 'gff3'), "gff"),
            (('fasta.gz', '.fasta'), "fasta"),
        )
        for suffixes, guess in suffixes_to_module:
            if first_name.endswith(suffixes):
                module = guess
                break
        else:
            logger.error(
                "please use --module to tell us about the input fimes")
            sys.exit(1)

    # Each module imports what it needs lazily. A module name that is not
    # handled below is silently ignored.
    if module == "bamqc":
        for name in names:
            print(f"Processing {name}")
            from sequana.modules_report.bamqc import BAMQCModule
            # presumably the report is written by the constructor itself;
            # the instance is not used afterwards
            report = BAMQCModule(name, "bamqc.html")
    elif module == "fasta":
        # there is no report module per se: we just call FastA.summary()
        from sequana.fasta import FastA
        for name in names:
            FastA(name).summary()
    elif module == "fastq":
        # there is no report module per se: we just print the FastQC stats
        from sequana.fastq import FastQ
        from sequana import FastQC
        for filename in names:
            fastqc = FastQC(filename, max_sample=1e6, verbose=False)
            print(fastqc.get_stats())
    elif module == "bam":
        import pandas as pd
        from sequana import BAM
        for filename in names:
            bam_stats = BAM(filename).get_stats()
            # one row holding all the stats
            print(pd.Series(bam_stats).to_frame().T)
    elif module == "gff":
        import pandas as pd
        from sequana import GFF3
        for filename in names:
            gff = GFF3(filename)
            print("#filename: {}".format(filename))
            print("#Number of entries per genetic type:")
            print(gff.df.value_counts('type').to_string())
            print("#Number of duplicated attribute (if any) per attribute:")
            gff.get_duplicated_attributes_per_type()