def __init__(self, sequence, complement_in=b"ACGT", complement_out=b"TGCA", letters="ACGT"):
    """.. rubric:: Constructor

    A sequence is just a string stored in the :attr:`sequence` attribute. It
    has properties related to the type of alphabet authorised.

    :param str sequence: either the sequence itself, or a filename ending in
        ``.fa`` or ``.fasta``. In the latter case the file is opened with
        :class:`FastA` and only the first sequence is used (upper-cased).
    :param complement_in: characters to be translated; mapped position by
        position onto *complement_out* in the translation table.
    :param complement_out: replacement characters (same length as
        *complement_in*).
    :param letters: authorised letters. Used in :meth:`check` only.

    .. todo:: use counter only once as a property
    """
    if sequence.endswith((".fa", ".fasta")):
        fasta = FastA(sequence)
        sequence = fasta.next().sequence.upper()
    # otherwise the input is assumed to be a valid sequence string already
    self._data = sequence

    try:
        # Python 2 location of maketrans
        self._translate = string.maketrans(complement_in, complement_out)
    except AttributeError:
        # Python 3: string.maketrans was removed, use the bytes version.
        # Only this expected failure is caught so that genuine errors
        # (e.g. arguments of different lengths) are not silently retried.
        self._translate = bytes.maketrans(complement_in, complement_out)
    self._letters = letters
def extract_fasta(self, fastafile, features=['rRNA']):
    """Return the sequences of the requested feature types as FASTA text.

    Each feature read from the genbank records whose ``type`` is listed in
    *features* produces one FASTA entry. Its header is
    ``>{contig}_{type}_{start}_{end}``, followed by a space and the
    ``product`` field when the feature has one. Its sequence is the slice
    ``[gene_start:gene_end]`` of the matching contig in *fastafile*.

    :param str fastafile: FASTA file holding the contig sequences.
    :param features: feature types to extract. The default list is only
        read, never modified.
    :return: a string with all the FASTA entries (empty if nothing matched).
    :raises ValueError: if a requested feature is not among the types
        returned by :meth:`get_types`.
    """
    types = self.get_types()
    for feature in features:
        if feature not in types:
            raise ValueError("{} not found".format(feature))

    # fasta may have several contig/chromosome names
    # the gene bank should be compatible !!
    fasta = FastA(fastafile)

    # most of the times, the version is not in the gbk
    contig_names = [x.split(".")[0] for x in fasta.get_lengths_as_dict()]

    # name -> position of its first occurrence (what list.index would
    # return), computed once instead of once per genbank record
    contig_index = {}
    for index, contig in enumerate(contig_names):
        contig_index.setdefault(contig, index)

    # then we read the features from the genbank
    records = self.genbank_features_parser()

    # FIXME FastA is not very efficient for eukaryotes but is enough for now
    entries = []
    for name, items in records.items():
        if name not in contig_index:
            logger.warning(
                "{} contig from genbank not found in fasta".format(name))
            continue

        sequence = fasta.sequences[contig_index[name]]
        for item in items:
            if item['type'] not in features:
                continue
            start, end = item['gene_start'], item['gene_end']
            header = ">{}_{}_{}_{}".format(name, item['type'], start, end)
            # The product is optional. The previous fallback used a format
            # string with one placeholder too many and raised IndexError.
            if 'product' in item:
                header += " {}".format(item['product'])
            entries.append("{}\n{}\n".format(header, sequence[start:end]))
    return "".join(entries)
def __init__(self, filename):
    """.. rubric:: Constructor

    :param filename: where the records come from. Three kinds of input are
        recognised:

        * a ``str``: taken as the name of a FASTA file, read with
          :class:`FastA`;
        * an :class:`AdapterReader`: its ``_data`` records are re-used;
        * a ``list``: each element is used as a record.

    Every record goes through :meth:`_to_read` and the result is kept in
    ``_data``. :meth:`_sanity_check` is then called whatever the input was.
    """
    if isinstance(filename, str):
        # These are not large files, so all sequences, names and comments
        # are loaded in memory once for all. This also means the data can
        # be changed on the fly.
        self._data = list(map(self._to_read, FastA(filename)))
    elif isinstance(filename, AdapterReader):
        self._data = list(map(self._to_read, filename._data))
    elif isinstance(filename, list):
        self._data = list(map(self._to_read, filename))

    self._sanity_check()
def __init__(self, filename_fasta, merge=False, name=None):
    """.. rubric:: Constructor

    Input must be a fasta file with valid DNA or RNA characters

    :param str filename_fasta: a Fasta file. Unless *name* is given, the
        header is set to the first name found in the file.
    :param bool merge: must be a boolean (``TypeError`` otherwise); stored
        in ``_do_merge``.
    :param str name: if provided, it is used as the header instead of the
        first name of the Fasta file. If you want to analyse all sequences,
        you need to use a loop by setting the header for each sequence with
        the sequence name found in the sequence header.

    .. note:: known problems. Header with a > character (e.g. in the
        comment) are left strip and only the comments is kept. Another issue
        is for multi-fasta where one sequence is ignored (last or first ?)
    """
    # used to check everything is fine with the header/name
    self._fasta = FastA(filename_fasta)

    # Attributes that are defined here but have no value yet
    unset_attributes = (
        "_threshold",
        "_df_shustring",
        "_header",
        "_length",
        "_longest_shustring",
        "_begin_end_repeat_position",
        "_begin_end_repeat_position_merge",
        "_previous_thr",
        "_list_len_repeats",
        "_contig_names",
    )
    for attribute in unset_attributes:
        setattr(self, attribute, None)
    self._filename_fasta = filename_fasta

    if isinstance(merge, bool) is False:
        raise TypeError("do_merge must be boolean")
    self._do_merge = merge

    # set the header: the user-provided name wins over the first record
    self.header = self._fasta.names[0] if name is None else name
class Sequence(object):
    """Abstract base class for other specialised sequences such as DNA.

    Sequence is the base class for other classes such as :class:`DNA` and
    :class:`RNA`.

    ::

        from sequana import Sequence
        s = Sequence("ACGT")
        s.stats()
        s.get_complement()

    .. note:: You may use a Fasta file as input (see constructor)
    """

    def __init__(self, sequence, complement_in=b"ACGT", complement_out=b"TGCA", letters="ACGT"):
        """.. rubric:: Constructor

        A sequence is just a string stored in the :attr:`sequence` attribute.
        It has properties related to the type of alphabet authorised.

        :param str sequence: either the sequence itself, or a filename ending
            in ``.fa`` or ``.fasta``. In the latter case the file is opened
            with :class:`FastA` (kept in the ``fasta`` attribute) and the
            first sequence is loaded, upper-cased.
        :param complement_in: characters to be complemented.
        :param complement_out: their complements, position by position.
        :param letters: authorised letters. Used in :meth:`check` only.

        .. todo:: use counter only once as a property
        """
        if sequence.endswith((".fa", ".fasta")):
            self.fasta = FastA(sequence)
            sequence = self.fasta.next().sequence.upper()
        # otherwise the input is assumed to be a valid sequence string
        self._data = sequence

        try:
            # Python 2 location of maketrans
            self._translate = string.maketrans(complement_in, complement_out)
        except AttributeError:
            # Python 3: string.maketrans was removed. Only this expected
            # failure is caught so that genuine errors are not hidden.
            self._translate = bytes.maketrans(complement_in, complement_out)
        self._letters = letters

    def __iter__(self):
        return self

    def __next__(self):
        """Load the next sequence of the input Fasta file and return it.

        The ``fasta`` attribute only exists when the instance was built from
        a Fasta file. For a plain string there is nothing to iterate over, so
        the iteration stops instead of failing with ``AttributeError``.
        """
        fasta = getattr(self, "fasta", None)
        if fasta is None:
            raise StopIteration
        self._data = fasta.next().sequence.upper()
        return self._data

    def _get_sequence(self):
        return self._data

    sequence = property(_get_sequence)

    def get_complement(self):
        """Return complement"""
        return self._data.translate(self._translate)

    def get_reverse_complement(self):
        """Return reverse complement"""
        return self.get_complement()[::-1]

    def get_reverse(self):
        """Return reverse sequence"""
        return self._data[::-1]

    def complement(self):
        """Replace the sequence, in place, by :meth:`get_complement`"""
        self._data = self.get_complement()

    def reverse(self):
        """Replace the sequence, in place, by :meth:`get_reverse`"""
        self._data = self.get_reverse()

    def reverse_complement(self):
        """Replace the sequence, in place, by :meth:`get_reverse_complement`"""
        self._data = self.get_reverse_complement()

    def check(self):
        """Check that all letters are valid

        :raises ValueError: on the first letter (in order of appearance)
            that is not part of the authorised letters.
        """
        # dict.fromkeys gives the distinct letters in order of first
        # appearance without counting every occurrence
        for key in dict.fromkeys(self._data):
            if key not in self._letters:
                raise ValueError(
                    "Found unexpected letter in the sequence (%s)" % key)

    def __len__(self):
        return len(self._data)

    def gc_content(self):
        """Return mean GC content

        This is the number of ``G`` and ``C`` letters divided by the
        sequence length. An empty sequence raises ``ZeroDivisionError``.
        """
        data = self._data
        return (data.count('G') + data.count('C')) / len(self.sequence)

    def stats(self):
        """Return basic stats about the number of letters"""
        from collections import Counter
        return Counter(self.sequence)

    def get_occurences(self, pattern, overlap=False):
        """Return position of the input pattern in the sequence

        :param str pattern: the pattern to search for. It is handed to
            :func:`re.finditer`, hence interpreted as a regular expression.
        :param bool overlap: if true, overlapping matches are reported as
            well (the pattern is wrapped in a lookahead).
        :return: list of the start positions of the matches.

        ::

            >>> from sequana import Sequence
            >>> s = Sequence('ACGTTTTACGT')
            >>> s.get_occurences("ACGT")
            [0, 7]

        """
        # Truthiness rather than "is True"/"is False": a value such as 1 or
        # None used to leave the result undefined (UnboundLocalError).
        regex = '(?=%s)' % pattern if overlap else pattern
        return [m.start() for m in re.finditer(regex, self.sequence)]
def summary(**kwargs):
    """Create a HTML report for various type of NGS formats.

    \b
    * bamqc
    * fastq

    This will process all files in the given pattern (in back quotes)
    sequentially and produce one HTML file per input file.

    Other module all work in the same way. For example, for FastQ files::

        sequana summary one_input.fastq
        sequana summary `ls *fastq`

    """
    names = kwargs['name']
    module = kwargs['module']

    if module is None:
        # No module given: guess it from the extension of the first file.
        if not names:
            logger.error("please provide at least one input file")
            sys.exit(1)

        # (suffixes, module) pairs, tried in this order
        known_suffixes = (
            (('fastq.gz', '.fastq'), "fastq"),
            (('.bam',), "bam"),
            (('.gff', 'gff3'), "gff"),
            (('fasta.gz', '.fasta'), "fasta"),
        )
        first = names[0]
        for suffixes, guess in known_suffixes:
            if first.endswith(suffixes):
                module = guess
                break
        else:
            logger.error(
                "please use --module to tell us about the input fimes")
            sys.exit(1)

    if module == "bamqc":
        from sequana.modules_report.bamqc import BAMQCModule

        for name in names:
            print(f"Processing {name}")
            # NOTE(review): every input is given the same "bamqc.html"
            # output name -- confirm that reports do not overwrite each other
            BAMQCModule(name, "bamqc.html")
    elif module == "fasta":
        # there is no module per se. Here we just call FastA.summary()
        from sequana.fasta import FastA

        for name in names:
            FastA(name).summary()
    elif module == "fastq":
        # there is no module per se. Here we just print FastQC.get_stats()
        from sequana import FastQC

        for filename in names:
            ff = FastQC(filename, max_sample=1e6, verbose=False)
            print(ff.get_stats())
    elif module == "bam":
        import pandas as pd
        from sequana import BAM

        for filename in names:
            stats = BAM(filename).get_stats()
            # one-row table: one column per statistic
            print(pd.Series(stats).to_frame().T)
    elif module == "gff":
        from sequana import GFF3

        for filename in names:
            ff = GFF3(filename)
            print("#filename: {}".format(filename))
            print("#Number of entries per genetic type:")
            print(ff.df.value_counts('type').to_string())
            print("#Number of duplicated attribute (if any) per attribute:")
            ff.get_duplicated_attributes_per_type()
    else:
        # an unsupported module used to be silently ignored
        logger.error("unknown module {}".format(module))
        sys.exit(1)