def __init__(self, seqName, seqData, start_coord=None, end_coord=None,
             strand="+", remaining=0, meta_data=None,
             useMutableString=False):
    """
    Constructor for Sequence objects.

    :param seqName:          name stored on the object as ``name``.
    :param seqData:          the sequence data; wrapped in a MutableString
                             when ``useMutableString`` is True, otherwise
                             stored as given.
    :param start_coord:      stored as ``_start_coord`` (may be None).
    :param end_coord:        stored as ``_end_coord`` (may be None).
    :param strand:           must be "+" or "-".
    :param remaining:        stored as ``remaining``.
    :param meta_data:        dictionary of meta-data; a fresh empty dict is
                             used when None is given, so instances never
                             share a default.
    :param useMutableString: whether to wrap the data in a MutableString.
    :raise ValueError: if strand is anything other than "+" or "-".
    """
    if strand not in ("+", "-"):
        # str() so that a non-string strand (e.g. None) still produces the
        # intended ValueError rather than a TypeError from concatenation.
        raise ValueError("Sequence strand must be either + or -, found " +
                         str(strand) + " instead")

    self.name = seqName
    if useMutableString:
        self.sequenceData = MutableString(seqData)
    else:
        self.sequenceData = seqData
    self.mutableString = useMutableString
    self.remaining = remaining
    self._start_coord = start_coord
    self._end_coord = end_coord
    self.strand = strand
    self._ungapped_len = None   # we compute this just-in-time..
    self._effective_len = None  # .. and this
    self.meta_data = meta_data if meta_data is not None else {}
class Sequence(object):
    """
    This is the base class for all sequences in Pyokit. Objects from this
    class will have only a sequence name and actual nucleotide sequence data.

    :param seqName:          A name describing the sequence. Can be any
                             string.
    :param seqData:          The nucleotide sequence data. Can be DNA or RNA.
                             Note that there is no check to make sure the
                             sequence data is valid, that's the
                             responsibility of the caller.
    :param start_coord:      Coordinate of the first nucleotide. If not
                             given, the ``start`` property reports 1.
    :param end_coord:        Coordinate one past the last nucleotide (the end
                             is exclusive). If not given, the ``end``
                             property reports the ungapped length plus one.
    :param strand:           By default, this is +, but can also be set to -
                             to indicate that this sequence is a reverse
                             complement.
    :param remaining:        the amount of sequence that comes after this; 0
                             if this is the whole sequence. Alternatively,
                             you might think of this as the negative strand
                             coordinates of the end of this sequence.
    :param meta_data:        dictionary containing meta-data key-value pairs
    :param useMutableString: Store the sequence data as a mutable string,
                             rather than a regular python string. This should
                             make editing operations much faster, but it
                             comes at the expense of less flexibility (e.g.
                             the object can not be used as a hash key because
                             it is mutable.)
    """

    def __init__(self, seqName, seqData, start_coord=None, end_coord=None,
                 strand="+", remaining=0, meta_data=None,
                 useMutableString=False):
        """
        Constructor for Sequence objects. See class level documentation for
        parameter descriptions.

        :raise ValueError: if strand is anything other than "+" or "-".
        """
        if strand not in ("+", "-"):
            # str() so a non-string strand still yields the ValueError
            # instead of a TypeError from string concatenation.
            raise ValueError("Sequence strand must be either + or -, found " +
                             str(strand) + " instead")

        self.name = seqName
        if useMutableString:
            self.sequenceData = MutableString(seqData)
        else:
            self.sequenceData = seqData
        self.mutableString = useMutableString
        self.remaining = remaining
        self._start_coord = start_coord
        self._end_coord = end_coord
        self.strand = strand
        self._ungapped_len = None   # we compute this just-in-time..
        self._effective_len = None  # .. and this
        self.meta_data = meta_data if meta_data is not None else {}

    def copy(self):
        """
        Copy constructor for Sequence objects.

        :return: a new Sequence with the same name, data, coordinates,
                 strand, remaining count and mutability. The meta-data
                 dictionary is shallow-copied so that editing the copy's
                 meta-data does not alter this object's.
        """
        return Sequence(self.name, self.sequenceData, self.start, self.end,
                        self.strand, self.remaining, dict(self.meta_data),
                        self.mutableString)

    @property
    def start(self):
        """
        :return: The coordinate of the first nucleotide in this sequence; by
                 convention, we call this coordinate 1 if no other value was
                 provided.
        """
        if self._start_coord is None:
            return 1
        return self._start_coord

    @property
    def end(self):
        """
        :return: The coordinate of the end of this sequence; as with all
                 other indexing of sequences in pyokit, sequences are not
                 inclusive of their last index. Computed just-in-time from
                 the ungapped sequence length if it wasn't provided at
                 construction time.
        """
        if self._end_coord is None:
            return self.ungapped_len + 1
        return self._end_coord

    @property
    def ungapped_len(self):
        """
        Get the number of non-gap characters in the sequence data. The value
        is computed on first access and cached.

        :raise SequenceError: if an end coordinate was given at construction
                              time and the ungapped length does not equal
                              end - start.
        """
        if self._ungapped_len is None:
            count = sum(1 for nuc in self.sequenceData if nuc != GAP_CHAR)
            # take this opportunity to check that coords match the ungapped
            # sequence length; validate before caching so an inconsistent
            # value is never remembered.
            if self._end_coord is not None and count != self.end - self.start:
                msg = ("ungapped length (" + str(count) + ") of sequence " +
                       "doesn't match start (" + str(self.start) +
                       ") and end (" + str(self.end) + ") coordinates")
                raise SequenceError(msg)
            self._ungapped_len = count
        return self._ungapped_len

    @property
    def effective_len(self):
        """
        Get the length of the sequence if N's are disregarded. Computed on
        first access and cached; the in-place masking methods of this class
        reset the cache.
        """
        if self._effective_len is None:
            self._effective_len = sum(1 for nuc in self.sequenceData
                                      if nuc != "N" and nuc != "n")
        return self._effective_len

    def __len__(self):
        """
        Get the length of the sequence, defined as the length of its
        sequence data (gaps included).
        """
        return len(self.sequenceData)

    def __getitem__(self, i):
        """Index (or slice) directly into the underlying sequence data."""
        return self.sequenceData[i]

    def subsequence(self, start, end):
        """
        Extract a subsequence from this sequence object using absolute
        coordinates that exist in the same coordinate space as the sequence
        itself. For example:

          46 --> A--CTGC-TAGC-GATCGACT <-- 62
          subsequence(47,52) == CTGC-T

        Gap characters do not consume coordinates. The result begins at the
        nucleotide with coordinate ``start`` and stops immediately after the
        nucleotide with coordinate ``end - 1``; gaps between those two
        nucleotides are kept.

        :param start: the index marking the start (inclusive) of the
                      subsequence. This is a one-based index, and is in the
                      same coordinate space as this sequence object.
        :param end:   the index marking the end (exclusive) of the
                      subsequence. This is a one-based index, and is in the
                      same coordinate space as this sequence object.
        :return: a new sequence object that represents the subsequence of
                 this from position start (inclusive) to end (exclusive).
                 Ungapped length of this will always be equal to end - start
        :raise SequenceError: if the coordinates given fall outside of the
                              start and end indices of this sequence object,
                              or this object's coordinates describe more
                              nucleotides than its data contains.
        """
        if (start < self.start or start >= self.end or end <= self.start or
                end > self.end or start >= end):
            raise SequenceError("invalid subsequence coordinates (" +
                                str(start) + " ," + str(end) +
                                ") for sequence " + str(self))

        # Translate nucleotide coordinates into string indices by walking
        # the data and counting only the non-gap characters.
        first_rank = start - self.start      # 0-based rank of first kept nuc
        last_rank = end - 1 - self.start     # 0-based rank of last kept nuc
        str_start = None
        str_end = None
        rank = 0
        for idx, nuc in enumerate(self.sequenceData):
            if nuc == GAP_CHAR:
                continue
            if rank == first_rank:
                str_start = idx
            if rank == last_rank:
                str_end = idx + 1
                break
            rank += 1

        if str_start is None or str_end is None:
            raise SequenceError("coordinates of sequence " + self.name +
                                " cover more nucleotides than its data holds")

        seq = self.sequenceData[str_start:str_end]
        return Sequence(self.name, seq, start, end, self.strand,
                        self.remaining + self.end - end,
                        dict(self.meta_data), self.mutableString)

    def relative_subsequence(self, start, end):
        """
        Extract a subsequence from this sequence using coordinates that are
        relative to the start (relative position 1) and end coordinates of
        the sequence. For example:

          46 --> A--CTGC-TAGC-GATCGACT <-- 62
          subsequence(2,7) == CTGC-T

        :param start: the index marking the start (inclusive) of the
                      subsequence. This is a one-based index, and is in the
                      coordinate space of the sequence (i.e. from 1 to N,
                      where N is the number of non-gap nucleotides in the
                      sequence)
        :param end:   the index marking the end (exclusive) of the
                      subsequence. This is a one-based index in the same
                      coordinate space, so the largest valid value is N + 1.
        :return: a new sequence object that represents the subsequence
        :raise SequenceError: if the start coordinate is less than 1, the end
                              coordinate is greater than the ungapped length
                              of this sequence plus one, or start >= end.
        """
        if start < 1:
            raise SequenceError("invalid start coordinate for subsequence: " +
                                str(start))
        if end > self.ungapped_len + 1:
            raise SequenceError("invalid end coordinate for subsequence: " +
                                str(end) + " greater than number of " +
                                "non-gap nucleotides in sequence (" +
                                str(self.ungapped_len) + ")")
        # relative position 1 is the nucleotide at coordinate self.start
        offset = self.start - 1
        return self.subsequence(start + offset, end + offset)

    def gapped_relative_subsequence(self, start, end):
        """
        Extract a subsequence from this sequence using coordinates that are
        relative to the start of the sequence (relative position 1) and the
        number of nucleotides in the sequence, including gaps. For example:

          46 --> A--CTGC-TAGC-GATCGACT <-- 62
          subsequence(2,7) == --CTG

        :param start: one-based, inclusive position in the gapped data.
        :param end:   one-based, exclusive position in the gapped data.
        :return: a new sequence object whose start, end and remaining values
                 are adjusted by the number of non-gap characters before,
                 inside and after the extracted region.
        :raise InvalidSequenceCoordinatesError: if start is less than 1 or
                 end is more than one past the length of the gapped data.
        """
        if start < 1:
            msg = "invalid start coordinate for subsequence: " + str(start)
            raise InvalidSequenceCoordinatesError(msg)
        if end > len(self.sequenceData) + 1:
            msg = "invalid end coordinate for subsequence: " + str(end) +\
                  " greater than length of sequence (" + str(len(self)) + ")"
            raise InvalidSequenceCoordinatesError(msg)

        prefix = self.sequenceData[:start - 1]
        non_gaps_before = start - 1 - prefix.count(GAP_CHAR)
        seq = self.sequenceData[start - 1:end - 1]
        non_gaps_in = len(seq) - seq.count(GAP_CHAR)
        non_gaps_after = self.ungapped_len - non_gaps_before - non_gaps_in

        new_start = self.start + non_gaps_before
        new_end = new_start + non_gaps_in
        new_remaining = self.remaining + non_gaps_after
        return Sequence(self.name, seq, new_start, new_end, self.strand,
                        new_remaining, dict(self.meta_data),
                        self.mutableString)

    def is_positive_strand(self):
        """
        :return: True if this sequence's strand is "+", False otherwise.
        """
        return self.strand == "+"

    def percentNuc(self, nuc):
        """
        return the fraction of the sequence which is equal to the passed nuc.

        :param nuc: the nucleotide to compute percentage composition for.
                    There is no check to make sure this is a valid
                    nucleotide.
        :return: the fraction (between 0 and 1) of the sequence that is
                 <nuc>; 0.0 for an empty sequence.
        """
        total = len(self.sequenceData)
        if total == 0:
            return 0.0
        matches = sum(1 for symbol in self.sequenceData if symbol == nuc)
        return matches / float(total)

    def similarity(self, self_start, self_end, other_start, other_end, other):
        """
        Compute the number of matching bases in the subsequences
        self[start, end] and other[o_start, o_end]. Note that the
        subsequences must be the same length. Indices are zero-based
        positions in the sequence data and both end points are inclusive.

        :param self_start:  start index for sub-sequence in self
        :param self_end:    end index for sub-sequence in self
        :param other_start: start index for subsequence in other sequence
        :param other_end:   end index for subsequence in other sequence
        :param other:       other sequence to compare to this.
        """
        assert(self_end - self_start == other_end - other_start)
        count = 0
        for i in range(0, self_end - self_start + 1):
            if (self.sequenceData[self_start + i] ==
                    other.sequenceData[other_start + i]):
                count += 1
        return count

    def reverseComplement(self, isRNA=None):
        """
        Reverse complement this sequence in-place.

        :param isRNA: if True, treat this sequence as RNA. If False, treat it
                      as DNA. If None (default), inspect the sequence and
                      make a guess as to whether it is RNA or DNA.
        """
        treat_as_rna = self.isRNA() if isRNA is None else isRNA
        table = RNA_COMPLEMENTS if treat_as_rna else DNA_COMPLEMENTS
        rev_comp = "".join([table[n] for n in self.sequenceData])[::-1]
        # keep the storage type in step with self.mutableString so that
        # later in-place edits (e.g. maskRegion) still work.
        if self.mutableString:
            self.sequenceData = MutableString(rev_comp)
        else:
            self.sequenceData = rev_comp

    def __eq__(self, seq):
        """
        Check whether this sequence is equal to another sequence. Sequences
        are equal if they have the same name, nucleotide sequence, start and
        end coordinates, remaining count and meta data. Note that strand is
        not part of the comparison.

        :param seq: the other sequence to compare against.
        :return: true if this sequence is equal to passed parameter, else
                 false.
        """
        if seq is None:
            return False
        return (self.sequenceData == seq.sequenceData and
                self.name == seq.name and
                self.meta_data == seq.meta_data and
                self.start == seq.start and
                self.end == seq.end and
                self.remaining == seq.remaining)

    def __ne__(self, read):
        """
        Check whether this sequence is not equal to another sequence. This is
        always the exact negation of the equality check.

        :param read: the other sequence to compare against.
        :return: true if this sequence is not equal to passed param., else
                 false.
        """
        return not self.__eq__(read)

    def nsLeft(self, amount):
        """
        Replace leftmost <amount> bases by Ns. The amount is clamped to
        between zero and the length of the sequence, so the sequence never
        changes length.
        """
        amount = max(0, min(amount, len(self.sequenceData)))
        self.sequenceData = (amount * "N") + self.sequenceData[amount:]
        self._effective_len = None  # number of Ns may have changed

    def nsRight(self, amount):
        """
        Replace rightmost <amount> bases by Ns. The amount is clamped to
        between zero and the length of the sequence, so the sequence never
        changes length.
        """
        total = len(self.sequenceData)
        amount = max(0, min(amount, total))
        # slice with an explicit stop; [:-amount] would empty the sequence
        # when amount is zero.
        self.sequenceData = self.sequenceData[:total - amount] + (amount * "N")
        self._effective_len = None  # number of Ns may have changed

    def maskRegion(self, region):
        """
        Replace nucleotides in this sequence in the regions given by Ns

        :param region: any object with .start and .end attributes. Co-ords
                       are zero based and inclusive of both end points. Any
                       other attributes (e.g. chrom.) are ignored.
        :raise SequenceError: if region specifies nucleotides not present in
                              this sequence, or its start is after its end.
        """
        seq_len = len(self)
        if region.start < 0 or region.end < 0 or \
           region.start >= seq_len or region.end >= seq_len:
            raise SequenceError("cannot mask region " + str(region.start) +
                                " to " + str(region.end) + " in " +
                                self.name + ". " +
                                "Region specifies nucleotides not present " +
                                "in this read. Valid range would have been " +
                                "0 -- " + str(seq_len - 1))
        if region.start > region.end:
            raise SequenceError("cannot mask region " + str(region.start) +
                                " to " + str(region.end) + " in " +
                                self.name + ". Region start is after end")

        if self.mutableString:
            for i in range(region.start, region.end + 1):
                self.sequenceData[i] = 'N'
        else:
            self.sequenceData = "".join(
                [self.sequenceData[:region.start],
                 ("N" * (region.end - region.start + 1)),
                 self.sequenceData[region.end + 1:]])
        self._effective_len = None  # number of Ns may have changed

    def maskRegions(self, regions, verbose=False):
        """
        Mask the given regions in this sequence with Ns.

        :param regions: iterable of regions to mask. Each region can be any
                        object with .start and .end attributes. Co-ords are
                        zero based and inclusive of both end points. Any
                        other attributes (e.g. chrom.) are ignored. Must
                        support len() when verbose is True.
        :param verbose: print status messages to stderr if True
        """
        if verbose:
            pind = ProgressIndicator(totalToDo=len(regions),
                                     messagePrefix="completed",
                                     messageSuffix="of masking regions in " +
                                                   self.name)
        for region in regions:
            self.maskRegion(region)
            if verbose:
                pind.done += 1
                pind.showProgress()

    def isDNA(self):
        """
        Make a guess as to whether this sequence is a DNA sequence or not by
        looking at the symbols it contains.

        :return: True if contains only DNA nucleotides, False otherwise
        """
        return all(nuc in DNA_NUCS for nuc in self.sequenceData)

    def isRNA(self):
        """
        Make a guess as to whether this sequence is an RNA sequence or not by
        looking at the symbols it contains.

        :return: True if contains only RNA nucleotides, False otherwise
        """
        return all(nuc in RNA_NUCS for nuc in self.sequenceData)

    def toRNA(self):
        """
        Convert this sequence in-place to an RNA sequence by changing any Ts
        to Us
        """
        self.sequenceData = self.sequenceData.replace("T", "U")

    def toDNA(self):
        """
        Convert this sequence in-place to a DNA sequence by changing any Us
        to Ts
        """
        self.sequenceData = self.sequenceData.replace("U", "T")

    def split(self, point=None):
        """
        Split this sequence into two halves and return them. The original
        sequence remains unmodified. The two halves carry only a name and
        sequence data.

        :param point: defines the split point, if None then the centre is
                      used
        :return: two Sequence objects -- one for each side
        """
        if point is None:
            # floor division: a float cannot be used as a slice index
            point = len(self) // 2
        r1 = Sequence(self.name + ".1", self.sequenceData[:point])
        r2 = Sequence(self.name + ".2", self.sequenceData[point:])
        return r1, r2

    def truncate(self, newLength):
        """
        Produce a version of this sequence that is only <newLength>
        nucleotides long. This object is left unmodified; the returned
        sequence carries only the name and the truncated data.

        :param newLength: the length to truncate the sequence to.
        :return: a new Sequence holding the first <newLength> characters.
        """
        return Sequence(self.name, self.sequenceData[:newLength])

    def clip_end(self, seq, mm_score):
        """
        Clip a sequence from the end of this sequence -- we assume the
        sequence to be clipped will always begin somewhere in this sequence,
        but may not be fully contained. If found, replaced with Ns.

        :param seq:      sequence to be clipped
        :param mm_score: the number of matching bases needed to consider a
                         hit, mm_score = len(seq) would be 100% match
        """
        lim = mm_score - 1
        other_end = len(seq) - 1
        other_start = 0

        # slide the end of <seq> leftwards along this sequence
        for i in range(len(self.sequenceData) - 1, lim - 1, -1):
            self_end = i
            self_start = i - (len(seq) - 1)
            if self_start < 0:
                # <seq> hangs off the left edge; compare only the overlap
                self_start = 0
                other_start = other_end - self_end

            score = self.similarity(self_start, self_end, other_start,
                                    other_end, seq)
            if score >= mm_score:
                self.nsRight(len(seq) + (len(self) - i) - 1)
                break

    def isPolyA(self):
        """
        Determine whether this sequence is polyA. To be a polyA sequence, it
        must have >= 90% Adenine.

        :return: True if the sequence is PolyA by the above definition.
        """
        return self.percentNuc("A") >= 0.9

    def isPolyT(self):
        """
        Determine whether this sequence is polyT. To be a polyT sequence, it
        must have >= 90% Thymine.

        :return: True if the sequence is PolyT by the above definition.
        """
        return self.percentNuc("T") >= 0.9

    def isLowQuality(self):
        """
        Determine whether this is a low quality sequence. To be considered a
        low quality sequence, it must have >= 10% Ns.

        :return: True if this sequence meets the above definition of
                 low-quality.
        """
        return self.percentNuc("N") >= 0.1

    def maskMatch(self, mask):
        """
        Determine whether this sequence matches the given mask.

        :param mask: string to match against. Ns in the mask are considered
                     to match anything in the sequence -- all other chars
                     must match exactly.
        :return: True if the mask matches at all places, otherwise false.
                 A mask longer than the sequence never matches.
        """
        if len(mask) > len(self.sequenceData):
            return False
        for mask_char, nuc in zip(mask, self.sequenceData):
            if mask_char == "N" or mask_char == "n":
                continue
            if mask_char != nuc:
                return False
        return True

    ##########################################################################
    #                 SEQUENCE CLASS -- STRING FORMATTING                    #
    ##########################################################################

    def __str__(self):
        """
        :return: string representation of this sequence object
        """
        return self.to_fasta_str()

    def meta_data_to_string(self):
        """
        :return: the meta-data rendered as space-separated key=value pairs,
                 or the empty string if there is no meta-data.
        """
        return " ".join(str(k) + "=" + str(self.meta_data[k])
                        for k in self.meta_data)

    def to_fasta_str(self, line_width=50, include_coords=True):
        """
        :param line_width:     maximum number of sequence characters per
                               output line.
        :param include_coords: if True, append the start and end coordinates
                               and the remaining count to the header line.
        :return: string representation of this sequence object in fasta
                 format
        """
        header = ">" + self.name
        if include_coords:
            header += ":" + str(self.start) + "-" + str(self.end)
            header += " (" + str(self.remaining) + ")"
        m_str = self.meta_data_to_string()
        if m_str:
            header += (" " + m_str)

        # str() each chunk so the result is a plain string even when the
        # data is held in a mutable string.
        data_len = len(self.sequenceData)
        chunks = [str(self.sequenceData[i:i + line_width])
                  for i in range(0, data_len, line_width)]
        return header + "\n" + "\n".join(chunks)