def index(self, sequence, seqset, append=False):
    """Search every sequence of ``seqset`` for ``sequence`` and record hits.

    ``seqset`` is iterated as ``(gene_name, sequence_text)`` pairs; each
    text is lower-cased before it is searched and stored.  Every ``n`` in
    the query is expanded to the character class ``[atcg]`` and the search
    is case-insensitive.  When ``self.reverse_complement`` is true, the
    reverse complement of the query (as returned by ``revcomp``) is
    searched as well.

    Each recorded hit is passed to ``self._match_position.add`` and counted
    in ``self.n_hitsites``; ``self.n_hitseqs`` counts the sequences with at
    least one recorded hit and ``self.n_seqs`` counts every sequence seen.

    :param sequence: query sequence; ``n`` acts as a wildcard.
    :param seqset: iterable of ``(gene_name, sequence_text)`` pairs.
    :param append: when false (default), the match store and the three
        counters are reset before indexing; when true, results accumulate.
    """
    if not append:
        self._match_position = self._MatchPosition()
        self.n_hitseqs = 0
        self.n_hitsites = 0
        self.n_seqs = 0

    forward = sequence.replace('n', '[atcg]')
    if self.reverse_complement:
        rc_sequence = revcomp(sequence)
        regex = '({0})|({1})'.format(
            forward, rc_sequence.replace('n', '[atcg]'))
    else:
        # Only compile the reverse-complement alternative when it is wanted:
        # otherwise its hits would be counted without being recorded, and
        # (finditer being non-overlapping) could consume text that an
        # overlapping forward hit needs.
        rc_sequence = None
        regex = '({0})'.format(forward)
    matcher = re.compile(regex, re.IGNORECASE)

    seqid = 0  # 1-based running id of the sequence within this call
    for gene_name, target in seqset:
        seqid += 1
        target = target.lower()
        self.n_seqs += 1
        has_match = False
        for hit in matcher.finditer(target):
            if hit.group(1):
                matched, is_reverse = sequence, False
            elif self.reverse_complement and hit.group(2):
                matched, is_reverse = rc_sequence, True
            else:
                # Empty match (degenerate empty query): nothing to record,
                # so it must not be counted either.
                continue
            has_match = True
            self.n_hitsites += 1
            self._match_position.add(
                gene_name, seqid, matched, target,
                hit.start(), hit.end(), is_reverse)
        if has_match:
            self.n_hitseqs += 1
def extract_match_info(self, fasta_handle):
    """Collect the distinct matches of all patterns in ``self._strands``.

    Every pattern first builds its match table from
    ``parse_fasta(fasta_handle)``.  The tables are then walked and one
    ``(gene_name, match_strand, match_start, match_sequence)`` tuple is
    emitted per distinct ``(seqid, match_strand, position)`` combination.
    ``match_start`` is the first element of the stored position plus one,
    and sequences found on strand ``2`` are passed through ``revcomp``
    before being reported.

    :param fasta_handle: handed unchanged to ``parse_fasta``.
    :returns: list of match tuples in discovery order.
    """
    # NOTE(review): the same handle is parsed once per pattern; if
    # parse_fasta consumes it, later patterns may see no records -- confirm.
    for strand_pattern in self._strands:
        strand_pattern.build_matchtable_pset(
            parse_fasta(fasta_handle), self.reverse_complement)

    seen = set()
    match_info = []
    for strand_pattern in self._strands:
        table = strand_pattern.matchtable_pset
        for seqid, positions in table.pos_matches.iteritems():
            for idx, position in enumerate(positions):
                gene_name = table.gene_name.get(seqid)
                hit = table.match_sequences.get(seqid)[idx]
                match_strand, match_sequence = hit
                match_start = position[0] + 1
                key = (seqid, match_strand, tuple(position))
                if key not in seen:
                    seen.add(key)
                    if match_strand == 2:
                        match_sequence = revcomp(match_sequence)
                    match_info.append(
                        (gene_name, match_strand, match_start,
                         match_sequence))
    return match_info
def pfm(pattern_sequence):
    """Calculate the position frequency matrix (PFM) of matching sequences.

    :param pattern_sequence: either a ``Pattern``, whose matched sequences
        are read from ``matchtable_pset.match_sequences`` (strand ``2``
        entries are reverse-complemented first), or an indexable collection
        of sequences made of the characters ``a``, ``t``, ``c`` and ``g``.
    :returns: dict mapping ``'a'``, ``'t'``, ``'c'`` and ``'g'`` to a list
        of per-column relative frequencies (floats).  The number of columns
        is the length of the first sequence; with no sequences at all every
        list is empty.
    """
    if isinstance(pattern_sequence, Pattern):
        sequences = []
        table = pattern_sequence.matchtable_pset.match_sequences
        for match_sequences in table.itervalues():
            for strand, sequence in match_sequences:
                if strand == 2:
                    sequence = revcomp(sequence)
                sequences.append(sequence)
    else:
        sequences = pattern_sequence

    # Without any sequence there is no column width to derive; return an
    # empty matrix rather than failing on ``sequences[0]``.
    if len(sequences) == 0:
        return {'a': [], 't': [], 'c': [], 'g': []}

    ncol = len(sequences[0])
    matrix = {
        'a': [0] * ncol,
        't': [0] * ncol,
        'c': [0] * ncol,
        'g': [0] * ncol,
    }
    total = [0] * ncol
    for sequence in sequences:
        for column, base in enumerate(sequence):
            # Only the four keys above are supported; any other character
            # makes ``get`` return None and the update fail (TypeError).
            matrix.get(base)[column] += 1
            total[column] += 1

    # Normalization: turn the raw counts into per-column frequencies.
    for base in ('a', 't', 'c', 'g'):
        matrix[base] = [
            float(count) / column_total
            for count, column_total in zip(matrix[base], total)
        ]
    return matrix
def extract_match_info(self, fasta_handle):
    """Collect the distinct matches of all patterns in ``self._strands``.

    Every pattern first builds its match table from
    ``parse_fasta(fasta_handle)``.  The tables are then walked and one
    ``(gene_name, match_strand, match_start, match_sequence)`` tuple is
    emitted for each distinct combination of sequence id, position and
    reported sequence.  Sequences found on strand ``2`` are passed through
    ``revcomp`` before they are compared and reported, and ``match_start``
    is the first element of the stored position plus one.

    :param fasta_handle: handed unchanged to ``parse_fasta``.
    :returns: list of match tuples in discovery order.
    """
    # NOTE(review): the same handle is parsed once per pattern; if
    # parse_fasta consumes it, later patterns may see no records -- confirm.
    for pattern in self._strands:
        pattern.build_matchtable_pset(
            parse_fasta(fasta_handle), self.reverse_complement)

    # One flat set replaces the former nested dict-of-lists bookkeeping:
    # same notion of "already reported", constant-time membership tests.
    seen = set()
    match_info = []
    for pattern in self._strands:
        table = pattern.matchtable_pset
        for seqid, positions in table.pos_matches.iteritems():
            for idx, position in enumerate(positions):
                gene_name = table.gene_name.get(seqid)
                match_strand, match_sequence = \
                    table.match_sequences.get(seqid)[idx]
                match_start = position[0] + 1
                if match_strand == 2:
                    match_sequence = revcomp(match_sequence)
                key = (seqid, tuple(position), match_sequence)
                if key in seen:
                    continue
                seen.add(key)
                match_info.append(
                    (gene_name, match_strand, match_start, match_sequence))
    return match_info
def merge_sequences(seq_1, seq_2, reverse_complement=False):
    """Find the best-scoring gap-free overlap of two sequences.

    ``seq_2`` is slid along ``seq_1`` over every offset at which the two
    overlap by at least one position.  For each offset both sequences are
    padded with ``'n'`` to a common length and the pair is scored with
    ``full_alignment_scoring``; the first pair reaching the highest score
    wins.  With ``reverse_complement`` set, ``revcomp(seq_2)`` is tried as
    well and replaces the forward result only if it scores strictly higher.

    :param seq_1: first sequence.
    :param seq_2: second sequence.
    :param reverse_complement: also try the reverse complement of ``seq_2``.
    :returns: ``(padded_seq_1, padded_seq_2, strands)`` where ``strands`` is
        ``[1, 1]`` for the forward orientation and ``[1, 2]`` when the
        reverse complement of ``seq_2`` won.  All three are ``None`` when
        either sequence is empty or no score exceeds ``-1``.
    """
    best = (-1, None, None, None)
    best = _best_padded_alignment(seq_1, seq_2, [1, 1], best)
    if reverse_complement:
        best = _best_padded_alignment(seq_1, revcomp(seq_2), [1, 2], best)
    _, best_seq1, best_seq2, best_strands = best
    return (best_seq1, best_seq2, best_strands)


def _best_padded_alignment(seq_1, seq_2, strands, best):
    """Return ``best`` or a strictly better ``(score, s1, s2, strands)``."""
    for s1, s2 in _padded_alignments(seq_1, seq_2):
        score = full_alignment_scoring(s1, s2)
        if score > best[0]:
            best = (score, s1, s2, strands)
    return best


def _padded_alignments(seq_1, seq_2):
    """Yield each distinct ``'n'``-padded placement of the two sequences.

    The padding depends only on the offset of ``seq_2`` relative to
    ``seq_1``, so each offset is produced once instead of once per index
    pair.  Offsets come in the order 0, 1, ..., len(seq_1) - 1, -1, ...,
    -(len(seq_2) - 1), i.e. the order in which an exhaustive scan over all
    index pairs first meets them, which keeps tie-breaking unchanged.
    """
    # NOTE(review): scoring each offset once assumes full_alignment_scoring
    # is deterministic in its two arguments, and that revcomp() keeps the
    # sequence length -- confirm.
    len_1, len_2 = len(seq_1), len(seq_2)
    if not len_1 or not len_2:
        return
    offsets = list(range(len_1)) + [-shift for shift in range(1, len_2)]
    for offset in offsets:
        # How far the right end of seq_2 sticks out past that of seq_1.
        overhang = len_2 - len_1 + offset
        s1 = '%s%s%s' % ('n' * max(-offset, 0), seq_1, 'n' * max(overhang, 0))
        s2 = '%s%s%s' % ('n' * max(offset, 0), seq_2, 'n' * max(-overhang, 0))
        yield s1, s2
def add(self, pattern):
    """Store ``pattern`` in ``self._collect``, keyed by its sequence.

    When ``self.reverse_complement`` is set and the reverse complement of
    the pattern's sequence is already a key, the pattern is stored under
    that existing key instead of under its own sequence.

    :param pattern: the ``Pattern`` instance to store.
    """
    assert isinstance(pattern, Pattern)
    key = pattern.sequence
    if self.reverse_complement:
        rc_key = revcomp(key)
        if rc_key in self._collect:
            key = rc_key
    self._collect.update({key: pattern})