Ejemplo n.º 1
0
    def index(self, sequence, seqset, append=False):
        """Scan every entry of ``seqset`` for ``sequence`` and record the hits.

        A lowercase ``n`` in ``sequence`` acts as a wildcard for any of
        a/t/c/g.  Matching is case-insensitive and, because ``finditer`` is
        used, hits within one sequence never overlap.

        When ``self.reverse_complement`` is true the reverse complement of
        ``sequence`` is searched as well, and hits on it are stored with the
        reverse-complemented pattern and a final ``True`` argument
        (presumably the reverse-strand flag of ``_match_position.add``).
        When it is false only the forward pattern is compiled, so
        reverse-strand occurrences neither inflate the counters nor hide an
        overlapping forward hit.

        Args:
            sequence: Pattern to search for.
            seqset: Iterable of ``(gene_name, sequence)`` pairs.
            append: If false (the default), the stored match positions and
                the ``n_seqs`` / ``n_hitsites`` / ``n_hitseqs`` counters are
                reset before scanning; if true, results are accumulated.
        """
        if not append:
            self._match_position = self._MatchPosition()
            self.n_hitseqs = 0
            self.n_hitsites = 0
            self.n_seqs = 0

        search_both_strands = self.reverse_complement
        forward_expr = sequence.replace('n', '[atcg]')
        if search_both_strands:
            rc_sequence = revcomp(sequence)
            expr = '({0})|({1})'.format(
                forward_expr, rc_sequence.replace('n', '[atcg]'))
        else:
            # Forward strand only: leaving the reverse complement out of the
            # regex keeps it from consuming text that a forward hit needs.
            rc_sequence = None
            expr = '({0})'.format(forward_expr)
        matcher = re.compile(expr, re.IGNORECASE)

        # Sequence ids are 1-based and restart on every call.
        for seqid, (gene_name, residues) in enumerate(seqset, 1):
            residues = residues.lower()
            self.n_seqs += 1
            has_match = False
            for hit in matcher.finditer(residues):
                has_match = True
                self.n_hitsites += 1
                if hit.group(1):
                    self._match_position.add(
                        gene_name, seqid, sequence, residues,
                        hit.start(), hit.end(), False)
                elif search_both_strands and hit.group(2):
                    self._match_position.add(
                        gene_name, seqid, rc_sequence, residues,
                        hit.start(), hit.end(), True)
            if has_match:
                self.n_hitseqs += 1
Ejemplo n.º 2
0
    def extract_match_info(self, fasta_handle):
        """Collect the unique matches of every pattern in ``self._strands``.

        Each pattern first rebuilds its match table from ``fasta_handle``.
        Matches are then de-duplicated on ``(seqid, strand, position)``.

        Returns:
            A list of ``(gene_name, match_strand, match_start,
            match_sequence)`` tuples.  ``match_start`` is the first element
            of the stored position plus one (presumably a 0-based to 1-based
            conversion -- confirm).  Sequences whose strand is ``2`` are
            passed through ``revcomp`` before being returned.
        """
        # NOTE(review): parse_fasta() is invoked on the same handle once per
        # pattern -- confirm the parser rewinds it, otherwise later patterns
        # would see no records.
        for strand_pattern in self._strands:
            strand_pattern.build_matchtable_pset(
                parse_fasta(fasta_handle), self.reverse_complement)

        seen = set()
        match_info = []

        for strand_pattern in self._strands:
            table = strand_pattern.matchtable_pset
            for seqid, positions in table.pos_matches.iteritems():
                gene_name = table.gene_name.get(seqid)
                for idx, position in enumerate(positions):
                    match_strand, match_sequence = table.match_sequences.get(seqid)[idx]
                    match_start = position[0] + 1

                    key = (seqid, match_strand, tuple(position))
                    if key not in seen:
                        seen.add(key)
                        if match_strand == 2:
                            match_sequence = revcomp(match_sequence)
                        match_info.append(
                            (gene_name, match_strand, match_start, match_sequence))

        return match_info
Ejemplo n.º 3
0
def pfm(pattern_sequence):
    """Calculate the position frequency matrix (PFM) of matching sequences.

    Args:
        pattern_sequence: Either a ``Pattern`` (its matched sequences are
            taken from ``matchtable_pset.match_sequences``, with strand-2
            matches passed through ``revcomp``) or an indexable collection
            of sequences made of the lowercase bases a/t/c/g.

    Returns:
        A dict with keys ``'a'``, ``'t'``, ``'c'`` and ``'g'``; each value is
        a list with one entry per column holding the fraction of sequences
        that carry that base at that position.  The number of columns is the
        length of the first sequence; for empty input every list is empty.

    Raises:
        ValueError: If a sequence contains a character other than a/t/c/g.
        IndexError: If a sequence is longer than the first one.
    """
    if isinstance(pattern_sequence, Pattern):
        sequences = []
        for match_sequences in pattern_sequence.matchtable_pset.match_sequences.values():
            for strand, sequence in match_sequences:
                if strand == 2:
                    sequences.append(revcomp(sequence))
                else:
                    sequences.append(sequence)
    else:
        sequences = pattern_sequence

    # No sequences means no columns: return an empty matrix rather than
    # failing on sequences[0].
    ncol = len(sequences[0]) if len(sequences) else 0
    matrix = {
        'a': [0] * ncol,
        't': [0] * ncol,
        'c': [0] * ncol,
        'g': [0] * ncol,
    }
    total = [0] * ncol

    for sequence in sequences:
        for col, base in enumerate(sequence):
            if base not in matrix:
                raise ValueError(
                    'unsupported base %r at position %d' % (base, col))
            matrix[base][col] += 1
            total[col] += 1

    # Normalization: the first sequence spans every column, so total[col]
    # is never zero here.
    for counts in matrix.values():
        for col in range(ncol):
            counts[col] = float(counts[col]) / total[col]

    return matrix
Ejemplo n.º 4
0
    def extract_match_info(self, fasta_handle):
        """Collect the unique matches of every pattern in ``self._strands``.

        Each pattern first rebuilds its match table from ``fasta_handle``.
        A match is reported once per distinct combination of sequence id,
        position and (strand-normalised) matched sequence.

        Returns:
            A list of ``(gene_name, match_strand, match_start,
            match_sequence)`` tuples.  ``match_start`` is the first element
            of the stored position plus one (presumably a 0-based to 1-based
            conversion -- confirm).  Sequences whose strand is ``2`` are
            passed through ``revcomp`` before de-duplication and output.
        """
        # NOTE(review): parse_fasta() is invoked on the same handle once per
        # pattern -- confirm the parser rewinds it, otherwise later patterns
        # would see no records.
        for pattern in self._strands:
            pattern.build_matchtable_pset(
                parse_fasta(fasta_handle), self.reverse_complement)

        # One set of (seqid, position, sequence) keys replaces the former
        # nested dict-of-lists bookkeeping and its linear membership scans.
        seen = set()
        match_info = []

        for pattern in self._strands:
            table = pattern.matchtable_pset
            for seqid, positions in table.pos_matches.items():
                gene_name = table.gene_name.get(seqid)
                strand_sequences = table.match_sequences.get(seqid)
                for index, position in enumerate(positions):
                    match_strand, match_sequence = strand_sequences[index]
                    match_start = position[0] + 1
                    if match_strand == 2:
                        match_sequence = revcomp(match_sequence)

                    key = (seqid, tuple(position), match_sequence)
                    if key in seen:
                        continue
                    seen.add(key)

                    match_info.append(
                        (gene_name, match_strand, match_start, match_sequence))

        return match_info
Ejemplo n.º 5
0
def _iter_padded_placements(seq_1, seq_2):
    """Yield ``(padded_1, padded_2)`` for every overlapping relative offset.

    Both strings are padded with ``'n'`` to a common length.  Offsets are
    produced as 0, 1, ..., len(seq_1) - 1 (``seq_2`` shifted right), then
    -1, ..., -(len(seq_2) - 1) (``seq_1`` shifted right).  Nothing is
    yielded when either sequence is empty.
    """
    len_1 = len(seq_1)
    len_2 = len(seq_2)
    if not len_1 or not len_2:
        return

    shifts = list(range(len_1)) + [-step for step in range(1, len_2)]
    for shift in shifts:
        lead_1 = max(-shift, 0)
        lead_2 = max(shift, 0)
        width = max(lead_1 + len_1, lead_2 + len_2)
        yield (
            '%s%s%s' % ('n' * lead_1, seq_1, 'n' * (width - lead_1 - len_1)),
            '%s%s%s' % ('n' * lead_2, seq_2, 'n' * (width - lead_2 - len_2)),
        )


def merge_sequences(seq_1, seq_2, reverse_complement=False):
    """Find the best-scoring gapless alignment of ``seq_1`` against ``seq_2``.

    ``seq_2`` is slid along ``seq_1`` over every offset at which the two
    overlap by at least one position; the overhangs are padded with ``'n'``
    and each padded pair is scored with ``full_alignment_scoring``.  With
    ``reverse_complement`` set, the same is repeated for ``revcomp(seq_2)``.

    Each distinct offset is scored exactly once, in the order in which a
    nested loop over both start positions would first reach it, so ties are
    resolved in favour of the earliest placement (and of the forward strand).
    This assumes ``full_alignment_scoring`` is deterministic and that
    ``revcomp`` preserves sequence length.

    Args:
        seq_1: First sequence; always used as given.
        seq_2: Second sequence.
        reverse_complement: Also try the reverse complement of ``seq_2``.

    Returns:
        ``(padded_seq_1, padded_seq_2, strands)`` for the best alignment,
        where ``strands`` is ``[1, 1]`` for the forward pass and ``[1, 2]``
        for the reverse-complement pass.  ``(None, None, None)`` when no
        alignment scores above -1, e.g. when either input is empty.
    """
    candidates = [(seq_2, [1, 1])]
    if reverse_complement:
        candidates.append((revcomp(seq_2), [1, 2]))

    max_score = -1
    best = (None, None, None)
    for other, strands in candidates:
        for padded_1, padded_2 in _iter_padded_placements(seq_1, other):
            score = full_alignment_scoring(padded_1, padded_2)
            # Strict comparison: the first placement with a given score wins.
            if score > max_score:
                max_score = score
                best = (padded_1, padded_2, strands)

    return best
Ejemplo n.º 6
0
 def add(self, pattern):
     """Store ``pattern`` in ``self._collect``, keyed by its sequence.

     When ``self.reverse_complement`` is set and the reverse complement of
     the pattern's sequence is already a key, that entry is overwritten
     instead of adding a second key for the other strand.
     """
     assert isinstance(pattern, Pattern)
     key = pattern.sequence
     if self.reverse_complement:
         rc_key = revcomp(pattern.sequence)
         if rc_key in self._collect:
             key = rc_key
     self._collect.update({key: pattern})