Python FriendClass.getSubsMatScore Examples

Programming Language: Python

Namespace/Package Name: friend

Class/Type: FriendClass

Method/Function: getSubsMatScore

Examples at hotexamples.com: 5

Python FriendClass.getSubsMatScore - 5 examples found. These are the top rated real world Python examples of friend.FriendClass.getSubsMatScore extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

FriendClass(10)

getSubsMatScore(5)

parseMultSequenceFastaFile(3)

validateAminoSequence(3)

parseFastaFiles(1)

validateRNASequence(1)

Example #1

Show file

File: feng_doolittle.py Project: ashwath92/bioinfromaticsLab

 def sumOfPairs(self):
     """Return the sum-of-pairs score of the multiple sequence alignment.

     Every unordered pair of rows in ``self.MSA`` is compared column by
     column.  A column where both rows hold 'X' contributes nothing, a
     column where exactly one row holds 'X' adds ``self.gapOpenCost``, and
     any other column adds the value returned by
     ``FriendClass.getSubsMatScore`` for the two characters.
     """
     scorer = FriendClass()
     total = 0
     for row_a, first in enumerate(self.MSA):
         for row_b, second in enumerate(self.MSA):
             # Score each unordered pair exactly once (row_a < row_b only).
             if row_a >= row_b:
                 continue
             # Columns are walked over the length of the second row; the
             # rows are expected to have the same length.
             for col in range(len(second)):
                 first_is_x = first[col] == 'X'
                 second_is_x = second[col] == 'X'
                 if first_is_x and second_is_x:
                     continue
                 if first_is_x or second_is_x:
                     # Exactly one of the two rows holds 'X' here.
                     total += self.gapOpenCost
                 else:
                     # Neither row holds 'X': score the pair of characters.
                     total += scorer.getSubsMatScore(
                         first[col], second[col], self.subsMat,
                         self.gapOpenCost)
     return total

Example #2

Show file

    def buildMatrices(self, s1, s2, subst_matrix_fn, gap_open_cost,
                      gap_extend_cost, g):
        """Fill the D, P and Q score matrices for a global alignment with
        separate costs for starting and for extending a gap.

        Sequence 1 runs down the rows and sequence 2 across the columns.  All
        three matrices have shape (len(s1) + 1, len(s2) + 1) and integer dtype.

        Args:
            s1: first sequence (indexable, e.g. a string).
            s2: second sequence (indexable, e.g. a string).
            subst_matrix_fn: forwarded unchanged to
                FriendClass.getSubsMatScore.
            gap_open_cost: kept for interface compatibility; this method does
                not read it.
            gap_extend_cost: added for every further position of a gap; also
                forwarded as the last argument of getSubsMatScore.
            g: score charged for the first position of a gap.

        Returns:
            The tuple (D, P, Q, optimalScore), where optimalScore is
            D[len(s1), len(s2)].
        """
        s1_length = len(s1)
        s2_length = len(s2)
        # Large negative stand-in for minus infinity: cells holding it lose
        # every max() below and are therefore never selected.
        inf = -60000
        shape = (s1_length + 1, s2_length + 1)
        D = np.zeros(shape, dtype=int)
        # P holds the scores of alignments that extend a gap in sequence 2.
        P = np.zeros(shape, dtype=int)
        # Q holds the scores of alignments that extend a gap in sequence 1.
        Q = np.zeros(shape, dtype=int)
        fr = FriendClass()

        # Initialize the first column and the first row.  The cells next to
        # the origin only exist when the corresponding sequence is non-empty,
        # so they are guarded to keep empty input from indexing out of bounds.
        if s1_length > 0:
            D[1, 0] = g
            Q[1, 0] = inf
        if s2_length > 0:
            D[0, 1] = g
            P[0, 1] = inf
        for i in range(2, s1_length + 1):
            D[i, 0] = D[i - 1, 0] + gap_extend_cost
            # P is not initialized in the first column: the recurrence below
            # never reads those cells.
            Q[i, 0] = inf
        for j in range(2, s2_length + 1):
            D[0, j] = D[0, j - 1] + gap_extend_cost
            # Q is not initialized in the first row: the recurrence below
            # never reads those cells.
            P[0, j] = inf

        for i in range(1, s1_length + 1):
            for j in range(1, s2_length + 1):
                # Either start a new gap from D in the previous row or extend
                # the gap already recorded in P in the previous row.
                P[i, j] = max(D[i - 1, j] + g, P[i - 1, j] + gap_extend_cost)
                # Same choice along the row: start from D in the previous
                # column or extend the gap recorded in Q.
                Q[i, j] = max(D[i, j - 1] + g, Q[i, j - 1] + gap_extend_cost)
                # Match/mismatch; the strings are indexed with i - 1 / j - 1
                # because the matrices carry one extra leading row and column.
                substitution = D[i - 1, j - 1] + fr.getSubsMatScore(
                    s1[i - 1], s2[j - 1], subst_matrix_fn, gap_extend_cost)
                D[i, j] = max(substitution, P[i, j], Q[i, j])

        optimalScore = D[s1_length, s2_length]
        return D, P, Q, optimalScore

Example #3

Show file

File: needleman_wunsch.py Project: ashwath92/bioinfromaticsLab

    def buildMatrices(self, s1, s2, subst_matrix_fn, gap_cost):
        """Build the global-alignment score matrix with a linear gap cost and
        record, for every cell, which neighbours produced its value.

        Sequence 1 runs down the rows and sequence 2 across the columns.

        Returns:
            The tuple (traceback, optimalScore).  ``traceback`` has shape
            (len(s1), len(s2)); each entry is the sum of 1 (value reached by
            a gap in sequence 1), 2 (gap in sequence 2) and 4 (substitution),
            so 3, 5, 6 and 7 mark cells reachable from several directions.
            ``optimalScore`` is the bottom-right cell of the score matrix.
        """
        rows = len(s1)
        cols = len(s2)
        scores = np.zeros((rows + 1, cols + 1), dtype=int)
        traceback = np.zeros((rows, cols), dtype=int)
        scorer = FriendClass()

        # First column and first row accumulate one gap per step.
        for r in range(1, rows + 1):
            scores[r, 0] = scores[r - 1, 0] + gap_cost
        for c in range(1, cols + 1):
            scores[0, c] = scores[0, c - 1] + gap_cost

        for r in range(1, rows + 1):
            for c in range(1, cols + 1):
                # Gap inserted into sequence 1: arrive from the left.
                from_left = scores[r, c - 1] + gap_cost
                # Gap inserted into sequence 2: arrive from above.
                from_above = scores[r - 1, c] + gap_cost
                # Match/mismatch: arrive diagonally.  The strings are indexed
                # with r - 1 / c - 1 because both loops start at 1.
                diagonal = scores[r - 1, c - 1] + scorer.getSubsMatScore(
                    s1[r - 1], s2[c - 1], subst_matrix_fn, gap_cost)

                best = max(from_left, from_above, diagonal)
                scores[r, c] = best

                # Every direction that ties for the best value contributes
                # its flag, so all three candidates are checked independently.
                origin = 0
                for candidate, flag in ((from_left, 1), (from_above, 2),
                                        (diagonal, 4)):
                    if candidate == best:
                        origin += flag
                # traceback has no leading row/column, hence the - 1 offsets.
                traceback[r - 1, c - 1] = origin

        optimalScore = scores[rows, cols]
        return traceback, optimalScore

Example #4

Show file

    def similarityToDistance(self, s_ab, a, b, nw, alignment, subsMat,
                             gapOpenCost):
        """Convert the similarity score of an aligned pair into a distance.

        The distance is ``-ln(S_eff)`` with
        ``S_eff = (s_ab - S_rand) / (S_max - S_rand)``, where

        * ``S_rand = (1 / L) * sum_{x, y} s(x, y) * N_a(x) * N_b(y)
          + N_g * gapOpenCost``; x and y range over the distinct residues of
          ``a`` and ``b``, N_a / N_b are their occurrence counts, L is the
          alignment length and N_g the number of '-' characters.  This is the
          formula from
          http://rna.informatik.uni-freiburg.de/Teaching/index.jsp?toolName=Feng-Doolittle
          used with linear gap costs.
        * ``S_max = (S(a, a) + S(b, b)) / 2``, each self-score obtained from
          ``nw.buildMatrices``.

        Args:
            s_ab: similarity score of the alignment of ``a`` and ``b``.
            a: first (ungapped) sequence.
            b: second (ungapped) sequence.
            nw: object whose ``buildMatrices(s1, s2, subsMat, gap_cost)``
                returns ``(traceback, score)``.
            alignment: the pairwise alignment; entries 0 and 2 are read as
                the two gapped sequences (assumption inherited from the
                existing indexing - confirm against the producer).
            subsMat: forwarded to getSubsMatScore and nw.buildMatrices.
            gapOpenCost: cost of one gap position.

        Returns:
            The distance as a float.

        Raises:
            ZeroDivisionError: if the alignment is empty or S_max == S_rand.
            ValueError: if S_eff is not positive (raised by math.log).
        """
        # Length of the alignment; alignment[0] is measured here and the
        # other entries are assumed to match it.
        L = len(alignment[0])
        # Number of gap characters across both aligned sequences.
        N_g = alignment[0].count('-') + alignment[2].count('-')
        fr = FriendClass()

        # 1. S_rand: each distinct residue pair is scored once and weighted
        # by how often the two residues occur.  Sorting the residue sets
        # keeps the summation order, and hence the result, deterministic.
        counts_a = [(x, a.count(x)) for x in sorted(set(a))]
        counts_b = [(y, b.count(y)) for y in sorted(set(b))]
        sum_xy = 0
        for x, Na_x in counts_a:
            for y, Nb_y in counts_b:
                s_xy = fr.getSubsMatScore(x, y, subsMat, gapOpenCost)
                sum_xy += Na_x * Nb_y * s_xy
        s_ab_rand = (sum_xy / L) + (N_g * gapOpenCost)

        # 2. S_max: mean of the two self-alignment scores.
        (traceback_aa, s_aa) = nw.buildMatrices(a, a, subsMat, gapOpenCost)
        (traceback_bb, s_bb) = nw.buildMatrices(b, b, subsMat, gapOpenCost)
        s_ab_max = (s_aa + s_bb) / 2

        # 3. Normalized similarity (intended to lie between 0 and 1) and its
        # negative logarithm.
        s_ab_eff = (s_ab - s_ab_rand) / (s_ab_max - s_ab_rand)
        d = -math.log(s_ab_eff)
        return d

Example #5

Show file

    def getAlignmentsFromTracebacks(self, s1, s2, subst_matrix_fn, D, P, Q,
                                    alpha, beta, g):
        """Recover every optimal alignment by tracing back through D, P and Q.

        The traceback starts at the bottom-right cell of D and walks towards
        D[0, 0].  Whenever the value of a cell can be explained by more than
        one predecessor (P, Q and/or the diagonal), the partial alignment is
        duplicated so that each possibility is followed to the end.

        Args:
            s1: sequence laid out along the rows of D.
            s2: sequence laid out along the columns of D.
            subst_matrix_fn: forwarded unchanged to
                FriendClass.getSubsMatScore.
            D, P, Q: score matrices indexed as [row, column]; D must expose
                ``shape``.  Presumably the matrices produced by the matching
                buildMatrices method - confirm.
            alpha: not read by this method.
            beta: forwarded to getSubsMatScore, tracebackInP and
                tracebackInQ.
            g: forwarded to tracebackInP and tracebackInQ.

        Returns:
            A list with one entry per alignment.  Each entry is the list
            [aligned s1, aligned s2, connector line]; the connector holds '*'
            under identical residues, ':' under substitutions and ' ' next
            to gaps.

        Raises:
            ValueError: if a cell of D equals neither P, Q nor the diagonal
                score, in which case no traceback step can be taken.
        """
        fr = FriendClass()
        # One [row, column] position and one [s1, s2, connector] trace per
        # alignment under construction; both lists share their indices.
        # Traces are accumulated back to front and reversed at the very end.
        indices_list = [[D.shape[0] - 1, D.shape[1] - 1]]
        trace_list = [["", "", ""]]

        def gap_in_s2(slot, i, j):
            # Step through P from (i, j): residues of s1 against gaps.
            # tracebackInP is defined elsewhere; its result is used as the
            # row at which the traceback returns to D.
            i2 = self.tracebackInP(P, D, i, j, beta, g)
            diff_i = i - i2
            indices_list[slot][0] -= diff_i
            # Matrix indices are one ahead of string indices, so rows
            # i2 + 1 .. i cover s1[i2:i]; reversed because the trace is
            # built back to front.
            trace_list[slot][0] += s1[i2:i][::-1]
            trace_list[slot][1] += '-' * diff_i
            trace_list[slot][2] += ' ' * diff_i

        def gap_in_s1(slot, i, j):
            # Step through Q from (i, j): residues of s2 against gaps.
            # tracebackInQ is defined elsewhere; its result is used as the
            # column at which the traceback returns to D.
            j2 = self.tracebackInQ(Q, D, i, j, beta, g)
            diff_j = j - j2
            indices_list[slot][1] -= diff_j
            trace_list[slot][0] += '-' * diff_j
            trace_list[slot][1] += s2[j2:j][::-1]
            trace_list[slot][2] += ' ' * diff_j

        def substitution(slot, i, j):
            # Diagonal step: align s1[i - 1] with s2[j - 1].
            trace_list[slot][0] += s1[i - 1]
            trace_list[slot][1] += s2[j - 1]
            trace_list[slot][2] += '*' if s1[i - 1] == s2[j - 1] else ':'
            indices_list[slot][0] -= 1
            indices_list[slot][1] -= 1

        # The loop iterates over a snapshot so that alignments appended
        # during a pass are first visited in the next pass.
        snapshot = [list(position) for position in indices_list]
        while True:
            completed_counter = 0  # tracebacks that have reached D[0, 0]
            for index, (i, j) in enumerate(snapshot):
                if i == 0 and j == 0:
                    completed_counter += 1
                    continue

                if i == 0:
                    # s1 is exhausted: the rest of s2 faces gaps.  Column j
                    # of D corresponds to s2[j - 1].
                    trace_list[index][0] += '-'
                    trace_list[index][1] += s2[j - 1]
                    trace_list[index][2] += ' '
                    indices_list[index][1] -= 1
                    continue

                if j == 0:
                    # s2 is exhausted: the rest of s1 faces gaps.  Row i of
                    # D corresponds to s1[i - 1].
                    trace_list[index][0] += s1[i - 1]
                    trace_list[index][1] += '-'
                    trace_list[index][2] += ' '
                    indices_list[index][0] -= 1
                    continue

                diagonal = D[i - 1, j - 1] + fr.getSubsMatScore(
                    s1[i - 1], s2[j - 1], subst_matrix_fn, beta)
                # Collect every predecessor that explains D[i, j], always in
                # the order P, Q, diagonal.
                moves = []
                if D[i, j] == P[i, j]:
                    moves.append(gap_in_s2)
                if D[i, j] == Q[i, j]:
                    moves.append(gap_in_s1)
                if D[i, j] == diagonal:
                    moves.append(substitution)
                if not moves:
                    # Without this check the loop would never terminate.
                    raise ValueError(
                        "no traceback step possible at D[%d, %d]" % (i, j))

                # Duplicate the partial alignment once per additional move
                # BEFORE any move modifies it.
                first_new = len(trace_list)
                extra = len(moves) - 1
                for _ in range(extra):
                    trace_list.append(list(trace_list[index]))
                    indices_list.append(list(indices_list[index]))

                # The first move continues the current alignment.  The others
                # fill the copies from the last one backwards, which fixes
                # the order of the returned alignments.  Every move starts
                # from the same cell (i, j).
                moves[0](index, i, j)
                for offset, move in enumerate(moves[1:]):
                    move(first_new + extra - 1 - offset, i, j)

            snapshot = [list(position) for position in indices_list]
            # Done once every alignment, including any added during this
            # pass, was counted as complete.
            if completed_counter == len(snapshot):
                break

        # The traces were accumulated back to front: reverse every string.
        alignment_strings = [[string[::-1] for string in trace]
                             for trace in trace_list]
        return alignment_strings