def convert_to_evolutionary_distances(pairwise_alignment_result: Result, similarity_scoring_method,
                                      nw_settings) -> float:
    r"""Convert the similarity score of a pairwise alignment into a distance score.

    The approximation used is::

        D(a,b) = - log(S_{a,b}^{eff})
        S_{a,b}^{eff} = (S(a,b) - S_{rand}) / (S_{a,b}^{max} - S_{rand})
        S_{rand} = (1/|A|) * (sum_{x,y in \Sigma \times \Sigma} S(x,y) * N_a(x) * N_b(y))
                   + gaps(A) * S(-,*)
        S_{a,b}^{max} = (S(a,a) + S(b,b)) / 2

    The gap symbols are stripped from both aligned sequences, which are then
    re-aligned against each other and against themselves to obtain S(a,b),
    S(a,a) and S(b,b).

    :param pairwise_alignment_result: result whose first alignment is converted
    :param similarity_scoring_method: ``SimilarityScoringMethod.SCORE2DISTANCE_EXTENDED``
        computes S_rand with the closed formula above;
        ``SimilarityScoringMethod.SCORE2DISTANCE`` estimates S_rand as the score of
        aligning randomly shuffled copies of both sequences
    :param nw_settings: settings passed on to ``NeedlemanWunsch``
    :return: ``-log(S_eff)``, or ``1`` when ``S_eff`` is not positive
    :raises NotImplementedError: for any other ``similarity_scoring_method``
    """
    alignment = pairwise_alignment_result.alignments[0]
    LOGGER.info("Converting similarity to evolutionary distances.")
    # Lazy %-args: formatting only happens if the record is emitted, and a
    # tuple-valued alignment cannot break the format string.
    LOGGER.info("Alignment: %s", alignment)

    # Work on gap-free copies so the sequences of the alignment stay untouched.
    seq1 = copy.deepcopy(alignment.sequence1)
    seq1.seq = seq1.seq.replace("-", "")
    seq2 = copy.deepcopy(alignment.sequence2)
    seq2.seq = seq2.seq.replace("-", "")

    nw = NeedlemanWunsch(settings=nw_settings)
    s_ab = nw.run(seq1, seq2)
    s_aa = nw.run(seq1, seq1)
    s_bb = nw.run(seq2, seq2)
    s_max = (s_aa.score + s_bb.score) / 2

    if similarity_scoring_method == SimilarityScoringMethod.SCORE2DISTANCE_EXTENDED:
        letters = nw.alphabet.letters
        # The per-letter counts do not depend on the other letter of the pair,
        # so they are computed once instead of once per letter pair.
        counts_a = [count_occurences_symbol_in_word(seq1.seq, letter) for letter in letters]
        counts_b = [count_occurences_symbol_in_word(seq2.seq, letter) for letter in letters]
        pair_sum = sum(
            nw.score(x, y) * n_x * n_y
            for x, n_x in zip(letters, counts_a)
            for y, n_y in zip(letters, counts_b)
        )
        s_rand = (1 / len(alignment.sequence1)) * pair_sum \
            + count_gaps_in_pairwise_alignment(alignment) * nw.gap_penalty
    elif similarity_scoring_method == SimilarityScoringMethod.SCORE2DISTANCE:
        # copy sequences so they are not permanently changed
        seq1_shuffled = copy.deepcopy(seq1)
        seq2_shuffled = copy.deepcopy(seq2)
        # shuffle letters.
        seq1_shuffled.seq = ''.join(random.sample(seq1.seq, len(seq1)))
        seq2_shuffled.seq = ''.join(random.sample(seq2.seq, len(seq2)))
        s_rand = (nw.run(seq1_shuffled, seq2_shuffled)).score
    else:
        raise NotImplementedError(
            f'similarity_scoring_method {similarity_scoring_method} not supported/implemented.')

    # prevent division by zero.
    if s_max == s_rand:
        s_rand = s_rand - 0.0001
    s_eff = (s_ab.score - s_rand) / (s_max - s_rand)

    # The logarithm is undefined for non-positive values; fall back to 1.
    if s_eff <= 0.0:
        score = 1
    else:
        score = - math.log(s_eff)
    LOGGER.info("New score: %.5f", score)
    return score
def compute_best_alignment_many_to_many(self, alignment1: MultiAlignment, alignment2: MultiAlignment):
    """
    Align every sequence of one multi-alignment against every sequence of the
    other one and pick the pair with the highest score.

    :param alignment1: MultiAlignment object
    :param alignment2: MultiAlignment object
    :return: the two aligned sequences of the best pair (as a list), the index of that
             pair in alignment1, its index in alignment2, the best score and the sum of
             all pairwise scores
    """
    left_sequences = alignment1.sequences
    right_sequences = alignment2.sequences
    aligner = NeedlemanWunsch(settings=self.nw_settings)

    winner = None
    winner_score = None
    winner_left = None
    winner_right = None
    score_total = 0

    for left_pos, left in enumerate(left_sequences):
        for right_pos, right in enumerate(right_sequences):
            outcome = aligner.run(left, right)
            improves = winner_score is None or outcome.score > winner_score
            if improves:
                winner_score = outcome.score
                winner = outcome.alignments[0]
                winner_left, winner_right = left_pos, right_pos
            # every pairwise score contributes to the overall score.
            score_total += outcome.score

    aligned_pair = [winner.sequence1, winner.sequence2]
    return aligned_pair, winner_left, winner_right, winner_score, score_total
def test_guideline_blosum():
    """Check the pairwise results listed on the guideline from 04.02.2019."""
    aligner = NeedlemanWunsch()
    result, _ = aligner.run("data/xpgma_guideline.fasta",
                            "data/xpgma_guideline.fasta",
                            "data/blosum62.txt", False, 6, True)

    # The result is an upper triangle matrix of shape n x n. Each row below is
    # (row index, column index, score, number of alignments, first alignment).
    expectations = [
        (0, 1, 4, 8, (
            'ILDMDVVEGSAARFDCKVEG_YPDPEVMWFKDDNP__V_KESRHFQIDYDEEGN',
            'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHF_V__SQT_T')),
        (0, 2, 37, 4, (
            'ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN',
            'ISDTEADIGSNLRWGCAAAGKPRPMVRWLRNGEPL_ASQN_RVEV__LA_')),
        (0, 3, -4, 1, (
            'ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN',
            'RRLIPAARGGEISILCQPRAAPKATILWSKGTEILGNSTRVTVTSD____')),
        (1, 2, 3, 1, (
            'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT',
            'ISDTEADIGSNLRWGC_AAAGKPRPMVRWLRNGEP__LASQNR__VEVLA')),
        (1, 3, 9, 2, (
            'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT',
            'RRLIPAARGGEISILCQPRA_APKATILW__SKGTEILGNSTRVTVT_SD')),
        (2, 3, 24, 1, (
            'ISDTEADIGSNLRWGCAAAGKPRPMVRWLRNGEPLASQNRVEVLA_',
            'RRLIPAARGGEISILCQPRAAPKATILWSKGTEILGNSTRVTVTSD')),
    ]

    for row, col, expected_score, expected_count, expected_first in expectations:
        pair = result[row][col]
        assert pair[3] == expected_score
        assert len(pair[2]) == expected_count
        assert pair[2][0] == expected_first
def test_example_distance():
    """Align the two example sequences with the distance scoring matrix."""
    aligner = NeedlemanWunsch()
    result, _ = aligner.run("data/sequence1.fasta",
                            "data/sequence2.fasta",
                            "data/test_scoring_distance.txt",
                            True, 1, True)

    # Only one pair of sequences is aligned, so everything sits in cell (0, 0).
    cell = result[0][0]

    assert cell[0].id == "idA"
    assert cell[1].id == "idB"
    assert str(cell[0].seq) == "TCCGA"
    assert str(cell[1].seq) == "TACGCAGA"

    assert cell[3] == -2
    assert len(cell[2]) == 1
    assert cell[2][0] == ("T_C_C_GA", "TACGCAGA")
def test_example():
    """Exercise the dummy implementation with its hard-coded example result."""
    aligner = NeedlemanWunsch()
    outcome = aligner.run("data/sequence1.fa",
                          "data/sequence2.fa",
                          "data/blosum62.txt",
                          5,
                          False)

    # The result must unpack into exactly these six fields.
    first_id, first_seq, second_id, second_seq, total, found = outcome

    assert first_id == "idA"
    assert second_id == "idB"
    assert first_seq == "FancySequenceA"
    assert second_seq == "FancysequenceB"
    assert total == 1000
    assert found[0] == ("Fancy_SequenceA_", "Fancys_equence_B")
def test_example_invalid_characters_fail():
    """Negative test: the run must abort when a sequence file has invalid characters.

    ``Invalid_characters.fasta`` contains non-amino acid characters and is passed
    as the first sequence file; the program is expected to exit with error code 12.
    """
    nw = NeedlemanWunsch()
    seq_fasta_1 = os.path.join('data', 'sequences', 'seq1.fasta')
    seq_fasta_2 = os.path.join('data', 'sequences', 'Invalid_characters.fasta')

    # Only the call itself is wrapped: nothing after a raising statement inside
    # the ``with`` body would ever execute.
    with pytest.raises(SystemExit) as exc_info:
        nw.run(seq_fasta_2, seq_fasta_1, 'pam250', -8, False)

    assert exc_info.type == SystemExit
    # The exit code lives on the raised exception (``value``), not on the
    # ExceptionInfo wrapper returned by pytest.raises.
    assert exc_info.value.code == 12
def test_example_invalid_format_fail():
    """Negative test: the run must abort when a sequence file is not valid FASTA.

    The first line of ``Invalid_format.fasta`` does not start with ``>``; the
    program is expected to exit with error code 1.
    """
    nw = NeedlemanWunsch()
    seq_fasta_1 = os.path.join('data', 'sequences', 'seq1.fasta')
    seq_fasta_2 = os.path.join('data', 'sequences', 'Invalid_format.fasta')

    # Only the call itself is wrapped: nothing after a raising statement inside
    # the ``with`` body would ever execute.
    with pytest.raises(SystemExit) as exc_info:
        nw.run(seq_fasta_1, seq_fasta_2, 'pam250', -8, False)

    assert exc_info.type == SystemExit
    # The exit code lives on the raised exception (``value``), not on the
    # ExceptionInfo wrapper returned by pytest.raises.
    assert exc_info.value.code == 1
def test_example_similarity():
    """Align the two example sequences with the similarity scoring matrix."""
    aligner = NeedlemanWunsch()
    result, _ = aligner.run("data/sequence1.fasta",
                            "data/sequence2.fasta",
                            "data/test_scoring_similarity.txt",
                            True, 1, True)

    # Only one pair of sequences is aligned, so everything sits in cell (0, 0).
    cell = result[0][0]

    assert cell[0].id == "idA"
    assert cell[1].id == "idB"
    assert str(cell[0].seq) == "TCCGA"
    assert str(cell[1].seq) == "TACGCAGA"

    assert cell[3] == 4
    assert len(cell[2]) == 6
    assert cell[2][0] == ("__TCCGA_", "TACGCAGA")
def test_too_few_arguments():
    """Negative test: calling ``run`` with one argument missing must raise TypeError."""
    nw = NeedlemanWunsch()
    seq_fasta_1 = os.path.join('data', 'sequences', 'seq1.fasta')
    seq_fasta_2 = os.path.join('data', 'sequences', 'seq2.fasta')

    # Only the call is wrapped, so a TypeError coming from anywhere else (for
    # example from unpacking the result) cannot make the test pass by accident.
    with pytest.raises(TypeError):
        nw.run(seq_fasta_1, seq_fasta_2, 'pam250', False)
def compute_best_alignment_one_to_many(self, leaf: Node, alignment: MultiAlignment):
    """
    Align the sequence of a leaf against every sequence of a multi-alignment and
    pick the highest-scoring pair.

    :param leaf: Node object which is a leaf
    :param alignment: MultiAlignment object
    :return: the two aligned sequences of the best pair (as a list), the index of the
             chosen sequence within the alignment, and the score of that pair.
    """
    assert leaf.is_leaf()

    query = leaf.sequence
    candidates = alignment.sequences
    aligner = NeedlemanWunsch(settings=self.nw_settings)

    winner = None
    winner_pos = None
    winner_score = None
    for pos, candidate in enumerate(candidates):
        outcome = aligner.run(query, candidate)
        improves = winner_score is None or outcome.score > winner_score
        if improves:
            winner_score = outcome.score
            winner = outcome.alignments[0]
            winner_pos = pos

    aligned_pair = [winner.sequence1, winner.sequence2]
    return aligned_pair, winner_pos, winner_score
def test_example_success():
    """Positive test: run Needleman-Wunsch on two valid FASTA files and check
    every field of the returned result."""
    aligner = NeedlemanWunsch()
    sequence_dir = os.path.join('data', 'sequences')
    first_path = os.path.join(sequence_dir, 'seq1.fasta')
    second_path = os.path.join(sequence_dir, 'seq2.fasta')

    outcome = aligner.run(first_path, second_path, 'pam250', -8, True)
    # The result must unpack into exactly these seven fields.
    first_id, first_seq, second_id, second_seq, total, found, found_count = outcome
    print(found)

    expected_alignments = [[
        'ILDMDVVEGSAARFDCKVEG-YPDPEVMWFKDDNPVKESRHFQIDYDEEGN',
        'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTD-GRHFVSQTT',
        ':::::::**::::::*:::: **:::::*:::::*:::::: :::::::::'
    ]]

    assert first_id == "ID1"
    assert second_id == "ID2"
    assert first_seq == "ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN"
    assert second_seq == "RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT"
    assert total == 31
    assert found == expected_alignments
    assert found_count == 1
def operation1(self, leaf1: Node, leaf2: Node) -> MultiAlignment:
    """
    Align two leaves pairwise and replace the gap symbols of the best alignment with X.

    :param leaf1: Node object
    :param leaf2: Node object
    :return: MultiAlignment object

    >>> from Bio.SeqRecord import SeqRecord
    >>> feng = FengDoolittle()
    >>> res = feng.operation1(leaf1=Node(sequence=SeqRecord("AAACGA"),name=None, cost=None),
    ...                       leaf2=Node(sequence=SeqRecord("AAA"), name=None,cost=None))
    >>> res.sequences[0].seq
    'AAACGA'
    >>> res.sequences[1].seq
    'XAAXXA'
    """
    assert leaf1.is_leaf() and leaf2.is_leaf()

    aligner = NeedlemanWunsch(settings=self.nw_settings)
    outcome = aligner.run(leaf1.sequence, leaf2.sequence)
    best = outcome.alignments[0]

    combined = MultiAlignment(sequences=[best.sequence1, best.sequence2],
                              score=outcome.score)
    # Read the sequences back from the object before masking the gaps.
    combined.sequences = replace_with_neutral_symbol(combined.sequences)
    return combined
def run(self, seq_fasta_fn, subst_matrix_fn, is_distance_fn, cost_gap_open, metrict_conversion_type,
        clustering):
    """
    Computes a XPGMA

    Args:
      seq_fasta_fn (str): The relative path to a fasta file
      subst_matrix_fn (str): The relative path to a scoring matrix file
      is_distance_fn (bool): If True, handle scoring matrix as distance measure, else similarity measure
      cost_gap_open (int): gap cost open
      metrict_conversion_type (int): only used when the scoring matrix is not a
          distance measure. 0 negates the similarity score, 1 converts it with
          similarity_to_distance, 2 converts it with similarity_to_distance_ext.
          Any other value leaves the pairwise distances at 0.
      clustering (str): either "upgma" or "wpgma"

    Returns:
        The value returned by generate_wpgma / generate_upgma (the root node of
        the XPGMA tree), or None if clustering is neither "wpgma" nor "upgma".
    """
    scoring_matrix = ScoringMatrix(subst_matrix_fn, is_distance_fn, cost_gap_open)
    seq_records = parse_fasta(seq_fasta_fn)
    num_seqs = len(seq_records)

    # cluster distance matrix, containing pairwise distance information;
    # additional num_seqs - 1 rows/columns are needed when merging clusters
    m_size = 2 * num_seqs - 1
    distances = [[0] * m_size for _ in range(m_size)]

    # iteration list, containing the matrix row/col indices of the current clusters
    # this is used to avoid having to clean the matrix after merge
    active_indices = list(range(num_seqs))  # initially only singleton clusters

    # cluster distance matrix index to Node mapping
    nodes = {index: Node(record) for index, record in enumerate(seq_records)}

    # compute pairwise distances using NW
    # Note: no check if matrix is distance matrix
    nw = NeedlemanWunsch()
    result, _ = nw.run(seq_fasta_fn, seq_fasta_fn, subst_matrix_fn,
                       is_distance_fn, cost_gap_open, False)

    # initialize cluster distance matrix with computed distances
    # (only the upper triangle i < j is filled)
    for i in range(num_seqs):
        for j in range(i + 1, num_seqs):
            if scoring_matrix.metric_type == MetricType.DISTANCE:
                distances[i][j] = result[i][j][3]
            elif metrict_conversion_type == 0:
                distances[i][j] = -result[i][j][3]
            elif metrict_conversion_type == 1:
                distances[i][j] = similarity_to_distance(
                    nw, result[i][j][2][0], scoring_matrix)
            elif metrict_conversion_type == 2:
                # This must be an assignment: a comparison (`==`) would discard
                # the converted distance and leave the entry at 0.
                distances[i][j] = similarity_to_distance_ext(
                    nw, result[i][j][2][0], scoring_matrix)

    if clustering == "wpgma":
        return self.generate_wpgma(distances, active_indices, nodes)
    elif clustering == "upgma":
        return self.generate_upgma(distances, active_indices, nodes)
    # Unknown clustering names yield None.
    return None