def run(self, seq_fasta_fn, complete_traceback):
        """Run the Nussinov algorithm on every sequence of a fasta file.

        Args:
          seq_fasta_fn (str): A fasta file
          complete_traceback (bool): If True, all optimal structures are
            requested for each sequence, else only one

        Returns:
          tuple: ``(results, amount_pairs)``. Both are lists with one entry
            per record of the fasta file, in file order: the converted
            structures of the record, and the pair amount reported by
            ``compute_optimal_abstract_structure`` for it.
        """
        results = []
        amount_pairs = []

        # every record carries its sequence together with its id
        for record in parse_fasta(seq_fasta_fn):
            sequence = str(record.seq)

            abstract_structures, amount_pair = \
                self.compute_optimal_abstract_structure(
                    sequence, complete_traceback)

            results.append(
                self.convert_abstract_structure_to_structure(
                    sequence, amount_pair, abstract_structures))
            amount_pairs.append(amount_pair)

        return results, amount_pairs
Ejemplo n.º 2
0
    def run(self, seq1_fasta_fn, seq2_fasta_fn, subst_matrix_fn,
            is_distance_fn, affine_cost_gap_open, affine_cost_gap_extend,
            complete_traceback):
        """
        Align every sequence of seq1_fasta_fn against every sequence of
        seq2_fasta_fn and collect the optimal pairwise alignments.

        Args:
          seq1_fasta_fn (str): The relative path to a fasta file
          seq2_fasta_fn (str): The relative path to a fasta file
          subst_matrix_fn (str): The relative path to a scoring matrix file
          is_distance_fn (bool): If True, handle scoring matrix as distance
            measure, else similarity measure
          affine_cost_gap_open (int): gap cost open
          affine_cost_gap_extend (int): gap cost extend
          complete_traceback (bool): If True, returns all tracebacks, else 1

        Returns:
          tuple: ``(result, Info.OK)`` where ``result[i][j]`` is the tuple
            ``(record1, record2, alignments, score)`` for record i of the
            first file and record j of the second file, or
            ``(None, Info.WRONG_ALPHABET)`` if one of the files fails the
            protein alphabet check.
        """
        # sequences with their ids
        records_f1 = parse_fasta(seq1_fasta_fn)
        records_f2 = parse_fasta(seq2_fasta_fn)

        # scoring function
        scoring_matrix = ScoringMatrix(subst_matrix_fn, is_distance_fn,
                                       affine_cost_gap_open,
                                       affine_cost_gap_extend)

        # both inputs have to pass the protein alphabet check; the second
        # file is only inspected when the first one is legal
        sequences_legal = (
            check_sequences_alphabet(records_f1, SequenceType.PROTEIN)
            and check_sequences_alphabet(records_f2, SequenceType.PROTEIN))
        if not sequences_legal:
            return None, Info.WRONG_ALPHABET

        # one row per record of file 1, one column per record of file 2
        result = []
        for record1 in records_f1:
            row = []
            for record2 in records_f2:
                score, alignments = self.compute_optimal_alignments(
                    str(record1.seq), str(record2.seq), scoring_matrix,
                    complete_traceback)
                row.append((record1, record2, alignments, score))
            result.append(row)

        return result, Info.OK
def test_parse_fast():
    """Check that parse_fasta reads both records of the test fasta file."""
    parsed = parse_fasta("data/test.fasta")

    # the fixture is expected to hold exactly two records
    assert len(parsed) == 2

    first, second = parsed[0], parsed[1]

    # sequences
    assert str(first.seq) == "FancySequenceA"
    assert str(second.seq) == "FancysequenceB"

    # identifiers
    assert first.id == "idA"
    assert second.id == "idB"
            results.append(structures)
            amount_pairs.append(amount_pair)

        return results, amount_pairs


if __name__ == "__main__":
    # Command line entry point: runs Nussinov on the given fasta file and
    # prints the results for every sequence.
    parser = argparse.ArgumentParser(description="Nussinov command line tool")
    parser.add_argument("seq_fasta_fn", type=str)
    # argparse derives the attribute name from the first long option, so the
    # flag is read back as ``args.c``
    parser.add_argument("--c", "--complete_traceback", action='store_true')
    args = parser.parse_args()

    nussinov = Nussinov()

    # sequences with their ids; parsed here as well because ``run`` only
    # returns the structures and the records are needed for the output
    records = parse_fasta(args.seq_fasta_fn)
    complete_traceback = args.c

    results, amount_pairs = nussinov.run(args.seq_fasta_fn, complete_traceback)

    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
    print("Nussinov Results")
    # print("Maximal number of base pairs: %d" % amount_pairs)
    print(
        "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
    # results[i] belongs to records[i]: both follow the order of the file
    for i in range(len(results)):
        sequence = str(records[i].seq)
        # the identifier lives in ``id``; ``seq`` is the sequence itself
        seqID = records[i].id

        structures = results[i]
Ejemplo n.º 5
0
    def run(self, seq_fasta_fn, subst_matrix_fn, is_distance_fn, cost_gap_open,
            metrict_conversion_type, clustering):
        """
        Computes a XPGMA (UPGMA or WPGMA) clustering of the sequences of a
        fasta file, based on their pairwise Needleman-Wunsch scores.

        Args:
          seq_fasta_fn (str): The relative path to a fasta file
          subst_matrix_fn (str): The relative path to a scoring matrix file
          is_distance_fn (bool): If True, handle scoring matrix as distance
            measure, else similarity measure
          cost_gap_open (int): gap cost open
          metrict_conversion_type (int): Only consulted when the scoring
            matrix is not a distance measure. 0 negates the similarity
            score, 1 applies ``similarity_to_distance`` and 2 applies
            ``similarity_to_distance_ext`` to the first alignment of the
            pair. For any other value the distances stay 0.
          clustering (str): either "upgma" or "wpgma"

        Returns:
          The value returned by ``generate_upgma`` / ``generate_wpgma`` for
          the selected clustering, or None if ``clustering`` is neither
          "upgma" nor "wpgma".
        """
        scoring_matrix = ScoringMatrix(subst_matrix_fn, is_distance_fn,
                                       cost_gap_open)
        seq_records = parse_fasta(seq_fasta_fn)
        seq_count = len(seq_records)

        # cluster distance matrix, containing pairwise distance information;
        # additional seq_count - 1 rows/cols are reserved for merged clusters
        m_size = 2 * seq_count - 1
        m = [[0 for _ in range(m_size)] for _ in range(m_size)]

        # iteration list, containing the matrix row/col indices of the
        # current clusters (initially only singleton clusters); this is used
        # to avoid having to clean the matrix after a merge
        l = list(range(seq_count))

        # cluster distance matrix index to Node mapping
        n = {idx: Node(record) for idx, record in enumerate(seq_records)}

        # compute pairwise distances using NW
        # Note: no check if matrix is distance matrix
        # NOTE(review): the info value returned by nw.run is not inspected;
        # presumably result can be unusable when it signals an error - confirm
        nw = NeedlemanWunsch()
        result, _info = nw.run(seq_fasta_fn, seq_fasta_fn, subst_matrix_fn,
                               is_distance_fn, cost_gap_open, False)

        # initialize the upper triangle of the cluster distance matrix;
        # result[i][j][3] is the score, result[i][j][2][0] the first alignment
        for i in range(seq_count):
            for j in range(i + 1, seq_count):
                if scoring_matrix.metric_type == MetricType.DISTANCE:
                    m[i][j] = result[i][j][3]
                elif metrict_conversion_type == 0:
                    m[i][j] = -result[i][j][3]
                elif metrict_conversion_type == 1:
                    m[i][j] = similarity_to_distance(nw, result[i][j][2][0],
                                                     scoring_matrix)
                elif metrict_conversion_type == 2:
                    # assignment, not comparison: the converted distance
                    # has to be stored in the matrix
                    m[i][j] = similarity_to_distance_ext(
                        nw, result[i][j][2][0], scoring_matrix)

        if clustering == "wpgma":
            return self.generate_wpgma(m, l, n)
        if clustering == "upgma":
            return self.generate_upgma(m, l, n)
        # unknown clustering name: nothing is computed
        return None