Ejemplo n.º 1
0
    def generatePoaGraph(self):
        """
        Build a POA graph (multiple sequence alignment) from the subreads.

        The graph is seeded with ``self.reference`` when that attribute is
        truthy; otherwise the last subread is removed from a working copy of
        ``self.subreads`` and used as the seed.  Whichever object seeds the
        graph is also stored on ``self.root_subread``.  Every remaining
        subread is oriented with ``self._check_direction`` and then aligned
        into the graph.

        :return: the POA graph holding the seed and all aligned subreads
        """
        pending = list(self.subreads)
        if not self.reference:
            # no reference provided: the last subread seeds the graph
            seed = pending.pop()
            self.root_subread = seed
            seed_seq = seed.read(aligned=False)
            seed_label = seed.qName
        else:
            # the reference is the seeded sequence
            seed = self.reference
            self.root_subread = seed
            seed_seq = str(seed.sequence)
            seed_label = seed.header

        poa = poagraph.POAGraph(seed_seq, label=seed_label)
        for read in pending:
            # _check_direction compares the read with the seed (levenshtein
            # distance, per the original author's note) to decide whether it
            # must be reverse-complemented before joining the MSA
            oriented_seq = self._check_direction(
                read.read(aligned=False), seed_seq)
            read_label = read.qName

            aln = seqgraphalignment.SeqGraphAlignment(
                oriented_seq, poa, fastMethod=True, globalAlign=True)
            poa.incorporateSeqAlignment(aln, oriented_seq, label=read_label)

        return poa
Ejemplo n.º 2
0
def test_equal_strings(fast, glob, n_sequences):
    """
    Check that aligning identical sequences gives identical alignment rows.

    ``n_sequences`` copies of one fixed sequence are aligned into a single
    POA graph; every generated alignment string must equal the first one.

    :param fast: forwarded as ``fastMethod`` to the aligner
    :param glob: forwarded as ``globalAlign`` to the aligner
    :param n_sequences: number of identical sequences to align
        (presumably supplied by test parametrization -- confirm in the
        surrounding test module)
    """
    sequence = "TATACCGGCG"
    sequences = [sequence] * n_sequences
    align_options = dict(fastMethod=fast,
                         globalAlign=glob,
                         matchscore=1,
                         mismatchscore=-1,
                         gapscore=-2)

    graph = poagraph.POAGraph(sequences[0], "0")
    # the first sequence seeds the graph; the rest are labelled "1", "2", ...
    for index, candidate in enumerate(sequences[1:], start=1):
        alignment = seqgraphalignment.SeqGraphAlignment(
            candidate, graph, **align_options)
        graph.incorporateSeqAlignment(alignment, sequence, str(index))

    rows = graph.generateAlignmentStrings()
    same_as_first = [rows[0][1] == row for _, row in rows]
    assert all(same_as_first)
Ejemplo n.º 3
0
def generate_poa_graph(sequences):
    """
    Build a POA graph by aligning every sequence into it in order.

    The first sequence seeds the graph with label "0"; each later sequence
    is globally aligned (``fastMethod`` disabled) against the graph built so
    far and incorporated under its position in ``sequences`` as label.

    :param sequences: sequences to align; must contain at least one entry
    :return: graph: the completed POA graph resulting from the given sequences
    """
    align_options = dict(fastMethod=False,
                         globalAlign=True,
                         matchscore=1,
                         mismatchscore=-1,
                         gapscore=-2)

    poa = poagraph.POAGraph(sequences[0], "0")
    for position, current in enumerate(sequences[1:], start=1):
        aligned = seqgraphalignment.SeqGraphAlignment(
            current, poa, **align_options)
        poa.incorporateSeqAlignment(aligned, current, str(position))

    return poa
Ejemplo n.º 4
0
    def generatePoaGraph(self):
        """
        Build a POA graph (multiple sequence alignment) from the subreads.

        The last subread is removed from a working copy of ``self.subreads``,
        stored on ``self.root_subread`` and used to seed the graph.  Every
        remaining subread is oriented with ``self._check_direction`` against
        the seed sequence and then aligned into the graph.

        :return: the POA graph holding the seed and all aligned subreads
        """
        pending = list(self.subreads)
        seed = pending.pop()
        self.root_subread = seed
        seed_seq = seed.read(aligned=False)
        poa = poagraph.POAGraph(seed_seq, label=seed.qName)

        for read in pending:
            # orient the read relative to the seed before aligning it
            oriented_seq = self._check_direction(
                read.read(aligned=False), seed_seq)
            read_label = read.qName

            aln = seqgraphalignment.SeqGraphAlignment(
                oriented_seq, poa, fastMethod=True, globalAlign=True)
            poa.incorporateSeqAlignment(aln, oriented_seq, label=read_label)

        return poa
Ejemplo n.º 5
0
#!/usr/bin/env python
from __future__ import print_function
import argparse
import sys
import numpy
import poagraph
import seqgraphalignment
import simplefasta

if __name__ == "__main__":
    # Command-line entry point: read FASTA records from a file (or stdin),
    # align them one by one into a partial-order alignment graph seeded with
    # the first record, and print one "label alignment" row per record.
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('-g','--globalAlign', action='store_true', help='Global alignment, or (default) local alignment')
    args = parser.parse_args()

    fasta = simplefasta.readfasta(args.infile)
    if len(fasta) == 0:
        # The first record seeds the graph; with no records there is nothing
        # to align, so report a usage error instead of an IndexError traceback.
        parser.error("no sequences found in input")

    # Records are (label, sequence) pairs; the first one seeds the graph.
    graph = poagraph.POAGraph(fasta[0][1], fasta[0][0])
    for label, sequence in fasta[1:]:
        alignment = seqgraphalignment.SeqGraphAlignment(sequence, graph, globalAlign=args.globalAlign)
        graph.incorporateSeqAlignment(alignment, sequence, label)

    alignments = graph.generateAlignmentStrings()
    for label, alignstring in alignments:
        print("{0:15s} {1:s}".format(label, alignstring))

    
def InfoShield_MDL(pads, output_path):
    """
    Greedily group sequences under templates, keeping a template only when
    it lowers the running total encoding cost.

    Each round takes the first remaining label as a seed and builds a POA
    graph from its sequence.  Every other remaining sequence is aligned
    against a snapshot of the seed-only graph; when the alignment's encoding
    cost is below ``sequence_cost`` of the sequence, it joins the group and
    is incorporated into the growing graph.  For groups with more than one
    member a template is derived via ``dichotomous_search`` followed by
    ``slot_identify``, the candidate total cost is computed, and the template
    is kept if that cost is lower than the current one.  The group is removed
    from the pool at the end of every round, whether or not a template was
    kept, so each round shrinks the pool by at least the seed.

    :param pads: mapping from label to sequence (read through ``.items()``
        and ``pads[label]``)
    :param output_path: forwarded unchanged to ``output_results``
    :return: tuple ``(saving, temp_dict, elapsed)`` where ``saving`` is
        ``(init_cost - final_cost) / init_cost``, ``temp_dict`` maps the
        1-based index of each kept template to the array of labels in its
        group, and ``elapsed`` is the wall-clock time of the grouping loop
        in seconds
    """
    # NOTE(review): with an empty ``pads`` the initial cost is 0 and the
    # returned ratio is a 0/0 division -- confirm callers never pass that.
    init_cost = prev_total_cost = np.sum(
        [sequence_cost(seq) for _, seq in pads.items()]) + len(pads)
    gid_arr = np.array([label for label, _ in pads.items()])

    start0 = time.time()
    temp_arr, cond_arr, temp_dict = [], [], {}
    while len(gid_arr) > 0:
        # Seed this round's graph with the first remaining sequence.
        seed_label = gid_arr[0]
        seed_sequence = pads[seed_label]
        graph = poagraph.POAGraph(seed_sequence, seed_label)
        gid = [0]  # positions (into gid_arr) of the current group's members
        seq_total_cost = sequence_cost(seed_sequence)
        # Candidates are screened against the seed-only graph, not against
        # the graph that grows as members are incorporated.
        graph_0 = copy.deepcopy(graph)

        for position, label in enumerate(gid_arr[1:], start=1):
            sequence = pads[label]
            alignment = seqgraphalignment.SeqGraphAlignment(sequence, graph_0)
            align_mdl, _ = alignment.alignment_encoding_cost()
            seq_cost = sequence_cost(sequence)

            if align_mdl < seq_cost:
                gid.append(position)
                alignment = seqgraphalignment.SeqGraphAlignment(
                    sequence, graph)
                graph.incorporateSeqAlignment(alignment, sequence, label)
                seq_total_cost += seq_cost

        if len(gid) > 1:
            template, _ = dichotomous_search(pads, gid_arr, gid, graph)
            template = slot_identify(pads, gid_arr, gid, template)

            # Cost of encoding every group member against the template.
            align_cost, c_arr = 0, []
            for member in gid:
                sequence = pads[gid_arr[member]]
                alignment = seqgraphalignment.SeqGraphAlignment(
                    sequence, template)
                cost, cond = alignment.alignment_encoding_cost()
                align_cost += cost
                c_arr.append(cond)

            # Candidate total: drop the members' stand-alone costs, swap the
            # bookkeeping terms for len(temp_arr) templates with those for
            # len(temp_arr) + 1, then add the template and alignment costs.
            total_cost = prev_total_cost - seq_total_cost
            if len(temp_arr) != 0:
                total_cost -= log_star(len(temp_arr)) + len(gid_arr) * ceil(
                    np.log2(len(temp_arr)))
            total_cost += (len(gid_arr) + len(gid)) * ceil(
                np.log2(len(temp_arr) + 1))
            total_cost += log_star(len(temp_arr) +
                                   1) + template.encoding_cost() + align_cost

            # Check whether total cost decreases by this template
            if total_cost < prev_total_cost:
                prev_total_cost = total_cost
                temp_arr.append(template)
                cond_arr.append(c_arr)
                temp_dict[len(temp_arr)] = gid_arr[gid]

        # Delete the assigned sequences
        gid_arr = np.delete(gid_arr, gid)

    end0 = time.time()
    output_results(temp_arr, cond_arr, output_path)
    return (init_cost - prev_total_cost) / init_cost, temp_dict, end0 - start0