Python Alignmentの例、praline.container.Alignment Pythonの例

コード例 #1

0

ファイルを表示

ファイル: preprofile.py プロジェクト: ibivu/PRALINE

    def execute(self, master_sequence, slave_sequences, track_id_sets,
                score_matrices):
        """Build a master-anchored alignment from repeated local alignments.

        Every slave sequence is aligned against the master in ``"local"``
        mode ``waterman_eggert_iterations`` times.  After each round the
        bounding box of the reported path is appended to the ``zero_idxs``
        list that is handed to the aligner on every round.  A round is
        merged into the running alignment when ``score_threshold`` is
        ``None`` or the reported score is at least that threshold.

        Yields every message from the sub-executions, one
        ``ProgressMessage`` per slave and a closing ``CompleteMessage`` with
        the alignment stored under ``'alignment'``.
        """
        resolver = self.manager.index

        threshold = self.environment['score_threshold']
        # Not referenced later in this method.
        gap_series = np.array(self.environment['gap_series'], dtype=np.float32)
        rounds = self.environment['waterman_eggert_iterations']

        master_len = len(master_sequence)
        master_path = np.arange(master_len + 1).reshape(master_len + 1, 1)
        alignment = Alignment([master_sequence], master_path)

        for done, slave in enumerate(slave_sequences, start=1):
            # Shared with the aligner across rounds and grown in place.
            blocked_cells = []
            for _ in range(rounds):
                aligner_env = self.environment['aligner_env']
                aligner = resolver.resolve(self.environment['aligner'])
                parent_env = self.environment
                run = Execution(self.manager, self.tag)
                task = run.add_task(aligner)
                task.environment(parent_env, aligner_env)
                task.inputs(mode="local",
                            sequence_one=master_sequence,
                            sequence_two=slave,
                            track_id_sets_one=track_id_sets,
                            track_id_sets_two=track_id_sets,
                            score_matrices=score_matrices,
                            zero_idxs=blocked_cells)

                for message in run.run():
                    yield message
                result = run.outputs[0]

                local_path = result['alignment'].path
                xs = [x for x, _ in local_path]
                ys = [y for _, y in local_path]
                x_lo, x_hi = min(xs), max(xs)
                y_lo, y_hi = min(ys), max(ys)

                # Record every cell of the path's bounding box.
                blocked_cells.extend((x, y)
                                     for x in range(x_lo, x_hi + 1)
                                     for y in range(y_lo, y_hi + 1))

                if threshold is None or result['score'] >= threshold:
                    pair_path = compress_path(np.array(local_path), 0)
                    pair_path = extend_path_local(pair_path,
                                                  len(master_sequence), 0)

                    slave_len = len(slave)
                    slave_path = np.arange(slave_len + 1)
                    slave_path = slave_path.reshape(slave_len + 1, 1)
                    alignment = alignment.merge(
                        Alignment([slave], slave_path), pair_path)

            yield ProgressMessage(done / len(slave_sequences))

        yield CompleteMessage({'alignment': alignment})

コード例 #2

0

ファイルを表示

ファイル: preprofile.py プロジェクト: ibivu/PRALINE

    def execute(self, master_sequence, slave_sequences, track_id_sets,
                score_matrices):
        """Wrap the master sequence alone in a single-sequence alignment.

        ``slave_sequences``, ``track_id_sets`` and ``score_matrices`` are
        accepted but not used.  Yields one ``CompleteMessage`` whose
        ``'alignment'`` output holds the master sequence together with a
        single-column path counting from 0 to ``len(master_sequence)``.
        """
        n_rows = len(master_sequence) + 1
        identity_path = np.arange(n_rows).reshape(n_rows, 1)

        yield CompleteMessage(
            {'alignment': Alignment([master_sequence], identity_path)})

コード例 #3

0

ファイルを表示

ファイル: preprofile.py プロジェクト: ibivu/PRALINE

    def execute(self, master_sequence, slave_sequences, track_id_sets,
                score_matrices):
        """Build a master-anchored alignment from global pairwise alignments.

        Each slave sequence is aligned once against the master in
        ``"global"`` mode by the component named in the ``aligner``
        environment entry.  When ``score_threshold`` is ``None`` or the
        reported score is at least that threshold, the slave is merged into
        the running alignment.

        Yields every message from the sub-executions, one
        ``ProgressMessage`` per slave and a closing ``CompleteMessage`` with
        the alignment stored under ``'alignment'``.
        """
        resolver = self.manager.index

        # Not referenced later in this method.
        gap_series = np.array(self.environment['gap_series'], dtype=int)
        threshold = self.environment['score_threshold']

        master_len = len(master_sequence)
        master_path = np.arange(master_len + 1).reshape(master_len + 1, 1)
        alignment = Alignment([master_sequence], master_path)

        for done, slave in enumerate(slave_sequences, start=1):
            aligner_env = self.environment['aligner_env']
            aligner = resolver.resolve(self.environment['aligner'])
            parent_env = self.environment
            run = Execution(self.manager, self.tag)
            task = run.add_task(aligner)
            task.environment(parent_env, aligner_env)
            task.inputs(mode="global",
                        sequence_one=master_sequence,
                        sequence_two=slave,
                        track_id_sets_one=track_id_sets,
                        track_id_sets_two=track_id_sets,
                        score_matrices=score_matrices)

            for message in run.run():
                yield message
            result = run.outputs[0]

            if threshold is None or result['score'] >= threshold:
                pair_path = compress_path(
                    np.array(result['alignment'].path), 0)

                slave_len = len(slave)
                slave_path = np.arange(slave_len + 1)
                slave_path = slave_path.reshape(slave_len + 1, 1)
                alignment = alignment.merge(
                    Alignment([slave], slave_path), pair_path)

            yield ProgressMessage(done / len(slave_sequences))

        yield CompleteMessage({'alignment': alignment})

コード例 #4

0

ファイルを表示

    def execute(self, sequences, track_id_sets, score_matrices):
        """Progressively merge per-sequence clusters into one alignment.

        Starts with one single-sequence alignment per input sequence and the
        clusters returned by ``_initial_clusters``.  While more than one
        cluster remains, ``_merge_indices`` selects a pair ``(i, j)``, the
        configured aligner is run on the two clusters, and cluster and
        alignment ``j`` are merged into ``i`` along the resulting path.

        Environment keys read: ``debug``, ``merge_mode``, ``dist_mode``,
        ``log_track_ids``, ``aligner`` and ``aligner_env``.

        Yields the messages of ``_merge_indices`` and of each sub-execution,
        a ``ProgressMessage`` per merge step, a ``LogMessage`` when
        ``debug > 0`` and finally a ``CompleteMessage`` whose ``'alignment'``
        output is the first remaining alignment.

        Raises ``ComponentError`` when ``merge_mode`` or ``dist_mode`` is not
        one of ``'global'``, ``'semiglobal'`` or ``'semiglobal_auto'``.
        """
        debug = self.environment['debug']
        merge_mode = self.environment['merge_mode']
        dist_mode = self.environment['dist_mode']
        log_track_ids = self.environment['log_track_ids']

        # ``log`` only exists when debug > 0; every later use is guarded by
        # ``debug > 0`` or ``debug > 1``.
        if debug > 0:
            log = LogBundle()
            msg = "Entering component '{0}'".format(self.tid)
            log.message(ROOT_LOG_NAME, msg)

            msg = "Distance mode = '{0}', merge mode = '{1}'"
            msg = msg.format(dist_mode, merge_mode)
            log.message(ROOT_LOG_NAME, msg)

        # NOTE(review): validation runs after the LogBundle is created, so an
        # invalid mode raises without log.delete() being called.
        if merge_mode not in {'global', 'semiglobal', 'semiglobal_auto'}:
            msg = "unknown merge mode '{0}'".format(merge_mode)
            raise ComponentError(msg)
        if dist_mode not in {'global', 'semiglobal', 'semiglobal_auto'}:
            msg = "unknown distance mode '{0}'".format(dist_mode)
            raise ComponentError(msg)

        index = self.manager.index

        seqs = sequences

        # One single-sequence alignment per input, keyed by input position;
        # each path is the column vector 0..len(seq).
        alignments = {
            i: Alignment([seq],
                         np.arange(len(seq) + 1).reshape(len(seq) + 1, 1))
            for i, seq in enumerate(seqs)
        }
        clusters = self._initial_clusters(seqs, track_id_sets)

        cur_step = 0
        # Each loop iteration removes exactly one cluster.
        total_steps = len(clusters) - 1
        while len(clusters) > 1:
            if debug > 0:
                msg = "Step {0} of {1}".format(cur_step + 1, total_steps)
                log.message(ROOT_LOG_NAME, msg)

            # _merge_indices is consumed as a generator; its choice is read
            # back from self._merge_i / self._merge_j afterwards.
            for msg in self._merge_indices(clusters, dist_mode, track_id_sets,
                                           score_matrices):
                yield msg
            i, j = self._merge_i, self._merge_j

            alignment_one = alignments[i]
            alignment_two = alignments[j]

            cluster_one = clusters[i]
            cluster_two = clusters[j]

            if debug > 0:
                msg = "Merging cluster {0} into {1}".format(j, i)
                log.message(ROOT_LOG_NAME, msg)
                header_fmt = "Cluster {0}:"
                name_fmt = "\t{0}"
                log.message(ROOT_LOG_NAME, header_fmt.format(j))
                for item in alignment_two.items:
                    log.message(ROOT_LOG_NAME, name_fmt.format(item.name))
                log.message(ROOT_LOG_NAME, header_fmt.format(i))
                for item in alignment_one.items:
                    log.message(ROOT_LOG_NAME, name_fmt.format(item.name))

            # At debug level 2+, dump both input alignments once per logged
            # track id.
            if debug > 1:
                s = 'step{0}_input_{1}_track_{2}.aln'
                for track_id in log_track_ids:
                    input_one_filename = s.format(cur_step + 1, 1, track_id)
                    input_two_filename = s.format(cur_step + 1, 2, track_id)
                    write_alignment_clustal(log.path(input_one_filename),
                                            alignment_one, track_id, None)
                    write_alignment_clustal(log.path(input_two_filename),
                                            alignment_two, track_id, None)

            # Translate the merge mode into the aligner's mode name.  The
            # chain is exhaustive because merge_mode was validated above.
            if merge_mode == "semiglobal":
                align_mode = "semiglobal_both"
            elif merge_mode == "global":
                align_mode = "global"
            elif merge_mode == "semiglobal_auto":
                align_mode = auto_align_mode(cluster_one, cluster_two)

            # Run the configured aligner on the two clusters as a sub-task.
            sub_env = self.environment['aligner_env']
            sub_component = index.resolve(self.environment['aligner'])
            root_env = self.environment
            execution = Execution(self.manager, self.tag)
            task = execution.add_task(sub_component)
            task.environment(root_env, sub_env)
            task.inputs(mode=align_mode,
                        sequence_one=cluster_one,
                        sequence_two=cluster_two,
                        track_id_sets_one=track_id_sets,
                        track_id_sets_two=track_id_sets,
                        score_matrices=score_matrices)

            if debug > 0:
                msg = "Starting task '{0}' for pairwise alignment"
                msg = msg.format(task.tag)
                log.message(ROOT_LOG_NAME, msg)

            for msg in execution.run():
                yield msg
            outputs = execution.outputs[0]

            if debug > 0:
                msg = "Alignment score = {0}".format(outputs["score"])
                log.message(ROOT_LOG_NAME, msg)

            path_profile = np.array(outputs['alignment'].path)

            # Merge every listed track of cluster two into cluster one along
            # the path.  The old track is deleted inside the loop and the
            # merged tracks are added back only after the loop finishes.
            replace_tracks = []
            for track_id_set in track_id_sets:
                for track_id in track_id_set:
                    track_one = cluster_one.get_track(track_id)
                    track_two = cluster_two.get_track(track_id)

                    track_new = track_one.merge(track_two, path_profile)

                    replace_tracks.append((track_id, track_new))

                    cluster_one.del_track(track_id)

            for track_id, track_new in replace_tracks:
                cluster_one.add_track(track_id, track_new)

            alignments[i] = alignment_one.merge(alignment_two, path_profile)

            if debug > 1:
                for track_id in log_track_ids:
                    s = 'step{0}_output_track_{1}.aln'
                    output_filename = s.format(cur_step + 1, track_id)
                    write_alignment_clustal(log.path(output_filename),
                                            alignments[i], track_id, None)

            # Entry j has been absorbed into entry i.
            del clusters[j]
            del alignments[j]

            cur_step += 1
            yield ProgressMessage(progress=cur_step / total_steps)

        if debug > 0:
            msg = "Done!"
            log.message(ROOT_LOG_NAME, msg)

            # Archive the log bundle, delete it, and report the archive
            # location to the caller.
            archive_path = log.archive()
            log.delete()

            yield LogMessage(path_to_url(archive_path))

        # NOTE(review): with no input sequences this indexing would fail on
        # an empty list - confirm callers never pass an empty input.
        yield CompleteMessage(
            outputs={'alignment': list(alignments.values())[0]})

コード例 #5

0

ファイルを表示

def load_alignment_fasta(f, alphabet, encoding='utf-8'):
    """Load a gapped (aligned) FASTA file into an ``Alignment``.

    Lines are obtained from ``_get_lines(f, encoding)``.  A line starting
    with ``'>'`` opens a new record; the following lines, stripped of
    surrounding whitespace, are concatenated into that record's aligned
    sequence.  Records with an empty header or an empty sequence are
    dropped.  The ``'-'`` symbol is treated as a gap.

    Blank lines before the first header are ignored.

    Parameters:
        f: input handed to ``_get_lines``.
        alphabet: alphabet passed to ``PlainTrack`` for every sequence.
        encoding: text encoding handed to ``_get_lines``.

    Returns:
        An ``Alignment`` of the ungapped sequences plus a path array of
        shape ``(alignment_length + 1, n_sequences)``.  Entry ``[i, j]`` is
        the number of non-gap symbols among the first ``i`` columns of
        sequence ``j``.

    Raises:
        DataError: when non-blank data precedes the first header, when the
            aligned sequences differ in length, or when there are no
            records.
    """
    headers = []
    aln_seqs = []

    header = None
    # Must exist before the first header line is seen; leaving it unbound
    # made any leading line fail with UnboundLocalError.
    seq = ""
    for line in _get_lines(f, encoding):
        if line.startswith('>'):
            if header and seq:
                headers.append(header)
                aln_seqs.append(seq)
            header = line[1:].rstrip()
            seq = ""
            continue

        if header is None:
            # Nothing to attach this line to yet: tolerate blank lines but
            # refuse to silently discard real data.
            if line.strip():
                raise DataError("sequence data found before the first header")
            continue

        seq += line.strip()

    if header and seq:
        headers.append(header)
        aln_seqs.append(seq)

    # Every aligned sequence must have the length of the first one.
    consensus_len = len(aln_seqs[0]) if aln_seqs else None
    for aln_seq in aln_seqs[1:]:
        if len(aln_seq) != consensus_len:
            s = "length {0} does not match consensus length {1}"
            s = s.format(len(aln_seq), consensus_len)

            raise DataError(s)

    if consensus_len is None:
        s = "the input file contains no records"
        raise DataError(s)

    # Row 0 stays all zeros; each later row advances a sequence's counter
    # only where that sequence has a non-gap symbol.
    path = np.zeros((consensus_len + 1, len(aln_seqs)), dtype=int)
    seqs = []
    for j, aln_seq in enumerate(aln_seqs):
        is_symbol = np.array([sym != '-' for sym in aln_seq], dtype=bool)
        path[1:, j] = np.cumsum(is_symbol, dtype=int)
        seqs.append([sym for sym in aln_seq if sym != '-'])

    seq_objs = []
    for header, symbols in zip(headers, seqs):
        track = PlainTrack(symbols, alphabet)
        seq_objs.append(Sequence(header, [(TRACK_ID_INPUT, track)]))

    return Alignment(seq_objs, path)

コード例 #6

0

ファイルを表示

ファイル: manager.py プロジェクト: ManyCore-NLeSC/whole-genome-tool

    def execute_many(self, requestss, parent_tag):
        """Run a batch of requests, offloading them to the remote server.

        ``requestss`` is a sequence of ``(tid, inputs, tag, env)`` tuples and
        is iterated more than once.  The whole batch is delegated to the
        superclass unless ``use_our_stuff`` is set and every ``tid`` is in
        ``_INTERCEPT_TIDS``.  Otherwise each request's score matrices, guide
        tree and sequences are sent to ``SERVER``, processing is triggered
        with ``/processtrees``, and each request's result is polled for.

        Yields one ``BeginMessage`` per request while registering, then one
        ``CompleteMessage`` per request carrying that request's tag.  In the
        delegated case it yields whatever the superclass yields.

        Raises ``ComponentError`` when a request's ``gap_series`` has neither
        one nor two entries.
        """
        not_msa = not use_our_stuff
        for tid, inputs, tag, env in requestss:
            if tid not in _INTERCEPT_TIDS:
                not_msa = True
        if not_msa:
            # We're not handling these tasks in this manager, so execute
            # them normally using the functionality of the superclass.
            gen = super(ConstellationManager,
                        self).execute_many(requestss, parent_tag)
            for message in gen:
                yield message

            return
        start = time.time()

        trees = []
        # Sequence order of each request's final alignment, parallel to
        # ``trees``.  Kept per request so results are not all built from the
        # last request's sequences.
        merged_seqs = []
        for tid, inputs, tag, env in requestss:
            # Announce this request only.  A nested loop over ``requestss``
            # here would rebind tid/inputs/tag/env to the last request and
            # emit a begin message for every request on every pass.
            begin_message = BeginMessage(parent_tag)
            begin_message.tag = tag
            yield begin_message

            score_matrices = inputs.get("score_matrices", None)
            seqs = inputs.get("sequences", None)
            tree = inputs.get("guide_tree", None)
            track_id_sets = inputs.get("track_id_sets", None)
            gap_series = env['gap_series']
            merge_mode = env['merge_mode']

            # Validate before anything is sent to the server.
            if len(gap_series) == 2:
                start_gap, extend_gap = gap_series[0], gap_series[1]
            elif len(gap_series) == 1:
                start_gap = extend_gap = gap_series[0]
            else:
                raise ComponentError("NO GAP COST!!!")

            s = [sm.matrix.astype(np.float32) for sm in score_matrices]
            # NOTE(review): names derive from the clock only, so two requests
            # registered within the timer's string resolution would share a
            # name - confirm whether the server needs them to be unique.
            uniquestr = str(time.time())
            costName = "cost" + uniquestr
            treeName = "tree" + uniquestr
            sendCosts(costName, s)

            # Replay the merge order locally to learn the final order of the
            # sequences in cluster 0.
            alignmentsi = {i: [seq] for i, seq in enumerate(seqs)}
            for i, j in tree.merge_orders:
                alignmentsi[i] += alignmentsi[j]
            merged_seqs.append(alignmentsi[0])

            data = ' '.join(
                [str(i) + "," + str(j) for i, j in tree.merge_orders])

            reqstring = "/register/tree/" + treeName + "/" + str(
                len(seqs)) + "/" + costName + "/" + merge_mode + "/" + str(
                    start_gap) + "/" + str(extend_gap)
            requests.post(SERVER + reqstring, data=data)

            for i, seq in enumerate(seqs):
                sendSequence(treeName, i,
                             [seq.get_track(t[0]) for t in track_id_sets], s)
            trees.append(treeName)

        end = time.time()
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("Sending jobs took " + str(end - start) + " seconds")
        start = time.time()
        requests.get(SERVER + "/processtrees")

        first = True
        for (tid, inputs, tag, env), tree_name, seq_order in zip(
                requestss, trees, merged_seqs):
            # NOTE(review): this polls back to back and never gives up on a
            # status other than 200 - consider a delay and an error path.
            while True:
                req = requests.get(SERVER + "/retrieve/steps/" + tree_name)
                if req.status_code == 200:
                    if first:
                        end = time.time()
                        print("Waiting for result " + str(end - start) +
                              " seconds")
                        start = end
                        first = False
                    # Response text: space-separated rows of ';'-separated
                    # integers.
                    res = np.array([[int(d) for d in c.split(';')]
                                    for c in req.text.split(' ')])
                    outputs = {}
                    outputs['alignment'] = Alignment(seq_order, res)

                    complete_message = CompleteMessage(outputs=outputs)
                    complete_message.tag = tag
                    yield complete_message
                    break
        end = time.time()
        print("Obtaining results took " + str(end - start) + " seconds")

コード例 #7

0

ファイルを表示

    def execute(self, mode, sequence_one, sequence_two, match_score_model,
                gap_score_model_one, gap_score_model_two, zero_idxs):
        """Align two sequences by dynamic programming and report the result.

        Parameters:
            mode: one of ``"local"``, ``"semiglobal_both"``,
                ``"semiglobal_one"``, ``"semiglobal_two"`` or ``"global"``.
            sequence_one, sequence_two: the items placed in the resulting
                ``Alignment``; their ``name`` is logged when debugging.
            match_score_model, gap_score_model_one, gap_score_model_two:
                objects whose ``scores`` arrays are handed to the alignment
                function.
            zero_idxs: ``None``, or an iterable of indices into the
                ``(rows + 1, cols + 1)`` mask; each listed cell is set to 1.

        Environment keys read: ``debug`` and ``accelerate`` (selects the
        ``_CEXT_ALIGN_FUNCTIONS`` table over ``_ALIGN_FUNCTIONS``).

        Yields a ``LogMessage`` when ``debug > 0`` and then a
        ``CompleteMessage`` whose outputs are ``'alignment'`` (built from
        the first traced path) and ``'score'`` (a float).

        Raises ``ComponentError`` for an unknown mode, or when the chosen
        function table has no entry for the mode.
        """
        debug = self.environment['debug']
        accelerate = self.environment['accelerate']

        # ``log`` only exists when debug > 0; later uses are guarded the
        # same way.
        if debug > 0:
            log = LogBundle()
            msg = "Entering component '{0}'".format(self.tid)
            log.message(ROOT_LOG_NAME, msg)

            log.message(ROOT_LOG_NAME, "Alignment mode: '{0}'".format(mode))

            msg = "Sequence one: '{0}', sequence two: '{1}'"
            msg = msg.format(sequence_one.name, sequence_two.name)
            log.message(ROOT_LOG_NAME, msg)

        # We only implement a limited set of alignment modes so far.
        # Each accepted mode maps to an identically named alignment type.
        if mode == "local":
            alignment_type = "local"
        elif mode == "semiglobal_both":
            alignment_type = "semiglobal_both"
        elif mode == "semiglobal_one":
            alignment_type = "semiglobal_one"
        elif mode == "semiglobal_two":
            alignment_type = "semiglobal_two"
        elif mode == "global":
            alignment_type = "global"
        else:
            s = "unknown alignment mode: '{0}'".format(mode)
            raise ComponentError(s)

        # Pick the implementation table, then the function for this type.
        try:
            if accelerate:
                align_fun = _CEXT_ALIGN_FUNCTIONS[alignment_type]
            else:
                align_fun = _ALIGN_FUNCTIONS[alignment_type]
        except KeyError:
            s = "alignment not implemented for {0}"
            s = s.format(alignment_type)
            raise ComponentError(s)

        # Retrieve score arrays from model objects.
        m = match_score_model.scores
        g1 = gap_score_model_one.scores
        g2 = gap_score_model_two.scores

        # Setup the arrays which we will pass to the C alignment component.
        # We pre-allocate everything the C code needs in terms of memory here
        # since dealing with python memory management from C is kind of a
        # pain.
        #
        # NOTE: the C side of things performs no type or bounds checking
        # at all. If you pass a smaller array than it is expecting or an
        # array of a different data type, it'll happily read outside of the
        # buffer and crash PRALINE. So you need to be very careful here.
        #
        # o: float32 score array, t: uint8 traceback array, both with three
        # layers per cell; z: uint8 mask, one row and column larger than m.
        shape = (m.shape[0] + 1, m.shape[1] + 1)
        o = np.zeros((shape[0], shape[1], 3), dtype=np.float32)
        t = np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
        z = np.zeros(shape, dtype=np.uint8)
        if zero_idxs is not None:
            for idx in zero_idxs:
                z[idx] = 1

        # Initialize the matrices for the dynamic programming function, as well
        # as the traceback matrices.
        # The first row and column start at MINUS_INFINITY in every layer,
        # except the origin of layer 0.
        o[:, 0, :] = MINUS_INFINITY
        o[0, :, :] = MINUS_INFINITY
        o[0, 0, 0] = 0

        # Layer 1, first column: zero for these two semiglobal modes,
        # otherwise filled from g1 with a matching traceback marker.
        # NOTE(review): the column meanings of g1/g2 are not visible here -
        # presumably gap open and extend scores; confirm against the models.
        if mode in {"semiglobal_both", "semiglobal_one"}:
            o[:, 0, 1] = 0
        else:
            o[0, 0, 1] = g1[0, 0] - g1[0, 1]
            o[1:, 0, 1] = (np.arange(o.shape[0] - 1) * g1[:, 1]) + g1[0, 0]

            t[1:, 0, 1] = TRACEBACK_INSERT_UP_EXTEND

        # Layer 2, first row: the same scheme using g2.
        if mode in {"semiglobal_both", "semiglobal_two"}:
            o[0, :, 2] = 0
        else:
            o[0, 0, 2] = g2[0, 0] - g2[0, 1]
            o[0, 1:, 2] = (np.arange(o.shape[1] - 1) * g2[:, 1]) + g2[0, 0]

            t[0, 1:, 2] = TRACEBACK_INSERT_LEFT_EXTEND

        # Run the selected alignment routine on the pre-allocated arrays;
        # o and t are read back below.
        align_fun(m, g1, g2, o, t, z)

        if debug > 1:
            log.message(ROOT_LOG_NAME, "Dumping DP & traceback matrices...")

            np.savetxt(log.path("dp_0_matrix.csv"), o[:, :, 0], delimiter=",")
            np.savetxt(log.path("dp_1_matrix.csv"), o[:, :, 1], delimiter=",")
            np.savetxt(log.path("dp_2_matrix.csv"), o[:, :, 2], delimiter=",")

            np.savetxt(log.path("tb_0_matrix.csv"), t[:, :, 0], delimiter=",")
            np.savetxt(log.path("tb_1_matrix.csv"), t[:, :, 1], delimiter=",")
            np.savetxt(log.path("tb_2_matrix.csv"), t[:, :, 2], delimiter=",")

        # Choose the cell where traceback starts, depending on the mode.
        if mode == "local":
            # Start from the highest-scoring cell anywhere in o.
            cell = np.unravel_index(o.argmax(), o.shape)
            score = o[cell]
            paths = get_paths(t, cell=cell)
        elif mode in {"semiglobal_both", "semiglobal_one", "semiglobal_two"}:
            # NOTE: ``m`` is rebound here from the match score array to the
            # column count of o.
            n, m, _ = o.shape
            last_row = o[n - 1, :, :]
            last_col = o[:, m - 1, :]
            last_row_max = last_row.max()
            last_col_max = last_col.max()
            trace_from_row = mode in {"semiglobal_both", "semiglobal_two"}

            # Start in the last row only when its maximum is strictly larger
            # and the mode allows it; otherwise start in the last column.
            # Both scans run from the far end, so the matching cell with the
            # highest index is taken.
            if last_row_max > last_col_max and trace_from_row:
                for i, j in itertools.product(range(m - 1, -1, -1), range(3)):
                    if last_row[i, j] == last_row_max:
                        cell = (n - 1, i, j)
                        break
            else:
                for i, j in itertools.product(range(n - 1, -1, -1), range(3)):
                    if last_col[i, j] == last_col_max:
                        cell = (i, m - 1, j)
                        break

            score = o[cell]
            paths = [np.array(path) for path in get_paths(t, cell=cell)]
            paths = [extend_path_semiglobal(path, (n, m)) for path in paths]
        elif mode == "global":
            # Start from the bottom-right cell, in its best-scoring layer.
            y, x = o.shape[0] - 1, o.shape[1] - 1
            cell = (y, x, np.argmax(o[y, x, :]))
            score = o[cell]
            paths = get_paths(t, cell=cell)

        # Only the first traced path is reported.
        alignment = Alignment([sequence_one, sequence_two], paths[0])
        outputs = {'alignment': alignment, 'score': float(score)}

        if debug > 0:
            log.message(ROOT_LOG_NAME, "Alignment score: {0}".format(score))

            msg = "Done!"
            log.message(ROOT_LOG_NAME, msg)

            # Archive the log bundle, delete it, and report the archive
            # location to the caller.
            archive_path = log.archive()
            log.delete()

            yield LogMessage(path_to_url(archive_path))

        yield CompleteMessage(outputs=outputs)