def execute(self, master_sequence, slave_sequences, track_id_sets, score_matrices):
    """Build a master-slave alignment from repeated local pairwise alignments.

    The result starts out as an alignment that holds only the master
    sequence. Every slave sequence is then aligned against the master by
    the component named in ``environment['aligner']`` (run in ``"local"``
    mode with ``environment['aligner_env']``), and this is repeated
    ``environment['waterman_eggert_iterations']`` times per slave. Every run
    whose score is at least ``environment['score_threshold']`` (or every
    run, when the threshold is ``None``) is merged into the result, so a
    single slave can be merged more than once.

    After each run the bounding box of the returned path is appended to
    ``zero_idxs``, which is handed to the following runs for the same slave
    (presumably so later iterations find a different local alignment --
    confirm against the aligner component).

    Yields:
        Every message produced by the sub-executions, a ``ProgressMessage``
        after each slave sequence and finally a ``CompleteMessage`` whose
        outputs contain the merged alignment under ``'alignment'``.
    """
    index = self.manager.index
    score_threshold = self.environment['score_threshold']
    iterations = self.environment['waterman_eggert_iterations']

    master_len = len(master_sequence)
    master_path = np.arange(master_len + 1).reshape(master_len + 1, 1)
    alignment = Alignment([master_sequence], master_path)

    for j, seq_slave in enumerate(slave_sequences):
        # Cells already covered by earlier local alignments of this slave.
        zero_idxs = []

        for _ in range(iterations):
            sub_env = self.environment['aligner_env']
            sub_component = index.resolve(self.environment['aligner'])

            execution = Execution(self.manager, self.tag)
            task = execution.add_task(sub_component)
            task.environment(self.environment, sub_env)
            task.inputs(mode="local",
                        sequence_one=master_sequence,
                        sequence_two=seq_slave,
                        track_id_sets_one=track_id_sets,
                        track_id_sets_two=track_id_sets,
                        score_matrices=score_matrices,
                        zero_idxs=zero_idxs)

            for msg in execution.run():
                yield msg

            outputs = execution.outputs[0]
            local_path = outputs['alignment'].path

            # Register the bounding box of this path, whether or not the
            # alignment is accepted below.
            xs = [x for x, _ in local_path]
            ys = [y for _, y in local_path]
            zero_idxs.extend(
                (x, y)
                for x in range(min(xs), max(xs) + 1)
                for y in range(min(ys), max(ys) + 1)
            )

            score = outputs['score']
            if score_threshold is None or score >= score_threshold:
                merge_path = compress_path(np.array(local_path), 0)
                merge_path = extend_path_local(merge_path, master_len, 0)

                slave_len = len(seq_slave)
                slave_path = np.arange(slave_len + 1).reshape(slave_len + 1, 1)
                slave_alignment = Alignment([seq_slave], slave_path)

                alignment = alignment.merge(slave_alignment, merge_path)

        # NOTE(review): under Python 2 without ``from __future__ import
        # division`` this is an integer division -- confirm.
        yield ProgressMessage((j + 1) / len(slave_sequences))

    yield CompleteMessage({'alignment': alignment})
def execute(self, master_sequence, slave_sequences, track_id_sets, score_matrices):
    """Return an alignment that contains nothing but the master sequence.

    ``slave_sequences``, ``track_id_sets`` and ``score_matrices`` are
    accepted but not used.

    Yields:
        A single ``CompleteMessage`` whose outputs hold the one-sequence
        alignment under ``'alignment'``.
    """
    n_rows = len(master_sequence) + 1
    # One-column path 0, 1, ..., len(master_sequence).
    identity_path = np.arange(n_rows).reshape(n_rows, 1)
    master_only = Alignment([master_sequence], identity_path)
    yield CompleteMessage({'alignment': master_only})
def execute(self, master_sequence, slave_sequences, track_id_sets, score_matrices):
    """Build a master-slave alignment from global pairwise alignments.

    The result starts out as an alignment that holds only the master
    sequence. Every slave sequence is aligned against the master once by
    the component named in ``environment['aligner']`` (run in ``"global"``
    mode with ``environment['aligner_env']``). A slave is merged into the
    result when its score is at least ``environment['score_threshold']``,
    or always when the threshold is ``None``.

    Yields:
        Every message produced by the sub-executions, a ``ProgressMessage``
        after each slave sequence and finally a ``CompleteMessage`` whose
        outputs contain the merged alignment under ``'alignment'``.
    """
    index = self.manager.index
    score_threshold = self.environment['score_threshold']

    master_len = len(master_sequence)
    master_path = np.arange(master_len + 1).reshape(master_len + 1, 1)
    alignment = Alignment([master_sequence], master_path)

    for j, seq_slave in enumerate(slave_sequences):
        sub_env = self.environment['aligner_env']
        sub_component = index.resolve(self.environment['aligner'])

        execution = Execution(self.manager, self.tag)
        task = execution.add_task(sub_component)
        task.environment(self.environment, sub_env)
        task.inputs(mode="global",
                    sequence_one=master_sequence,
                    sequence_two=seq_slave,
                    track_id_sets_one=track_id_sets,
                    track_id_sets_two=track_id_sets,
                    score_matrices=score_matrices)

        for msg in execution.run():
            yield msg

        outputs = execution.outputs[0]
        score = outputs['score']
        if score_threshold is None or score >= score_threshold:
            merge_path = compress_path(np.array(outputs['alignment'].path), 0)

            slave_len = len(seq_slave)
            slave_path = np.arange(slave_len + 1).reshape(slave_len + 1, 1)
            slave_alignment = Alignment([seq_slave], slave_path)

            alignment = alignment.merge(slave_alignment, merge_path)

        # NOTE(review): under Python 2 without ``from __future__ import
        # division`` this is an integer division -- confirm.
        yield ProgressMessage((j + 1) / len(slave_sequences))

    yield CompleteMessage({'alignment': alignment})
def execute(self, sequences, track_id_sets, score_matrices):
    """Merge the input sequences pairwise until one alignment remains.

    Every sequence starts as its own single-sequence alignment and, via
    ``self._initial_clusters``, as its own cluster. While more than one
    cluster is left, ``self._merge_indices`` picks a pair ``(i, j)``; the two
    clusters are aligned by the component named in
    ``environment['aligner']`` and cluster/alignment ``j`` is merged into
    ``i`` and then removed.

    Environment keys read here: ``debug``, ``merge_mode``, ``dist_mode``,
    ``log_track_ids``, ``aligner`` and ``aligner_env``.

    Raises:
        ComponentError: if ``merge_mode`` or ``dist_mode`` is not one of
            ``'global'``, ``'semiglobal'`` or ``'semiglobal_auto'``.

    Yields:
        Messages from ``self._merge_indices`` and from the pairwise
        sub-executions, a ``ProgressMessage`` after every merge, a
        ``LogMessage`` with the archived log when ``debug > 0`` and finally
        a ``CompleteMessage`` holding the remaining alignment under
        ``'alignment'``.
    """
    debug = self.environment['debug']
    merge_mode = self.environment['merge_mode']
    dist_mode = self.environment['dist_mode']
    log_track_ids = self.environment['log_track_ids']

    # ``log`` only exists when debug > 0; every later use is guarded by the
    # same (or a stricter) debug check.
    if debug > 0:
        log = LogBundle()
        msg = "Entering component '{0}'".format(self.tid)
        log.message(ROOT_LOG_NAME, msg)
        msg = "Distance mode = '{0}', merge mode = '{1}'"
        msg = msg.format(dist_mode, merge_mode)
        log.message(ROOT_LOG_NAME, msg)

    # NOTE(review): validation runs after the LogBundle was created; when a
    # mode is rejected with debug > 0 the bundle is neither archived nor
    # deleted in this method.
    if merge_mode not in {'global', 'semiglobal', 'semiglobal_auto'}:
        msg = "unknown merge mode '{0}'".format(merge_mode)
        raise ComponentError(msg)
    if dist_mode not in {'global', 'semiglobal', 'semiglobal_auto'}:
        msg = "unknown distance mode '{0}'".format(dist_mode)
        raise ComponentError(msg)

    index = self.manager.index
    seqs = sequences
    # One single-sequence alignment per input, keyed by input position.
    alignments = {
        i: Alignment([seq], np.arange(len(seq) + 1).reshape(len(seq) + 1, 1))
        for i, seq in enumerate(seqs)
    }
    clusters = self._initial_clusters(seqs, track_id_sets)

    cur_step = 0
    total_steps = len(clusters) - 1
    # With a single cluster the loop body never runs and the lone alignment
    # is returned as-is.
    while len(clusters) > 1:
        if debug > 0:
            msg = "Step {0} of {1}".format(cur_step + 1, total_steps)
            log.message(ROOT_LOG_NAME, msg)

        # _merge_indices is consumed as a generator; the pair it selected is
        # read back from self._merge_i / self._merge_j afterwards.
        for msg in self._merge_indices(clusters, dist_mode, track_id_sets,
                                       score_matrices):
            yield msg
        i, j = self._merge_i, self._merge_j

        alignment_one = alignments[i]
        alignment_two = alignments[j]
        cluster_one = clusters[i]
        cluster_two = clusters[j]

        if debug > 0:
            msg = "Merging cluster {0} into {1}".format(j, i)
            log.message(ROOT_LOG_NAME, msg)
            header_fmt = "Cluster {0}:"
            name_fmt = "\t{0}"
            log.message(ROOT_LOG_NAME, header_fmt.format(j))
            for item in alignment_two.items:
                log.message(ROOT_LOG_NAME, name_fmt.format(item.name))
            log.message(ROOT_LOG_NAME, header_fmt.format(i))
            for item in alignment_one.items:
                log.message(ROOT_LOG_NAME, name_fmt.format(item.name))

        # At debug level 2 and up, dump both inputs of this merge step for
        # every track listed in log_track_ids.
        if debug > 1:
            s = 'step{0}_input_{1}_track_{2}.aln'
            for track_id in log_track_ids:
                input_one_filename = s.format(cur_step + 1, 1, track_id)
                input_two_filename = s.format(cur_step + 1, 2, track_id)
                write_alignment_clustal(log.path(input_one_filename),
                                        alignment_one, track_id, None)
                write_alignment_clustal(log.path(input_two_filename),
                                        alignment_two, track_id, None)

        # Translate the merge mode into the mode string handed to the
        # pairwise aligner. merge_mode was validated above, so one branch
        # always matches.
        if merge_mode == "semiglobal":
            align_mode = "semiglobal_both"
        elif merge_mode == "global":
            align_mode = "global"
        elif merge_mode == "semiglobal_auto":
            align_mode = auto_align_mode(cluster_one, cluster_two)

        sub_env = self.environment['aligner_env']
        sub_component = index.resolve(self.environment['aligner'])
        root_env = self.environment

        execution = Execution(self.manager, self.tag)
        task = execution.add_task(sub_component)
        task.environment(root_env, sub_env)
        task.inputs(mode=align_mode,
                    sequence_one=cluster_one,
                    sequence_two=cluster_two,
                    track_id_sets_one=track_id_sets,
                    track_id_sets_two=track_id_sets,
                    score_matrices=score_matrices)

        if debug > 0:
            msg = "Starting task '{0}' for pairwise alignment"
            msg = msg.format(task.tag)
            log.message(ROOT_LOG_NAME, msg)

        for msg in execution.run():
            yield msg
        outputs = execution.outputs[0]

        if debug > 0:
            msg = "Alignment score = {0}".format(outputs["score"])
            log.message(ROOT_LOG_NAME, msg)

        path_profile = np.array(outputs['alignment'].path)

        # Merge every track of cluster two into the matching track of
        # cluster one. The old track is deleted right away, the merged
        # tracks are collected and only added back after the loops.
        replace_tracks = []
        for track_id_set in track_id_sets:
            for track_id in track_id_set:
                track_one = cluster_one.get_track(track_id)
                track_two = cluster_two.get_track(track_id)
                track_new = track_one.merge(track_two, path_profile)
                replace_tracks.append((track_id, track_new))
                cluster_one.del_track(track_id)
        for track_id, track_new in replace_tracks:
            cluster_one.add_track(track_id, track_new)

        alignments[i] = alignment_one.merge(alignment_two, path_profile)

        if debug > 1:
            for track_id in log_track_ids:
                s = 'step{0}_output_track_{1}.aln'
                output_filename = s.format(cur_step + 1, track_id)
                write_alignment_clustal(log.path(output_filename),
                                        alignments[i], track_id, None)

        # Entry j has been folded into entry i and is dropped from both
        # containers.
        del clusters[j]
        del alignments[j]

        cur_step += 1
        # NOTE(review): under Python 2 without ``from __future__ import
        # division`` this is an integer division -- confirm.
        yield ProgressMessage(progress=cur_step / total_steps)

    if debug > 0:
        msg = "Done!"
        log.message(ROOT_LOG_NAME, msg)
        archive_path = log.archive()
        log.delete()
        yield LogMessage(path_to_url(archive_path))

    # NOTE(review): indexing [0] raises IndexError when ``sequences`` is
    # empty.
    yield CompleteMessage(
        outputs={'alignment': list(alignments.values())[0]})
def load_alignment_fasta(f, alphabet, encoding='utf-8'):
    """Load a gapped multiple sequence alignment from FASTA-formatted input.

    Lines are obtained from ``_get_lines(f, encoding)``. A line starting
    with ``'>'`` opens a new record; all following lines up to the next
    header are stripped and concatenated into that record's aligned
    sequence, in which ``'-'`` denotes a gap. Records with an empty header
    or an empty sequence are skipped, and any text before the first header
    is ignored.

    Args:
        f: Input handed to ``_get_lines``.
        alphabet: Alphabet passed to ``PlainTrack`` for every sequence.
        encoding: Text encoding handed to ``_get_lines``.

    Returns:
        An ``Alignment`` holding one ``Sequence`` per record (gap symbols
        removed) and an integer path of shape ``(columns + 1, records)``;
        entry ``[i, j]`` is the number of non-gap symbols of record ``j``
        within its first ``i`` alignment columns.

    Raises:
        DataError: if the aligned sequences differ in length or the input
            contains no usable records.
    """
    lines = _get_lines(f, encoding)

    headers = []
    aln_seqs = []

    def _store(record_header, record_parts):
        # Keep a record only when both header and sequence are non-empty.
        record_seq = "".join(record_parts)
        if record_header and record_seq:
            headers.append(record_header)
            aln_seqs.append(record_seq)

    header = None
    # Collected as a list and joined once. The accumulator is initialised
    # up front so that text before the first header cannot hit an unbound
    # local variable.
    parts = []
    for line in lines:
        if line.startswith('>'):
            _store(header, parts)
            header = line[1:].rstrip()
            parts = []
            continue
        parts.append(line.strip())
    _store(header, parts)

    consensus_len = None
    for aln_seq in aln_seqs:
        if consensus_len is None:
            consensus_len = len(aln_seq)
        elif len(aln_seq) != consensus_len:
            s = "length {0} does not match consensus length {1}"
            s = s.format(len(aln_seq), consensus_len)
            raise DataError(s)

    if consensus_len is None:
        s = "the input file contains no records"
        raise DataError(s)

    # Row 0 stays zero; row i holds the running count of non-gap symbols.
    path = np.zeros((consensus_len + 1, len(aln_seqs)), dtype=int)
    seqs = []
    for j, aln_seq in enumerate(aln_seqs):
        is_residue = [sym != '-' for sym in aln_seq]
        path[1:, j] = np.cumsum(is_residue)
        seqs.append([sym for sym in aln_seq if sym != '-'])

    seq_objs = []
    for header, seq in zip(headers, seqs):
        track = PlainTrack(seq, alphabet)
        seq_objs.append(Sequence(header, [(TRACK_ID_INPUT, track)]))

    return Alignment(seq_objs, path)
def execute_many(self, requestss, parent_tag):
    """Run a batch of task requests, offloading them to the remote server.

    When the module-level ``use_our_stuff`` flag is false, or when any
    request's task id is not in ``_INTERCEPT_TIDS``, the whole batch is
    delegated unchanged to the superclass implementation. Otherwise each
    request's score matrices, guide tree merge order and sequence tracks
    are sent to the HTTP service at ``SERVER``, processing is triggered, and
    the resulting alignment paths are polled for and wrapped in
    ``CompleteMessage`` objects.

    Args:
        requestss: Iterable of ``(tid, inputs, tag, env)`` tuples. (The name
            has a double "s", presumably to avoid clashing with the
            ``requests`` HTTP module used below.)
        parent_tag: Tag handed to ``BeginMessage`` and to the superclass.

    Raises:
        ComponentError: if ``env['gap_series']`` has neither one nor two
            entries.

    Yields:
        Either the superclass's messages, or a ``BeginMessage`` per request
        followed later by a ``CompleteMessage`` per request.
    """
    # Fall back to the superclass unless every task id can be intercepted.
    not_msa = not use_our_stuff
    for tid, inputs, tag, env in requestss:
        if not tid in _INTERCEPT_TIDS:
            not_msa = True

    if not_msa:
        gen = super(ConstellationManager, self).execute_many(requestss,
                                                             parent_tag)
        for message in gen:
            yield message
        return

    start = time.time()
    trees = []

    # NOTE(review): both loops iterate the same ``requestss`` and bind the
    # same names, so the inner loop overwrites the outer loop's variables.
    # With a single request this is harmless; confirm the nesting is
    # intended for larger batches.
    for tid, inputs, tag, env in requestss:
        # We want to intercept execution of these tasks and send them off
        # to constellation. First pass a message saying we've begun
        # executing the tasks.
        for tid, inputs, tag, env in requestss:
            begin_message = BeginMessage(parent_tag)
            begin_message.tag = tag
            yield begin_message

            # TODO: Yield ProgressMessage instances to report progress to
            # the UI if applicable.
            score_matrices = inputs.get("score_matrices", None)
            seqs = inputs.get("sequences", None)
            tree = inputs.get("guide_tree", None)
            track_id_sets = inputs.get("track_id_sets", None)
            gap_series = env['gap_series']
            merge_mode = env['merge_mode']

            s = [sm.matrix.astype(np.float32) for sm in score_matrices]

            # The current timestamp serves as a unique suffix for the cost
            # and tree names registered with the server.
            uniquestr = str(time.time())
            costName = "cost" + uniquestr
            treeName = "tree" + uniquestr
            sendCosts(costName, s)
            # NOTE(review): this second value is never read.
            uniquestr = str(time.time())

            # Replay the merge order locally so that entry i lists the
            # sequences of cluster i in merged order.
            alignmentsi = {i: [seq] for i, seq in enumerate(seqs)}
            for i, j in tree.merge_orders:
                alignmentsi[i] += alignmentsi[j]

            # Merge order serialised as space-separated "i,j" pairs.
            data = ' '.join(
                [str(i) + "," + str(j) for i, j in tree.merge_orders])

            # One gap value is used for both open and extend; two values
            # are taken as (open, extend).
            start_gap, extend_gap = 0, 0
            if len(gap_series) == 2:
                start_gap, extend_gap = gap_series[0], gap_series[1]
            elif len(gap_series) == 1:
                start_gap = extend_gap = gap_series[0]
            else:
                raise ComponentError("NO GAP COST!!!")

            reqstring = "/register/tree/" + treeName + "/" + str(
                len(seqs)) + "/" + costName + "/" + merge_mode + "/" + str(
                start_gap) + "/" + str(extend_gap)
            # NOTE(review): the response of this POST is not inspected.
            req = requests.post(SERVER + reqstring, data=data)

            # Only the first track id of every track id set is sent.
            for i, seq in enumerate(seqs):
                sendSequence(treeName, i,
                             [seq.get_track(t[0]) for t in track_id_sets], s)

            trees += [treeName]

    end = time.time()
    print "Sending jobs took " + (str(end - start)) + " seconds"

    start = time.time()
    requests.get(SERVER + "/processtrees")

    first = True
    for (tid, inputs, tag, env), tree in zip(requestss, trees):
        # NOTE(review): this polls in a tight loop without any delay and
        # never gives up on a non-200 status.
        while True:
            req = requests.get(SERVER + "/retrieve/steps/" + tree)
            if req.status_code == 200:
                if first:
                    end = time.time()
                    print "Waiting for result " + (str(end - start)) + " seconds"
                    start = end
                    first = False

                # The response text is parsed as space-separated rows of
                # ';'-separated integers.
                res = np.array([[int(d) for d in c.split(';')]
                                for c in req.text.split(' ')])

                outputs = {}
                # NOTE(review): ``alignmentsi`` is whatever the last
                # registration loop iteration left behind, so every result
                # is paired with that request's sequences.
                outputs['alignment'] = Alignment(alignmentsi[0], res)

                complete_message = CompleteMessage(outputs=outputs)
                complete_message.tag = tag
                yield complete_message
                break

    end = time.time()
    print "Obtaining results took " + (str(end - start)) + " seconds"
def execute(self, mode, sequence_one, sequence_two, match_score_model,
            gap_score_model_one, gap_score_model_two, zero_idxs):
    """Align two sequences with the dynamic programming aligner.

    Args:
        mode: One of ``"local"``, ``"semiglobal_both"``,
            ``"semiglobal_one"``, ``"semiglobal_two"`` or ``"global"``.
        sequence_one: First sequence; placed first in the output alignment.
        sequence_two: Second sequence; placed second in the output
            alignment.
        match_score_model: Object whose ``scores`` attribute is the match
            score array; its first two dimensions determine the matrix
            sizes.
        gap_score_model_one: Object whose ``scores`` attribute holds the gap
            scores used along the first axis.
        gap_score_model_two: Object whose ``scores`` attribute holds the gap
            scores used along the second axis.
        zero_idxs: ``None`` or an iterable of index tuples; each of these
            cells is set to 1 in the mask ``z`` handed to the alignment
            function.

    Raises:
        ComponentError: if ``mode`` is unknown or no alignment function is
            registered for it.

    Yields:
        A ``LogMessage`` with the archived log when ``debug > 0``, then a
        ``CompleteMessage`` whose outputs hold the ``'alignment'`` and its
        ``'score'`` as a float.
    """
    debug = self.environment['debug']
    accelerate = self.environment['accelerate']

    # ``log`` only exists when debug > 0; later uses are guarded likewise.
    if debug > 0:
        log = LogBundle()
        msg = "Entering component '{0}'".format(self.tid)
        log.message(ROOT_LOG_NAME, msg)
        log.message(ROOT_LOG_NAME, "Alignment mode: '{0}'".format(mode))
        msg = "Sequence one: '{0}', sequence two: '{1}'"
        msg = msg.format(sequence_one.name, sequence_two.name)
        log.message(ROOT_LOG_NAME, msg)

    # We only implement a limited set of alignment modes so far.
    if mode == "local":
        alignment_type = "local"
    elif mode == "semiglobal_both":
        alignment_type = "semiglobal_both"
    elif mode == "semiglobal_one":
        alignment_type = "semiglobal_one"
    elif mode == "semiglobal_two":
        alignment_type = "semiglobal_two"
    elif mode == "global":
        alignment_type = "global"
    else:
        s = "unknown alignment mode: '{0}'".format(mode)
        raise ComponentError(s)

    # Pick the C extension or the other implementation table, depending on
    # the 'accelerate' setting.
    try:
        if accelerate:
            align_fun = _CEXT_ALIGN_FUNCTIONS[alignment_type]
        else:
            align_fun = _ALIGN_FUNCTIONS[alignment_type]
    except KeyError:
        s = "alignment not implemented for {0}"
        s = s.format(alignment_type)
        raise ComponentError(s)

    # Retrieve score arrays from model objects.
    m = match_score_model.scores
    g1 = gap_score_model_one.scores
    g2 = gap_score_model_two.scores

    # Setup the arrays which we will pass to the C alignment component.
    # We pre-allocate everything the C code needs in terms of memory here
    # since dealing with python memory management from C is kind of a
    # pain.
    #
    # NOTE: the C side of things performs no type or bounds checking
    # at all. If you pass a smaller array than it is expecting or an
    # array of a different data type, it'll happily read outside of the
    # buffer and crash PRALINE. So you need to be very careful here.
    #
    # o: float32 scores, t: uint8 traceback codes, both with three layers
    # along the last axis; z: uint8 mask built from zero_idxs.
    shape = (m.shape[0] + 1, m.shape[1] + 1)
    o = np.zeros((shape[0], shape[1], 3), dtype=np.float32)
    t = np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
    z = np.zeros(shape, dtype=np.uint8)
    if zero_idxs is not None:
        for idx in zero_idxs:
            z[idx] = 1

    # Initialize the first row and column of the score matrices and of the
    # traceback matrices. Everything on the border starts at MINUS_INFINITY
    # except the origin of layer 0.
    o[:, 0, :] = MINUS_INFINITY
    o[0, :, :] = MINUS_INFINITY
    o[0, 0, 0] = 0

    # First column, layer 1: zero in the modes listed below, otherwise
    # filled from the gap scores g1.
    if mode in {"semiglobal_both", "semiglobal_one"}:
        o[:, 0, 1] = 0
    else:
        o[0, 0, 1] = g1[0, 0] - g1[0, 1]
        o[1:, 0, 1] = (np.arange(o.shape[0] - 1) * g1[:, 1]) + g1[0, 0]
    t[1:, 0, 1] = TRACEBACK_INSERT_UP_EXTEND

    # First row, layer 2: same scheme with the gap scores g2.
    if mode in {"semiglobal_both", "semiglobal_two"}:
        o[0, :, 2] = 0
    else:
        o[0, 0, 2] = g2[0, 0] - g2[0, 1]
        o[0, 1:, 2] = (np.arange(o.shape[1] - 1) * g2[:, 1]) + g2[0, 0]
    t[0, 1:, 2] = TRACEBACK_INSERT_LEFT_EXTEND

    # Run the alignment function. Its return value is not used; the results
    # are read back from ``o`` and ``t`` below.
    align_fun(m, g1, g2, o, t, z)

    if debug > 1:
        log.message(ROOT_LOG_NAME, "Dumping DP & traceback matrices...")
        np.savetxt(log.path("dp_0_matrix.csv"), o[:, :, 0], delimiter=",")
        np.savetxt(log.path("dp_1_matrix.csv"), o[:, :, 1], delimiter=",")
        np.savetxt(log.path("dp_2_matrix.csv"), o[:, :, 2], delimiter=",")
        np.savetxt(log.path("tb_0_matrix.csv"), t[:, :, 0], delimiter=",")
        np.savetxt(log.path("tb_1_matrix.csv"), t[:, :, 1], delimiter=",")
        np.savetxt(log.path("tb_2_matrix.csv"), t[:, :, 2], delimiter=",")

    if mode == "local":
        # Start the traceback at the highest scoring cell of all layers.
        cell = np.unravel_index(o.argmax(), o.shape)
        score = o[cell]
        paths = get_paths(t, cell=cell)
    elif mode in {"semiglobal_both", "semiglobal_one", "semiglobal_two"}:
        # Note: ``m`` is rebound here from the match score array to the
        # number of columns of ``o``.
        n, m, _ = o.shape

        last_row = o[n - 1, :, :]
        last_col = o[:, m - 1, :]
        last_row_max = last_row.max()
        last_col_max = last_col.max()

        # NOTE(review): in mode "semiglobal_two" the traceback still starts
        # in the last column whenever the last-row maximum is not strictly
        # larger than the last-column maximum -- confirm this is intended.
        trace_from_row = mode in {"semiglobal_both", "semiglobal_two"}
        if last_row_max > last_col_max and trace_from_row:
            # Scan the last row from its far end; take the first cell that
            # holds the maximum.
            for i, j in itertools.product(range(m - 1, -1, -1), range(3)):
                if last_row[i, j] == last_row_max:
                    cell = (n - 1, i, j)
                    break
        else:
            # Same scan over the last column.
            for i, j in itertools.product(range(n - 1, -1, -1), range(3)):
                if last_col[i, j] == last_col_max:
                    cell = (i, m - 1, j)
                    break

        score = o[cell]
        paths = [np.array(path) for path in get_paths(t, cell=cell)]
        paths = [extend_path_semiglobal(path, (n, m)) for path in paths]
    elif mode == "global":
        # Start at the bottom-right corner, in its best scoring layer.
        y, x = o.shape[0] - 1, o.shape[1] - 1
        cell = (y, x, np.argmax(o[y, x, :]))
        score = o[cell]
        paths = get_paths(t, cell=cell)

    # Only the first returned path is used.
    alignment = Alignment([sequence_one, sequence_two], paths[0])
    outputs = {'alignment': alignment, 'score': float(score)}

    if debug > 0:
        log.message(ROOT_LOG_NAME, "Alignment score: {0}".format(score))
        msg = "Done!"
        log.message(ROOT_LOG_NAME, msg)
        archive_path = log.archive()
        log.delete()
        yield LogMessage(path_to_url(archive_path))

    yield CompleteMessage(outputs=outputs)