def execute(self, sequence, track_id):
    """Search a BLAST+ database with one sequence track and yield the hits.

    The track ``track_id`` of ``sequence`` is written to a FASTA file and
    searched with ``blastp`` (AA alphabet) or ``blastn`` (DNA alphabet)
    against the database named by the ``db_name`` environment option. The
    XML report is read through ``_parse_xml``; of the first ``num_seqs``
    entries it yields, the ids whose e-value is at most ``max_evalue`` are
    fetched from the database with ``blastdbcmd`` and loaded using the
    alphabet of the query track.

    Args:
        sequence: object whose ``get_track(track_id)`` returns the track to
            search with.
        track_id: id of the track on ``sequence`` to use as the query.

    Yields:
        A single ``CompleteMessage`` whose ``sequences`` output holds the
        sequences loaded from the ``blastdbcmd`` output.

    Raises:
        ComponentError: if the track alphabet is neither AA nor DNA, or if
            the ``blast_plus_root`` environment option is None.
        subprocess.CalledProcessError: if one of the BLAST+ programs exits
            with a non-zero status.
    """
    track = sequence.get_track(track_id)
    alphabet_id = track.alphabet.aid
    if alphabet_id == ALPHABET_AA.aid:
        program = "blastp"
    elif alphabet_id == ALPHABET_DNA.aid:
        program = "blastn"
    else:
        s = "BLAST+ will only accept inputs with an AA " \
            "or DNA alphabet"
        raise ComponentError(s)

    blast_root = self.environment['blast_plus_root']
    if blast_root is None:
        s = "need to know where you have installed BLAST+, " \
            "supply in shell variable BLAST_PLUS_ROOT or " \
            "PRALINE environment option blast_plus_root"
        raise ComponentError(s)

    # Read every option *before* creating the scratch directory: a failing
    # lookup between mkdtemp() and the try block would leak the directory.
    db_name = self.environment['db_name']
    num_seqs = self.environment['num_seqs']
    max_evalue = self.environment['max_evalue']

    program_path = os.path.join(blast_root, 'bin', program)
    db_program_path = os.path.join(blast_root, 'bin', 'blastdbcmd')

    temp_root = tempfile.mkdtemp()
    try:
        blast_output_path = os.path.join(temp_root, 'blast.out')
        input_fasta_path = os.path.join(temp_root, 'input.fasta')
        output_fasta_path = os.path.join(temp_root, 'output.fasta')
        entries_path = os.path.join(temp_root, 'entries.txt')

        write_sequence_fasta(input_fasta_path, [sequence], track_id)

        # Run the search, capturing the XML report ("-outfmt 5") on disk.
        with open(blast_output_path, 'w') as fo:
            args = [program_path, "-query", input_fasta_path,
                    "-db", db_name, "-outfmt", "5"]
            subprocess.check_call(args, stdout=fo)

        # Collect the ids to fetch. The num_seqs limit counts every entry
        # that was looked at, not only the ones passing the e-value filter.
        with open(blast_output_path) as fi, open(entries_path, 'w') as fo:
            for i, (id_, evalue) in enumerate(_parse_xml(fi)):
                if i >= num_seqs:
                    break
                if evalue <= max_evalue:
                    fo.write("{0}\n".format(id_))

        # Pull the selected entries out of the database as FASTA.
        with open(output_fasta_path, 'w') as fo:
            args = [db_program_path, "-db", db_name,
                    "-entry_batch", entries_path, "-target_only"]
            subprocess.check_call(args, stdout=fo)

        seqs = load_sequence_fasta(output_fasta_path, track.alphabet)
    finally:
        # Always drop the scratch files, whether or not BLAST+ succeeded.
        shutil.rmtree(temp_root)

    yield CompleteMessage(outputs={'sequences': seqs})
def do_motif_annotation(args, env, manager, seqs, verbose, root_node):
    """Add motif annotation tracks to the sequences and collect their scores.

    Two sources of annotation are handled:

    * ``args.patterns``: each entry is ``(pattern,)`` or ``(pattern, score)``.
      A ``PrositePatternAnnotator`` task is queued for every combination of
      sequence and pattern, and the resulting ``prediction_track`` is added
      to the sequence.
    * ``args.annotation_files``: each entry is ``(path,)`` or
      ``(path, score)``. The file is loaded as FASTA with the PROSITE
      alphabet and the input track of the entry with the matching name is
      added to every sequence.

    Args:
        args: parsed arguments providing ``patterns`` and
            ``annotation_files``.
        env: environment handed to every annotator task.
        manager: execution manager used to build the ``Execution``.
        seqs: sequences to annotate; they are modified in place.
        verbose: forwarded to ``run``.
        root_node: forwarded to ``run``.

    Returns:
        Dict mapping each added track id to its score as a float, or to None
        when the entry carried no score.

    Raises:
        KeyError: if an annotation file has no entry named like one of the
            sequences.
    """
    track_id_template = "{0}_{1}"
    scores_by_track = {}

    # Queue one annotator task per (sequence, pattern) pair, remembering the
    # pair so that the outputs can be matched back up by position.
    execution = Execution(manager, ROOT_TAG)
    queued = []
    for sequence in seqs:
        for entry in args.patterns:
            motif = entry[0]
            weight = float(entry[1]) if len(entry) > 1 else None
            queued.append((sequence, motif, weight))

            task = execution.add_task(PrositePatternAnnotator)
            task.environment(env)
            task.inputs(sequence=sequence, pattern=motif,
                        track_id=TRACK_ID_INPUT)

    results = run(execution, verbose=verbose, root_node=root_node)
    for position, result in enumerate(results):
        sequence, motif, weight = queued[position]
        pattern_track_id = track_id_template.format(_TRACK_ID_BASE_PATTERN,
                                                    motif)
        sequence.add_track(pattern_track_id, result['prediction_track'])
        scores_by_track[pattern_track_id] = weight

    # Annotations supplied as files are looked up by sequence name.
    for entry in args.annotation_files:
        path = entry[0]
        weight = float(entry[1]) if len(entry) > 1 else None

        tracks_by_name = {}
        for annotated in load_sequence_fasta(path, ALPHABET_PROSITE):
            tracks_by_name[annotated.name] = annotated.get_track(
                TRACK_ID_INPUT)

        for sequence in seqs:
            file_track = tracks_by_name[sequence.name]
            file_track_id = track_id_template.format(_TRACK_ID_BASE_FILE,
                                                     path)
            sequence.add_track(file_track_id, file_track)
            scores_by_track[file_track_id] = weight

    return scores_by_track
def create_msa_input(job, args, manager, root_node):
    """Prepare everything needed to run a motif-aware DNA alignment.

    Loads the input sequences with the DNA alphabet, loads the score matrix
    (``args.score_matrix`` or the bundled 'nucleotide' matrix), annotates the
    motifs, runs the master-slave alignments and builds the preprofiles.

    Args:
        job: not used by this function.
        args: parsed arguments; reads ``input``, ``score_matrix``,
            ``gap_penalties``, ``debug``, ``motif_match_score`` and
            ``score_spacers`` (further attributes are read by the helpers
            this function calls).
        manager: execution manager passed on to the workflow helpers.
        root_node: ignored; a fresh ``TaskNode(ROOT_TAG)`` is created
            instead (see the note in the body).

    Returns:
        Tuple ``(env, seqs, msa_track_id_sets, score_matrices)``.
    """
    verbose = False
    alphabet = ALPHABET_DNA

    seqs = load_sequence_fasta(args.input, alphabet)

    # Load inputs and other data.
    if args.score_matrix is not None:
        score_matrix_file = args.score_matrix
    else:
        score_matrix_file = 'nucleotide'

    # Read score parameters.
    with open_resource(score_matrix_file, "matrices") as f:
        score_matrices = [load_score_matrix(f, alphabet=alphabet)]
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {}
    keys['gap_series'] = gap_series
    keys['debug'] = args.debug
    keys['merge_mode'] = 'global'
    keys['dist_mode'] = 'global'
    keys['accelerate'] = True
    env = Environment(keys=keys)

    # Initialize root node for output.
    # NOTE(review): this replaces the root_node argument, so whatever the
    # caller passed in is never used -- confirm that this is intended.
    root_node = TaskNode(ROOT_TAG)

    # Annotate the motifs from the files and patterns.
    track_scores = do_motif_annotation(args, env, manager, seqs,
                                       verbose, root_node)

    # Build score matrices. dict.items() is used rather than iteritems(),
    # which does not exist on Python 3.
    motif_score_matrices = {}
    for trid, score in track_scores.items():
        if score is None:
            score = args.motif_match_score
        motif_score_matrices[trid] = get_motif_score_matrix(
            score, args.score_spacers)

    # Add all the new annotation tracks to the list of tracks to use
    # in the alignment.
    track_id_sets = [[TRACK_ID_INPUT]]
    for trid, track in seqs[0].tracks:
        if trid in motif_score_matrices:
            track_id_sets.append([trid])
            score_matrices.append(motif_score_matrices[trid])

    # Build initial sets of which sequences to align against every master
    # sequence. By default, we want to align every input sequence against
    # every other input sequence.
    master_slave_seqs = []
    for master_seq in seqs:
        slave_seqs = [slave_seq for slave_seq in seqs
                      if slave_seq is not master_seq]
        master_slave_seqs.append((master_seq, slave_seqs))

    master_slave_alignments = do_master_slave_alignments(
        args, env, manager, master_slave_seqs, track_id_sets,
        score_matrices, verbose, root_node)

    # Build preprofiles from master-slave alignments.
    do_preprofiles(args, env, manager, master_slave_alignments, seqs,
                   verbose, root_node)

    msa_track_id_sets = _replace_input_track_id(track_id_sets)

    return env, seqs, msa_track_id_sets, score_matrices
def main():
    """Command-line entry point for the motif-aware alignment.

    Parses the arguments, loads the sequences and score matrix, annotates
    motifs, runs the master-slave alignments, builds the preprofiles, runs
    the multiple sequence alignment and writes the result to ``args.output``.
    Optionally pickles the alignment (``args.dump_alignment``) and writes one
    alignment file per plain track (``args.dump_all_tracks``).

    Raises:
        DataError: if no default score matrix can be chosen because neither
            an input alphabet nor a score matrix was selected, or if the
            output format is not 'fasta' or 'clustal'.
        OSError: if the track dump directory cannot be created and does not
            already exist.
    """
    # Parse arguments.
    args = parse_args()
    verbose = not args.quiet or args.verbose

    # See if we're doing DNA or protein alignments.
    # TODO: if unspecified, autodetect this based on input file contents?
    alphabet = None
    if args.input_dna:
        alphabet = ALPHABET_DNA
    elif args.input_protein:
        alphabet = ALPHABET_AA

    # Setup the execution manager.
    index = TypeIndex()
    index.autoregister()
    if args.num_threads > 1:
        manager = ParallelExecutionManager(index, args.num_threads)
    else:
        manager = Manager(index)

    seqs = load_sequence_fasta(args.input, alphabet)

    # Load inputs and other data.
    if args.score_matrix is not None:
        score_matrix_file = args.score_matrix
    elif alphabet == ALPHABET_AA:
        score_matrix_file = 'blosum62'
    elif alphabet == ALPHABET_DNA:
        score_matrix_file = 'nucleotide'
    else:
        # Without this branch the name below would simply be unbound.
        raise DataError("cannot choose a default score matrix: no input "
                        "alphabet was selected and no score matrix was "
                        "given")

    # Read score parameters.
    with open_resource(score_matrix_file, "matrices") as f:
        score_matrices = [load_score_matrix(f, alphabet=alphabet)]
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {}
    keys['gap_series'] = gap_series
    keys['score_threshold'] = args.preprofile_score
    keys['linkage_method'] = args.tree_linkage
    keys['waterman_eggert_iterations'] = args.num_preprofile_alignments
    keys['debug'] = args.debug
    if args.merge_semiglobal:
        keys['merge_mode'] = 'semiglobal'
        keys['dist_mode'] = 'semiglobal'
    else:
        keys['merge_mode'] = 'global'
        keys['dist_mode'] = 'global'
    if args.no_accelerate:
        keys['accelerate'] = False
    else:
        keys['accelerate'] = True
    env = Environment(keys=keys)

    # Initialize root node for output
    root_node = TaskNode(ROOT_TAG)

    # Annotate the motifs from the files and patterns.
    track_scores = do_motif_annotation(args, env, manager, seqs,
                                       verbose, root_node)

    # Build score matrices.
    motif_score_matrices = {}
    for trid, score in six.iteritems(track_scores):
        if score is None:
            score = args.motif_match_score
        motif_score_matrices[trid] = get_motif_score_matrix(
            score, args.score_spacers)

    # Add all the new annotation tracks to the list of tracks to use
    # in the alignment.
    track_id_sets = [[TRACK_ID_INPUT]]
    for trid, track in seqs[0].tracks:
        if trid in motif_score_matrices:
            track_id_sets.append([trid])
            score_matrices.append(motif_score_matrices[trid])

    # Build initial sets of which sequences to align against every master
    # sequence. By default, we want to align every input sequence against
    # every other input sequence.
    master_slave_seqs = []
    for master_seq in seqs:
        slave_seqs = [slave_seq for slave_seq in seqs
                      if slave_seq is not master_seq]
        master_slave_seqs.append((master_seq, slave_seqs))

    master_slave_alignments = do_master_slave_alignments(
        args, env, manager, master_slave_seqs, track_id_sets,
        score_matrices, verbose, root_node)

    # Build preprofiles from master-slave alignments.
    do_preprofiles(args, env, manager, master_slave_alignments, seqs,
                   verbose, root_node)

    msa_track_id_sets = _replace_input_track_id(track_id_sets)

    # Do multiple sequence alignment from preprofile-annotated sequences.
    alignment = do_multiple_sequence_alignment(
        args, env, manager, seqs, msa_track_id_sets, score_matrices,
        verbose, root_node)

    # Write alignment to output file.
    outfmt = args.output_format
    if outfmt == 'fasta':
        write_alignment_fasta(args.output, alignment, TRACK_ID_INPUT)
    elif outfmt == "clustal":
        # score_matrices[0] is the matrix loaded for the input track; the
        # motif matrices were appended after it. (The name `score_matrix`
        # used here before is never defined in this function.)
        write_alignment_clustal(args.output, alignment, TRACK_ID_INPUT,
                                score_matrices[0])
    else:
        raise DataError("unknown output format: '{0}'".format(outfmt))

    # Dump pickled alignment object if user asked for it.
    if args.dump_alignment is not None:
        with open(args.dump_alignment, 'wb') as fo:
            pickle.dump(alignment, fo)

    if args.dump_all_tracks is not None:
        try:
            os.mkdir(args.dump_all_tracks)
        except OSError:
            # An already existing directory is fine, any other failure
            # (permissions, missing parent, ...) should not be hidden.
            if not os.path.isdir(args.dump_all_tracks):
                raise

        all_trids = [trid for trid, track in alignment.items[0].tracks
                     if track.tid == PlainTrack.tid]

        for trid in all_trids:
            filename = "dump-{0}.aln".format(trid)
            path = os.path.join(args.dump_all_tracks, filename)
            if outfmt == "fasta":
                write_alignment_fasta(path, alignment, trid)
            elif outfmt == "clustal":
                write_alignment_clustal(path, alignment, trid, None)
            else:
                raise DataError("unknown output format: '{0}'".format(outfmt))

    if verbose:
        sys.stdout.write('\n')

    # Collect log bundles
    if args.debug > 0:
        write_log_structure(root_node)
def main():
    """Command-line entry point for the PRALINE alignment workflow.

    Parses the arguments, picks an execution manager (remote, parallel or
    plain), loads the score matrix and the sequences with the AA alphabet,
    translates the arguments into environment keys, runs
    ``PralineMultipleSequenceAlignmentWorkflow`` and writes the resulting
    alignment to ``args.output``.

    Raises:
        DataError: if the output format is not 'fasta' or 'clustal'.
    """
    options = parse_args()
    verbose = not options.quiet or options.verbose

    # Choose how the tasks are going to be executed.
    type_index = TypeIndex()
    type_index.autoregister()
    if args_remote(options):
        # NOTE(review): a hard-coded fallback secret is used when no secret
        # file is given, and the line read from the file keeps its trailing
        # newline -- confirm both match what the remote side expects.
        if options.remote_secret is not None:
            with open(options.remote_secret, 'r') as secret_file:
                secret = secret_file.readline()
        else:
            secret = "__MUCH_SECRITY__"
        manager = RemoteManager(type_index, options.remote_host,
                                options.remote_port, secret)
    elif options.num_threads > 1:
        manager = ParallelExecutionManager(type_index,
                                           options.num_threads - 1)
    else:
        manager = Manager(type_index)

    # Make sure the manager gets closed when the interpreter exits.
    atexit.register(_atexit_close_manager, manager=manager)

    # Read the score matrix, the sequences and the gap penalties.
    with open_resource(options.score_matrix, "matrices") as matrix_file:
        score_matrix = load_score_matrix(matrix_file, alphabet=ALPHABET_AA)
    seqs = load_sequence_fasta(options.input, ALPHABET_AA)
    gap_series = [-float(penalty)
                  for penalty in options.gap_penalties.split(",")]

    # Translate the command-line options into environment keys.
    keys = {}
    keys['gap_series'] = gap_series
    keys['db_name'] = options.psi_blast_db
    keys['num_seqs'] = options.psi_blast_num
    keys['max_evalue'] = options.psi_blast_evalue
    keys['profile_evalue'] = options.psi_blast_inclusion
    keys['num_iterations'] = options.psi_blast_iters
    keys['score_threshold'] = options.preprofile_score
    keys['linkage_method'] = options.tree_linkage
    keys['waterman_eggert_iterations'] = options.num_preprofile_alignments
    keys['aligner'] = PairwiseAligner.tid
    keys['debug'] = options.debug

    keys['merge_mode'] = (
        'semiglobal_auto' if options.merge_semiglobal_auto
        else 'semiglobal' if options.merge_semiglobal
        else 'global')
    keys['dist_mode'] = (
        'semiglobal_auto' if options.dist_semiglobal_auto
        else 'semiglobal' if options.dist_semiglobal
        else 'global')
    keys['msa_mode'] = 'tree' if options.pregen_tree else 'ad_hoc'
    keys['preprofile_mode'] = (
        'global' if options.preprofile_global
        else 'local' if options.preprofile_local
        else 'dummy')

    if options.psi_blast:
        keys['run_psi_blast'] = True
    keys['accelerate'] = False if options.no_accelerate else True

    # The BLAST+ location is optional and only taken from the shell.
    if 'BLAST_PLUS_ROOT' in os.environ:
        keys['blast_plus_root'] = os.environ['BLAST_PLUS_ROOT']

    env = Environment(keys=keys)

    # Root of the task tree that collects the output/log structure.
    root_node = TaskNode(ROOT_TAG)

    # Run the PRALINE MSA workflow as a single task.
    execution = Execution(manager, ROOT_TAG)
    workflow_task = execution.add_task(
        PralineMultipleSequenceAlignmentWorkflow)
    workflow_task.inputs(sequences=seqs, score_matrix=score_matrix)
    workflow_task.environment(env)

    first_output = run(execution, verbose, root_node)[0]
    alignment = first_output['alignment']

    # Write the alignment in the requested format.
    outfmt = options.output_format
    if outfmt not in ('fasta', "clustal"):
        raise DataError("unknown output format: '{0}'".format(outfmt))
    if outfmt == 'fasta':
        write_alignment_fasta(options.output, alignment, TRACK_ID_INPUT)
    else:
        write_alignment_clustal(options.output, alignment, TRACK_ID_INPUT,
                                score_matrix)

    if verbose:
        sys.stdout.write('\n')

    # Collect log bundles when debugging is switched on.
    if options.debug > 0:
        write_log_structure(root_node)


def args_remote(options):
    """Return the ``remote`` flag of the parsed options."""
    return options.remote
def create_msa_input(job, args, manager, root_node):
    """Build the inputs of a motif-aware DNA multiple sequence alignment.

    The sequences in ``args.input`` are loaded with the DNA alphabet, the
    score matrix is read from ``args.score_matrix`` (or the bundled
    'nucleotide' matrix when that is None), motif tracks are annotated, and
    the master-slave alignments and preprofiles are computed.

    Args:
        job: not used by this function.
        args: parsed arguments; reads ``input``, ``score_matrix``,
            ``gap_penalties``, ``debug``, ``motif_match_score`` and
            ``score_spacers`` (the helpers called here read more).
        manager: execution manager passed on to the workflow helpers.
        root_node: ignored; it is replaced by a new ``TaskNode(ROOT_TAG)``
            (see the note in the body).

    Returns:
        Tuple ``(env, seqs, msa_track_id_sets, score_matrices)``.
    """
    verbose = False
    alphabet = ALPHABET_DNA

    seqs = load_sequence_fasta(args.input, alphabet)

    # Load inputs and other data.
    if args.score_matrix is not None:
        score_matrix_file = args.score_matrix
    else:
        score_matrix_file = 'nucleotide'

    # Read score parameters.
    with open_resource(score_matrix_file, "matrices") as f:
        score_matrices = [load_score_matrix(f, alphabet=alphabet)]
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {}
    keys['gap_series'] = gap_series
    keys['debug'] = args.debug
    keys['merge_mode'] = 'global'
    keys['dist_mode'] = 'global'
    keys['accelerate'] = True
    env = Environment(keys=keys)

    # Initialize root node for output.
    # NOTE(review): the root_node argument is overwritten here and therefore
    # never used -- confirm that callers do not rely on their own node.
    root_node = TaskNode(ROOT_TAG)

    # Annotate the motifs from the files and patterns.
    track_scores = do_motif_annotation(args, env, manager, seqs,
                                       verbose, root_node)

    # Build score matrices. Iterate with items(): dict.iteritems() is only
    # available on Python 2.
    motif_score_matrices = {}
    for trid, score in track_scores.items():
        if score is None:
            score = args.motif_match_score
        motif_score_matrices[trid] = get_motif_score_matrix(
            score, args.score_spacers)

    # Add all the new annotation tracks to the list of tracks to use
    # in the alignment.
    track_id_sets = [[TRACK_ID_INPUT]]
    for trid, track in seqs[0].tracks:
        if trid in motif_score_matrices:
            track_id_sets.append([trid])
            score_matrices.append(motif_score_matrices[trid])

    # Build initial sets of which sequences to align against every master
    # sequence. By default, we want to align every input sequence against
    # every other input sequence.
    master_slave_seqs = []
    for master_seq in seqs:
        slave_seqs = [slave_seq for slave_seq in seqs
                      if slave_seq is not master_seq]
        master_slave_seqs.append((master_seq, slave_seqs))

    master_slave_alignments = do_master_slave_alignments(
        args, env, manager, master_slave_seqs, track_id_sets,
        score_matrices, verbose, root_node)

    # Build preprofiles from master-slave alignments.
    do_preprofiles(args, env, manager, master_slave_alignments, seqs,
                   verbose, root_node)

    msa_track_id_sets = _replace_input_track_id(track_id_sets)

    return env, seqs, msa_track_id_sets, score_matrices