Exemple #1
0
    def execute(self, sequence, track_id):
        """Search a BLAST+ database with one track and yield the hit sequences.

        The track's alphabet selects the search program: ``blastp`` for the
        AA alphabet, ``blastn`` for the DNA alphabet. The query is written to
        a temporary FASTA file, the search report is read back through
        ``_parse_xml``, and the ids of accepted hits are fetched from the
        database with ``blastdbcmd``.

        Environment keys read: ``blast_plus_root``, ``db_name``, ``num_seqs``
        and ``max_evalue``.

        Yields a single ``CompleteMessage`` whose ``sequences`` output holds
        the sequences loaded from the fetched FASTA file.

        Raises ``ComponentError`` when the alphabet is neither AA nor DNA, or
        when ``blast_plus_root`` is ``None``. A failing subprocess propagates
        the exception raised by ``subprocess.check_call``.
        """
        track = sequence.get_track(track_id)
        alphabet_id = track.alphabet.aid
        if alphabet_id == ALPHABET_AA.aid:
            program = "blastp"
        elif alphabet_id == ALPHABET_DNA.aid:
            program = "blastn"
        else:
            raise ComponentError("BLAST+ will only accept inputs with an AA "
                                 "or DNA alphabet")

        blast_root = self.environment['blast_plus_root']
        if blast_root is None:
            raise ComponentError(
                "need to know where you have installed BLAST+, "
                "supply in shell variable BLAST_PLUS_ROOT or "
                "PRALINE environment option blast_plus_root")

        search_exe = os.path.join(blast_root, 'bin', program)
        fetch_exe = os.path.join(blast_root, 'bin', 'blastdbcmd')

        # All intermediate files live in one scratch directory that is
        # removed again in the ``finally`` clause below.
        scratch_dir = tempfile.mkdtemp()
        report_path = os.path.join(scratch_dir, 'blast.out')
        query_path = os.path.join(scratch_dir, 'input.fasta')
        hits_path = os.path.join(scratch_dir, 'output.fasta')
        entry_list_path = os.path.join(scratch_dir, 'entries.txt')

        db_name = self.environment['db_name']
        num_seqs = self.environment['num_seqs']
        max_evalue = self.environment['max_evalue']

        try:
            write_sequence_fasta(query_path, [sequence], track_id)

            # Run the search; the report goes to stdout, which is redirected
            # into the report file.
            search_cmd = [search_exe, "-query", query_path, "-db", db_name,
                          "-outfmt", "5"]
            with open(report_path, 'w') as report_out:
                subprocess.check_call(search_cmd, stdout=report_out)

            # Only the first ``num_seqs`` reported entries are considered,
            # and of those only the ones within the e-value cutoff are kept.
            with open(report_path) as report_in:
                with open(entry_list_path, 'w') as entries_out:
                    hits = _parse_xml(report_in)
                    for rank, (hit_id, evalue) in enumerate(hits):
                        if rank >= num_seqs:
                            break
                        if evalue <= max_evalue:
                            entries_out.write("{0}\n".format(hit_id))

            # Fetch the selected entries from the database as FASTA.
            fetch_cmd = [fetch_exe, "-db", db_name, "-entry_batch",
                         entry_list_path, "-target_only"]
            with open(hits_path, 'w') as hits_out:
                subprocess.check_call(fetch_cmd, stdout=hits_out)

            seqs = load_sequence_fasta(hits_path, track.alphabet)
        finally:
            shutil.rmtree(scratch_dir)

        yield CompleteMessage(outputs={'sequences': seqs})
Exemple #2
0
def do_motif_annotation(args, env, manager, seqs, verbose, root_node):
    """Attach motif annotation tracks to every sequence in ``seqs``.

    Two sources of annotation are handled:

    * ``args.patterns``: each entry is ``(pattern,)`` or ``(pattern, score)``.
      One ``PrositePatternAnnotator`` task is scheduled per (sequence,
      pattern) combination and its ``prediction_track`` output is added to
      the sequence.
    * ``args.annotation_files``: each entry is ``(path,)`` or
      ``(path, score)``. The file is loaded as FASTA with the PROSITE
      alphabet and its tracks are matched to ``seqs`` by sequence name.

    Returns a dict mapping each added track id to its score as a float, or
    to ``None`` when the entry carried no score.

    Raises ``KeyError`` when an annotation file has no entry named after one
    of the input sequences.
    """
    id_template = "{0}_{1}"

    scores_by_track = {}

    # Schedule one annotator task per (sequence, pattern) and remember, in
    # the same order, what each task was created for.
    execution = Execution(manager, ROOT_TAG)
    scheduled = []
    for seq in seqs:
        for entry in args.patterns:
            pattern = entry[0]
            score = float(entry[1]) if len(entry) > 1 else None
            scheduled.append((seq, pattern, score))

            task = execution.add_task(PrositePatternAnnotator)
            task.environment(env)
            task.inputs(sequence=seq, pattern=pattern,
                        track_id=TRACK_ID_INPUT)

    # Outputs are matched back to their task by position.
    results = run(execution, verbose=verbose, root_node=root_node)
    for position, result in enumerate(results):
        owner, pattern, score = scheduled[position]

        pattern_trid = id_template.format(_TRACK_ID_BASE_PATTERN, pattern)
        owner.add_track(pattern_trid, result['prediction_track'])
        scores_by_track[pattern_trid] = score

    for entry in args.annotation_files:
        annotation_file = entry[0]
        score = float(entry[1]) if len(entry) > 1 else None

        annotated = load_sequence_fasta(annotation_file, ALPHABET_PROSITE)
        tracks_by_name = {}
        for annotated_seq in annotated:
            input_track = annotated_seq.get_track(TRACK_ID_INPUT)
            tracks_by_name[annotated_seq.name] = input_track

        # The track id depends only on the file, not on the sequence.
        file_trid = id_template.format(_TRACK_ID_BASE_FILE, annotation_file)
        for seq in seqs:
            seq.add_track(file_trid, tracks_by_name[seq.name])
            scores_by_track[file_trid] = score

    return scores_by_track
def do_motif_annotation(args, env, manager, seqs, verbose, root_node):
    """Attach motif annotation tracks to every sequence in ``seqs``.

    Two sources of annotation are handled:

    * ``args.patterns``: each entry is ``(pattern,)`` or ``(pattern, score)``.
      One ``PrositePatternAnnotator`` task is scheduled per (sequence,
      pattern) combination and its ``prediction_track`` output is added to
      the sequence.
    * ``args.annotation_files``: each entry is ``(path,)`` or
      ``(path, score)``. The file is loaded as FASTA with the PROSITE
      alphabet and its tracks are matched to ``seqs`` by sequence name.

    Returns a dict mapping each added track id to its score as a float, or
    to ``None`` when the entry carried no score.

    Raises ``KeyError`` when an annotation file has no entry named after one
    of the input sequences.
    """
    id_template = "{0}_{1}"

    scores_by_track = {}

    # Schedule one annotator task per (sequence, pattern) and remember, in
    # the same order, what each task was created for.
    execution = Execution(manager, ROOT_TAG)
    scheduled = []
    for seq in seqs:
        for entry in args.patterns:
            pattern = entry[0]
            score = float(entry[1]) if len(entry) > 1 else None
            scheduled.append((seq, pattern, score))

            task = execution.add_task(PrositePatternAnnotator)
            task.environment(env)
            task.inputs(sequence=seq, pattern=pattern,
                        track_id=TRACK_ID_INPUT)

    # Outputs are matched back to their task by position.
    results = run(execution, verbose=verbose, root_node=root_node)
    for position, result in enumerate(results):
        owner, pattern, score = scheduled[position]

        pattern_trid = id_template.format(_TRACK_ID_BASE_PATTERN, pattern)
        owner.add_track(pattern_trid, result['prediction_track'])
        scores_by_track[pattern_trid] = score

    for entry in args.annotation_files:
        annotation_file = entry[0]
        score = float(entry[1]) if len(entry) > 1 else None

        annotated = load_sequence_fasta(annotation_file, ALPHABET_PROSITE)
        tracks_by_name = {}
        for annotated_seq in annotated:
            input_track = annotated_seq.get_track(TRACK_ID_INPUT)
            tracks_by_name[annotated_seq.name] = input_track

        # The track id depends only on the file, not on the sequence.
        file_trid = id_template.format(_TRACK_ID_BASE_FILE, annotation_file)
        for seq in seqs:
            seq.add_track(file_trid, tracks_by_name[seq.name])
            scores_by_track[file_trid] = score

    return scores_by_track
Exemple #4
0
def create_msa_input(job, args, manager, root_node):
    """Prepare everything needed to run a DNA multiple sequence alignment.

    Loads the input sequences and the score matrix, annotates motifs, runs
    the master-slave alignments and builds the preprofiles. The final MSA
    step itself is not run here.

    Parameters:
        job: not used by this function.
        args: parsed options; reads ``input``, ``score_matrix``,
            ``gap_penalties``, ``debug``, ``motif_match_score`` and
            ``score_spacers`` (plus whatever the helper functions read).
        manager: execution manager handed to the helper functions.
        root_node: see the review note in the body; the passed value is
            currently replaced before it is used.

    Returns:
        A tuple ``(env, seqs, msa_track_id_sets, score_matrices)``.
    """
    verbose = False
    alphabet = ALPHABET_DNA

    seqs = load_sequence_fasta(args.input, alphabet)

    # Load inputs and other data.
    if args.score_matrix is not None:
        score_matrix_file = args.score_matrix
    else:
        score_matrix_file = 'nucleotide'

    # Read score parameters.
    with open_resource(score_matrix_file, "matrices") as f:
        score_matrices = [load_score_matrix(f, alphabet=alphabet)]
    # Gap penalties are given as positive numbers and stored negated.
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {}
    keys['gap_series'] = gap_series
    keys['debug'] = args.debug
    keys['merge_mode'] = 'global'
    keys['dist_mode'] = 'global'
    keys['accelerate'] = True
    env = Environment(keys=keys)

    # Initialize root node for output
    # NOTE(review): this discards the ``root_node`` argument passed by the
    # caller. Kept as-is to preserve behavior -- confirm whether intended.
    root_node = TaskNode(ROOT_TAG)

    # Annotate the motifs from the files and patterns.
    track_scores = do_motif_annotation(args, env, manager, seqs, verbose,
                                       root_node)
    # Build score matrices.
    motif_score_matrices = {}
    # ``dict.items()`` exists on both Python 2 and 3; ``iteritems()`` was
    # removed in Python 3.
    for trid, score in track_scores.items():
        if score is None:
            score = args.motif_match_score

        motif_score_matrices[trid] = get_motif_score_matrix(
            score, args.score_spacers)

    # Add all the new annotation tracks to the list of tracks to use
    # in the alignment. ``score_matrices`` is kept index-aligned with
    # ``track_id_sets``.
    track_id_sets = [[TRACK_ID_INPUT]]
    for trid, track in seqs[0].tracks:
        if trid in motif_score_matrices:
            track_id_sets.append([trid])
            score_matrices.append(motif_score_matrices[trid])

    # Build initial sets of which sequences to align against every master
    # sequence. By default, we want to align every input sequence against
    # every other input sequence.
    master_slave_seqs = []
    for master_seq in seqs:
        slave_seqs = [seq for seq in seqs if seq is not master_seq]
        master_slave_seqs.append((master_seq, slave_seqs))

    master_slave_alignments = do_master_slave_alignments(
        args, env, manager, master_slave_seqs, track_id_sets, score_matrices,
        verbose, root_node)

    # Build preprofiles from master-slave alignments.
    do_preprofiles(args, env, manager, master_slave_alignments, seqs, verbose,
                   root_node)
    msa_track_id_sets = _replace_input_track_id(track_id_sets)

    return env, seqs, msa_track_id_sets, score_matrices
Exemple #5
0
def main():
    """Command-line entry point for the motif-aware alignment tool.

    Parses the arguments, loads the sequences and score matrix, annotates
    motifs, builds preprofiles from master-slave alignments, runs the
    multiple sequence alignment and writes the result in the requested
    format. Optionally pickles the alignment and dumps every plain track to
    its own file.

    Raises:
        DataError: when the output format is unknown, or when no score
            matrix was given and no input type was selected to choose a
            default from.
    """
    # Parse arguments.
    args = parse_args()
    verbose = not args.quiet or args.verbose

    # See if we're doing DNA or protein alignments.
    # TODO: if unspecified, autodetect this based on input file contents?
    alphabet = None
    if args.input_dna:
        alphabet = ALPHABET_DNA
    elif args.input_protein:
        alphabet = ALPHABET_AA

    # Setup the execution manager.
    index = TypeIndex()
    index.autoregister()
    if args.num_threads > 1:
        manager = ParallelExecutionManager(index, args.num_threads)
    else:
        manager = Manager(index)

    seqs = load_sequence_fasta(args.input, alphabet)

    # Load inputs and other data.
    if args.score_matrix is not None:
        score_matrix_file = args.score_matrix
    elif alphabet == ALPHABET_AA:
        score_matrix_file = 'blosum62'
    elif alphabet == ALPHABET_DNA:
        score_matrix_file = 'nucleotide'
    else:
        # Without this branch the name would simply be unbound below and
        # surface as an UnboundLocalError.
        raise DataError("no score matrix given and no input type selected "
                        "to choose a default score matrix from")

    # Read score parameters.
    with open_resource(score_matrix_file, "matrices") as f:
        score_matrices = [load_score_matrix(f, alphabet=alphabet)]
    # Gap penalties are given as positive numbers and stored negated.
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {}
    keys['gap_series'] = gap_series
    keys['score_threshold'] = args.preprofile_score
    keys['linkage_method'] = args.tree_linkage
    keys['waterman_eggert_iterations'] = args.num_preprofile_alignments
    keys['debug'] = args.debug
    if args.merge_semiglobal:
        keys['merge_mode'] = 'semiglobal'
        keys['dist_mode'] = 'semiglobal'
    else:
        keys['merge_mode'] = 'global'
        keys['dist_mode'] = 'global'

    if args.no_accelerate:
        keys['accelerate'] = False
    else:
        keys['accelerate'] = True
    env = Environment(keys=keys)

    # Initialize root node for output
    root_node = TaskNode(ROOT_TAG)

    # Annotate the motifs from the files and patterns.
    track_scores = do_motif_annotation(args, env, manager, seqs,
                                       verbose, root_node)
    # Build score matrices.
    motif_score_matrices = {}
    for trid, score in six.iteritems(track_scores):
        if score is None:
            score = args.motif_match_score

        motif_score_matrices[trid] = get_motif_score_matrix(score,
                                                            args.score_spacers)

    # Add all the new annotation tracks to the list of tracks to use
    # in the alignment. ``score_matrices`` is kept index-aligned with
    # ``track_id_sets``, so index 0 stays the input-track matrix.
    track_id_sets = [[TRACK_ID_INPUT]]
    for trid, track in seqs[0].tracks:
        if trid in motif_score_matrices:
            track_id_sets.append([trid])
            score_matrices.append(motif_score_matrices[trid])

    # Build initial sets of which sequences to align against every master
    # sequence. By default, we want to align every input sequence against
    # every other input sequence.
    master_slave_seqs = []
    for master_seq in seqs:
        slave_seqs = [seq for seq in seqs if seq is not master_seq]
        master_slave_seqs.append((master_seq, slave_seqs))

    master_slave_alignments = do_master_slave_alignments(args, env,
                                                         manager,
                                                         master_slave_seqs,
                                                         track_id_sets,
                                                         score_matrices,
                                                         verbose,
                                                         root_node)

    # Build preprofiles from master-slave alignments.
    do_preprofiles(args, env, manager, master_slave_alignments, seqs,
                   verbose, root_node)
    msa_track_id_sets = _replace_input_track_id(track_id_sets)

    # Do multiple sequence alignment from preprofile-annotated sequences.
    alignment = do_multiple_sequence_alignment(args, env, manager, seqs,
                                               msa_track_id_sets,
                                               score_matrices,
                                               verbose, root_node)

    # Write alignment to output file.
    outfmt = args.output_format
    if outfmt == 'fasta':
        write_alignment_fasta(args.output, alignment, TRACK_ID_INPUT)
    elif outfmt == "clustal":
        # The matrix for the input track is the first one loaded; the name
        # ``score_matrix`` used here before was never defined.
        write_alignment_clustal(args.output, alignment, TRACK_ID_INPUT,
                                score_matrices[0])
    else:
        raise DataError("unknown output format: '{0}'".format(outfmt))

    # Dump pickled alignment object if user asked for it.
    if args.dump_alignment is not None:
        with open(args.dump_alignment, 'wb') as fo:
            pickle.dump(alignment, fo)

    if args.dump_all_tracks is not None:
        try:
            os.mkdir(args.dump_all_tracks)
        except OSError:
            # An already existing directory is fine; any other failure
            # (permissions, missing parent, ...) should not be hidden.
            if not os.path.isdir(args.dump_all_tracks):
                raise

        # Only tracks of the plain track type are dumped.
        all_trids = []
        for trid, track in alignment.items[0].tracks:
            if track.tid == PlainTrack.tid:
                all_trids.append(trid)

        for trid in all_trids:
            filename = "dump-{0}.aln".format(trid)
            path = os.path.join(args.dump_all_tracks, filename)

            if outfmt == "fasta":
                write_alignment_fasta(path, alignment, trid)
            elif outfmt == "clustal":
                write_alignment_clustal(path, alignment, trid, None)
            else:
                raise DataError("unknown output format: '{0}'".format(outfmt))

    if verbose:
        sys.stdout.write('\n')

    # Collect log bundles
    if args.debug > 0:
        write_log_structure(root_node)
Exemple #6
0
def main():
    """Command-line entry point for the PRALINE MSA workflow.

    Parses the arguments, picks an execution manager (remote, parallel or
    plain), loads the AA score matrix and sequences, assembles the
    environment from the command-line options, runs
    ``PralineMultipleSequenceAlignmentWorkflow`` and writes the resulting
    alignment as FASTA or CLUSTAL.

    Raises ``DataError`` when the requested output format is unknown.
    """
    # Parse arguments.
    args = parse_args()
    verbose = not args.quiet or args.verbose

    # Setup the execution manager.
    index = TypeIndex()
    index.autoregister()
    if args.remote:
        if args.remote_secret is not None:
            # The secret is the first line of the given file, as returned by
            # ``readline`` (line terminator included, if any).
            with open(args.remote_secret, 'r') as secret_file:
                secret = secret_file.readline()
        else:
            secret = "__MUCH_SECRITY__"

        manager = RemoteManager(index, args.remote_host, args.remote_port,
                                secret)
    elif args.num_threads > 1:
        manager = ParallelExecutionManager(index, args.num_threads - 1)
    else:
        manager = Manager(index)

    # Register manager cleanup code at exit.
    atexit.register(_atexit_close_manager, manager=manager)

    # Load inputs and other data.
    with open_resource(args.score_matrix, "matrices") as matrix_file:
        score_matrix = load_score_matrix(matrix_file, alphabet=ALPHABET_AA)
    seqs = load_sequence_fasta(args.input, ALPHABET_AA)
    # Gap penalties are given as positive numbers and stored negated.
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment: start with the options copied over one-to-one.
    keys = {
        'gap_series': gap_series,
        'db_name': args.psi_blast_db,
        'num_seqs': args.psi_blast_num,
        'max_evalue': args.psi_blast_evalue,
        'profile_evalue': args.psi_blast_inclusion,
        'num_iterations': args.psi_blast_iters,
        'score_threshold': args.preprofile_score,
        'linkage_method': args.tree_linkage,
        'waterman_eggert_iterations': args.num_preprofile_alignments,
        'aligner': PairwiseAligner.tid,
        'debug': args.debug,
    }

    # Mode options: the first flag that is set wins, otherwise the fallback.
    keys['merge_mode'] = ('semiglobal_auto' if args.merge_semiglobal_auto
                          else 'semiglobal' if args.merge_semiglobal
                          else 'global')
    keys['dist_mode'] = ('semiglobal_auto' if args.dist_semiglobal_auto
                         else 'semiglobal' if args.dist_semiglobal
                         else 'global')
    keys['msa_mode'] = 'tree' if args.pregen_tree else 'ad_hoc'
    keys['preprofile_mode'] = ('global' if args.preprofile_global
                               else 'local' if args.preprofile_local
                               else 'dummy')

    # These keys are only set when requested / available.
    if args.psi_blast:
        keys['run_psi_blast'] = True

    keys['accelerate'] = not args.no_accelerate

    if 'BLAST_PLUS_ROOT' in os.environ:
        keys['blast_plus_root'] = os.environ['BLAST_PLUS_ROOT']

    env = Environment(keys=keys)

    # Initialize root node for output
    root_node = TaskNode(ROOT_TAG)

    # Run the PRALINE MSA workflow
    execution = Execution(manager, ROOT_TAG)
    task = execution.add_task(PralineMultipleSequenceAlignmentWorkflow)
    task.inputs(sequences=seqs, score_matrix=score_matrix)
    task.environment(env)

    first_output = run(execution, verbose, root_node)[0]
    alignment = first_output['alignment']

    # Write alignment to output file.
    outfmt = args.output_format
    if outfmt not in ('fasta', 'clustal'):
        raise DataError("unknown output format: '{0}'".format(outfmt))
    if outfmt == 'fasta':
        write_alignment_fasta(args.output, alignment, TRACK_ID_INPUT)
    else:
        write_alignment_clustal(args.output, alignment, TRACK_ID_INPUT,
                                score_matrix)

    if verbose:
        sys.stdout.write('\n')

    # Collect log bundles
    if args.debug > 0:
        write_log_structure(root_node)
def create_msa_input(job, args, manager, root_node):
    """Prepare everything needed to run a DNA multiple sequence alignment.

    Loads the input sequences and the score matrix, annotates motifs, runs
    the master-slave alignments and builds the preprofiles. The final MSA
    step itself is not run here.

    Parameters:
        job: not used by this function.
        args: parsed options; reads ``input``, ``score_matrix``,
            ``gap_penalties``, ``debug``, ``motif_match_score`` and
            ``score_spacers`` (plus whatever the helper functions read).
        manager: execution manager handed to the helper functions.
        root_node: see the review note in the body; the passed value is
            currently replaced before it is used.

    Returns:
        A tuple ``(env, seqs, msa_track_id_sets, score_matrices)``.
    """
    verbose = False
    alphabet = ALPHABET_DNA

    seqs = load_sequence_fasta(args.input, alphabet)

    # Load inputs and other data.
    if args.score_matrix is not None:
        score_matrix_file = args.score_matrix
    else:
        score_matrix_file = 'nucleotide'

    # Read score parameters.
    with open_resource(score_matrix_file, "matrices") as f:
        score_matrices = [load_score_matrix(f, alphabet=alphabet)]
    # Gap penalties are given as positive numbers and stored negated.
    gap_series = [-float(x) for x in args.gap_penalties.split(",")]

    # Setup environment.
    keys = {}
    keys['gap_series'] = gap_series
    keys['debug'] = args.debug
    keys['merge_mode'] = 'global'
    keys['dist_mode'] = 'global'
    keys['accelerate'] = True
    env = Environment(keys=keys)

    # Initialize root node for output
    # NOTE(review): this discards the ``root_node`` argument passed by the
    # caller. Kept as-is to preserve behavior -- confirm whether intended.
    root_node = TaskNode(ROOT_TAG)

    # Annotate the motifs from the files and patterns.
    track_scores = do_motif_annotation(args, env, manager, seqs,
                                       verbose, root_node)
    # Build score matrices.
    motif_score_matrices = {}
    # ``dict.items()`` exists on both Python 2 and 3; ``iteritems()`` was
    # removed in Python 3.
    for trid, score in track_scores.items():
        if score is None:
            score = args.motif_match_score

        motif_score_matrices[trid] = get_motif_score_matrix(score,
                                                            args.score_spacers)

    # Add all the new annotation tracks to the list of tracks to use
    # in the alignment. ``score_matrices`` is kept index-aligned with
    # ``track_id_sets``.
    track_id_sets = [[TRACK_ID_INPUT]]
    for trid, track in seqs[0].tracks:
        if trid in motif_score_matrices:
            track_id_sets.append([trid])
            score_matrices.append(motif_score_matrices[trid])

    # Build initial sets of which sequences to align against every master
    # sequence. By default, we want to align every input sequence against
    # every other input sequence.
    master_slave_seqs = []
    for master_seq in seqs:
        slave_seqs = [seq for seq in seqs if seq is not master_seq]
        master_slave_seqs.append((master_seq, slave_seqs))

    master_slave_alignments = do_master_slave_alignments(args, env,
                                                         manager,
                                                         master_slave_seqs,
                                                         track_id_sets,
                                                         score_matrices,
                                                         verbose,
                                                         root_node)

    # Build preprofiles from master-slave alignments.
    do_preprofiles(args, env, manager, master_slave_alignments, seqs,
                   verbose, root_node)
    msa_track_id_sets = _replace_input_track_id(track_id_sets)

    return env, seqs, msa_track_id_sets, score_matrices