Beispiel #1
0
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run prodigal to annotate prokaryotic sequences

        The record is written out as FASTA, prodigal is run over that file
        with 'sco' output, and each gene line of the output is converted into
        a CDS feature which is added to the record.

        Arguments:
            record: the record to find genes in; CDS features are added to it
            options: the config object; the optional prodigal "basedir" and
                     the "genefinding_tool" settings are read from it

        Returns:
            None

        Raises:
            RuntimeError: if the stderr output of prodigal contains 'Error'
    """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    # relative file names are used below, so this relies on change=True
    # making the temporary directory the working directory (TODO confirm)
    with TemporaryDirectory(change=True):
        # presumably stripped so the file name can't be read as an option
        name = record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        # meta mode when explicitly requested, and always for short records
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal).stderr
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)

        found = 0
        # a context manager ensures the result file is closed again, even if
        # building a feature raises part way through
        with open(result_file, 'r') as results:
            for line in results:
                # only lines starting with '>' describe a gene
                if not line.startswith('>'):
                    continue

                # the unpacking is inside the try so that a line with the
                # wrong number of fields is reported and skipped in the same
                # way as a line with non-numeric coordinates
                try:
                    gene_id, start_chunk, end_chunk, prodigal_strand = \
                        line[1:].rstrip().split("_")
                    start = int(start_chunk)
                    end = int(end_chunk)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r',
                                  line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1

                # reversed coordinates always mean the reverse strand
                if start > end:
                    strand = -1
                    start, end = end, start

                # the start is shifted down by one when building the location
                loc = FeatureLocation(start - 1, end, strand=strand)
                translation = record.get_aa_translation_from_location(loc)
                feature = CDSFeature(loc,
                                     locus_tag='ctg%s_%s' %
                                     (record.record_index, gene_id),
                                     translation=translation,
                                     translation_table=record.transl_table)
                record.add_cds_feature(feature)
                found += 1
    logging.debug("prodigal found %d CDS features", found)
Beispiel #2
0
def _run_antismash(sequence_file: Optional[str], options: ConfigType) -> int:
    """ The pipeline proper; logging must already be set up by the caller.

        Arguments:
            sequence_file: the sequence file to read records from, or None
                           (in which case options.reuse_results is used when
                           preparing the output directory)
            options: the config object for this run

        Returns:
            0 after listing plugins, after a successful prerequisite-only
            check, or after a completed run;
            1 if a prerequisite-only check raised a RuntimeError or if the
            options failed verification

        Raises:
            ValueError: if no modules are enabled
    """
    logging.info("antiSMASH version: %s", options.version)
    _log_found_executables(options)

    detection_modules = get_detection_modules()
    analysis_modules = get_analysis_modules()
    output_modules = get_output_modules()
    every_module = detection_modules + analysis_modules + output_modules

    if options.list_plugins:
        list_plugins(every_module)
        return 0

    options.all_enabled_modules = [module for module in every_module
                                   if module.is_enabled(options)]

    # a prerequisite-only run covers every module, enabled or not, and
    # never goes on to process any data
    if options.check_prereqs_only:
        try:
            check_prerequisites(every_module, options)
        except RuntimeError:
            print("Some module prerequisites not satisfied")
            return 1
        print("All prerequisites satisfied")
        return 0

    check_prerequisites(options.all_enabled_modules, options)

    # profiling, when requested, covers everything from here on
    if options.profile:
        profiler = cProfile.Profile()
        profiler.enable()

    # invalid options end the run with a failure code
    if not verify_options(options, options.all_enabled_modules):
        return 1

    # without at least one enabled module there is nothing to do
    if not options.all_enabled_modules:
        raise ValueError("No detection or analysis modules enabled")

    start_time = datetime.now()

    results = read_data(sequence_file, options)

    # discard any module timings that came along with the results
    results.timings_by_record.clear()

    prepare_output_directory(options.output_dir, sequence_file or options.reuse_results)

    results.records = record_processing.pre_process_sequences(
        results.records,
        options,
        cast(AntismashModule, genefinding),
    )
    for record, module_results in zip(results.records, results.results):
        # records marked as skipped are left alone entirely
        if record.skip:
            continue
        logging.info("Analysing record: %s", record.id)
        timings = run_detection(record, options, module_results)
        # analysis only runs, and timings are only kept, when detection
        # produced at least one region
        if record.get_regions():
            timings.update(analyse_record(record, options, analysis_modules, module_results))
            results.timings_by_record[record.id] = timings

    # the JSON output takes the input file's name with the extension swapped
    json_base, _ = os.path.splitext(os.path.join(options.output_dir, results.input_file))
    json_filename = json_base + ".json"
    logging.debug("Writing json results to '%s'", json_filename)
    results.write_to_file(json_filename)

    # records are annotated only after the JSON has been written,
    # otherwise some areas could be annotated twice
    annotate_records(results)

    # generate the remaining output files
    write_outputs(results, options)

    # stop profiling and store what was gathered
    if options.profile:
        profiler.disable()
        profile_path = os.path.join(options.output_dir, "profiling_results")
        write_profiling_results(profiler, profile_path)

    running_time = datetime.now() - start_time

    # per-module runtimes are shown ahead of the total runtime
    if options.debug:
        log_module_runtimes(results.timings_by_record)

    logging.debug("antiSMASH calculation finished at %s; runtime: %s",
                  datetime.now().strftime("%Y-%m-%d %H:%M:%S"), str(running_time))

    logging.info("antiSMASH status: SUCCESS")
    return 0
Beispiel #3
0
def run_antismash(sequence_file: Optional[str], options: ConfigType) -> int:
    """ The complete antismash pipeline. Sets up logging, reads in data, runs
        detection and analysis modules over any records found, then writes
        the results to file.

        Arguments:
            sequence_file: the sequence file to read in records from, can be
                            None if reusing results
            options: command line options

        Returns:
            0 if requested operations completed successfully, otherwise 1

        Raises:
            ValueError: if no detection or analysis modules are enabled
    """
    setup_logging(logfile=options.logfile, verbose=options.verbose,
                  debug=options.debug)

    detection_modules = get_detection_modules()
    analysis_modules = get_analysis_modules()
    available = detection_modules + analysis_modules

    if options.list_plugins:
        list_plugins(available)
        return 0

    options.all_enabled_modules = [plugin for plugin in available
                                   if plugin.is_enabled(options)]

    # when only checking prerequisites, every module is checked and the
    # function returns without processing any data
    if options.check_prereqs_only:
        try:
            check_prerequisites(available)
        except RuntimeError:
            print("Some module prerequisites not satisfied")
            return 1
        print("All prerequisites satisfied")
        return 0

    check_prerequisites(options.all_enabled_modules)

    # begin profiling if it was asked for
    if options.profile:
        profiler = cProfile.Profile()
        profiler.enable()

    # bail out on invalid options
    options_valid = verify_options(options, options.all_enabled_modules)
    if not options_valid:
        return 1  # TODO: change to a raise?

    # at least one module has to be enabled for a run to make sense
    if not options.all_enabled_modules:
        raise ValueError("No detection or analysis modules enabled")

    start_time = datetime.now()

    results = read_data(sequence_file, options)

    # drop any existing module timings
    results.timings_by_record.clear()

    output_source = sequence_file or options.reuse_results
    prepare_output_directory(options.output_dir, output_source)

    results.records = record_processing.pre_process_sequences(
        results.records, options, cast(AntismashModule, genefinding))
    for record, module_results in zip(results.records, results.results):
        # records flagged to be skipped are ignored
        if record.skip:
            continue
        timings = run_detection(record, options, module_results)
        # analysis is only run, and timings only stored, for records in
        # which detection found clusters
        if record.get_clusters():
            analysis_timings = analyse_record(record, options,
                                              analysis_modules, module_results)
            timings.update(analysis_timings)
            results.timings_by_record[record.id] = timings

    # the JSON file is named after the input file, with a .json extension
    json_stem = os.path.splitext(
        os.path.join(options.output_dir, results.input_file))[0]
    json_filename = json_stem + ".json"
    logging.debug("Writing json results to '%s'", json_filename)
    results.write_to_file(json_filename)

    # with the json written, the records can be annotated without
    # risking double annotation of some areas
    annotate_records(results)

    # write all the other outputs
    write_outputs(results, options)

    # finish profiling and save the data
    if options.profile:
        profiler.disable()
        profiling_path = os.path.join(options.output_dir, "profiling_results")
        write_profiling_results(profiler, profiling_path)

    running_time = datetime.now() - start_time

    # module runtimes are logged before the overall runtime
    if options.debug:
        log_module_runtimes(results.timings_by_record)

    logging.debug("antiSMASH calculation finished at %s; runtime: %s",
                  str(datetime.now()), str(running_time))

    logging.info("antiSMASH status: SUCCESS")
    return 0