import logging
import os
import shutil

from Bio import SeqIO

# DeepBGCAnnotator, DeepBGCDetector, DeepBGCClassifier, util and the writer
# classes referenced below are assumed to be imported from the deepbgc package.
# The run() methods below are instance methods of DeepBGC's command classes,
# shown here without their enclosing class definitions.


def run(self, inputs, limit_to_record, output_gbk, output_tsv, prodigal_meta_mode, protein):
    first_output = output_gbk or output_tsv
    if not first_output:
        raise ValueError('Specify at least one of --output-gbk or --output-tsv')

    # Work in a temporary directory placed next to the first output file
    tmp_dir_path = first_output + '.tmp'
    logging.debug('Using TMP dir: %s', tmp_dir_path)
    if not os.path.exists(tmp_dir_path):
        os.mkdir(tmp_dir_path)

    prepare_step = DeepBGCAnnotator(tmp_dir_path=tmp_dir_path, prodigal_meta_mode=prodigal_meta_mode)

    # One writer per requested output format
    writers = []
    if output_gbk:
        writers.append(GenbankWriter(out_path=output_gbk))
    if output_tsv:
        writers.append(PfamTSVWriter(out_path=output_tsv))

    num_records = 0
    for i, input_path in enumerate(inputs):
        logging.info('Processing input file %s/%s: %s', i + 1, len(inputs), input_path)
        with util.SequenceParser(input_path, protein=protein) as parser:
            for record in parser.parse():
                if limit_to_record and record.id not in limit_to_record:
                    logging.debug('Skipping record %s not matching filter %s', record.id, limit_to_record)
                    continue
                prepare_step.run(record)
                for writer in writers:
                    writer.write(record)
                num_records += 1

    logging.debug('Removing TMP directory: %s', tmp_dir_path)
    shutil.rmtree(tmp_dir_path)

    prepare_step.print_summary()

    for writer in writers:
        writer.close()

    logging.info('Saved %s fully annotated records to %s', num_records, first_output)
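
# The prepare step above relies on a simple two-method contract: run(record)
# annotates a single SeqRecord in place, and print_summary() reports totals
# at the end. A minimal sketch of that contract, inferred from the calls
# above (hypothetical illustration, not the actual DeepBGCAnnotator):
class AnnotationStepSketch:
    def __init__(self):
        self.num_records = 0

    def run(self, record):
        # A real step would add features (genes, Pfam domains) to the record;
        # here we only count the records we have seen.
        self.num_records += 1

    def print_summary(self):
        logging.info('Processed %s record(s)', self.num_records)
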
# Variant of prepare's run() that guesses the input format from the file
# extension and parses directly with Biopython's SeqIO.
def run(self, inputs, output_gbk, output_tsv):
    first_output = output_gbk or output_tsv
    if not first_output:
        raise ValueError('Specify at least one of --output-gbk or --output-tsv')

    tmp_dir_path = first_output + '.tmp'
    logging.debug('Using TMP dir: %s', tmp_dir_path)
    if not os.path.exists(tmp_dir_path):
        os.mkdir(tmp_dir_path)

    prepare_step = DeepBGCAnnotator(tmp_dir_path=tmp_dir_path)

    writers = []
    if output_gbk:
        writers.append(GenbankWriter(out_path=output_gbk))
    if output_tsv:
        writers.append(PfamTSVWriter(out_path=output_tsv))

    num_records = 0
    for input_path in inputs:
        fmt = util.guess_format(input_path)
        if not fmt:
            raise NotImplementedError(
                'Sequence file type not recognized: {}. Please provide a GenBank '
                'or FASTA sequence with an appropriate file extension.'.format(input_path))
        records = SeqIO.parse(input_path, fmt)
        for record in records:
            prepare_step.run(record)
            for writer in writers:
                writer.write(record)
            num_records += 1

    logging.debug('Removing TMP directory: %s', tmp_dir_path)
    shutil.rmtree(tmp_dir_path)

    prepare_step.print_summary()

    for writer in writers:
        writer.close()

    logging.info('Saved %s fully annotated records to %s', num_records, first_output)
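
# The format detection above delegates to util.guess_format. A minimal sketch
# of extension-based detection (a hypothetical illustration, not the actual
# deepbgc util implementation) that maps common extensions to Biopython SeqIO
# format names and returns None when the extension is unknown:
def guess_format_sketch(file_path):
    _, ext = os.path.splitext(file_path.lower())
    return {
        '.fa': 'fasta', '.fna': 'fasta', '.fasta': 'fasta',
        '.gbk': 'genbank', '.gb': 'genbank', '.genbank': 'genbank',
    }.get(ext)
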
# The pipeline run(): annotate each record, detect BGC regions, classify them,
# and write the full set of result files and evaluation plots.
def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_classifier,
        is_minimal_output, limit_to_record, score, classifier_score,
        merge_max_protein_gap, merge_max_nucl_gap, min_nucl, min_proteins,
        min_domains, min_bio_domains, prodigal_meta_mode, protein):
    if not detectors:
        detectors = ['deepbgc']
    if not classifiers:
        classifiers = ['product_class', 'product_activity']

    if not output:
        # If not specified, set the output path to the name of the first input file without extension
        output, _ = os.path.splitext(os.path.basename(os.path.normpath(inputs[0])))

    if not os.path.exists(output):
        os.mkdir(output)

    # Save log to LOG.txt file
    logger = logging.getLogger('')
    logger.addHandler(logging.FileHandler(os.path.join(output, self.LOG_FILENAME)))

    # Define report dir paths
    tmp_path = os.path.join(output, self.TMP_DIRNAME)
    evaluation_path = os.path.join(output, self.PLOT_DIRNAME)
    output_file_name = os.path.basename(os.path.normpath(output))

    # Processing steps: annotation first, then one detector per requested model
    steps = []
    steps.append(DeepBGCAnnotator(tmp_dir_path=tmp_path, prodigal_meta_mode=prodigal_meta_mode))
    if not no_detector:
        if not labels:
            labels = [None] * len(detectors)
        elif len(labels) != len(detectors):
            raise ValueError(
                'A separate label should be provided for each of the detectors: {}'.format(detectors))
        for detector, label in zip(detectors, labels):
            steps.append(DeepBGCDetector(
                detector=detector,
                label=label,
                score_threshold=score,
                merge_max_protein_gap=merge_max_protein_gap,
                merge_max_nucl_gap=merge_max_nucl_gap,
                min_nucl=min_nucl,
                min_proteins=min_proteins,
                min_domains=min_domains,
                min_bio_domains=min_bio_domains))

    writers = []
    writers.append(GenbankWriter(out_path=os.path.join(output, output_file_name + '.full.gbk')))
    writers.append(AntismashJSONWriter(out_path=os.path.join(output, output_file_name + '.antismash.json')))
    is_evaluation = False
    if not is_minimal_output:
        writers.append(BGCGenbankWriter(out_path=os.path.join(output, output_file_name + '.bgc.gbk')))
        writers.append(ClusterTSVWriter(out_path=os.path.join(output, output_file_name + '.bgc.tsv')))
        writers.append(PfamTSVWriter(out_path=os.path.join(output, output_file_name + '.pfam.tsv')))
        is_evaluation = True
        writers.append(PfamScorePlotWriter(out_path=os.path.join(evaluation_path, output_file_name + '.score.png')))
        writers.append(BGCRegionPlotWriter(out_path=os.path.join(evaluation_path, output_file_name + '.bgc.png')))
        writers.append(ROCPlotWriter(out_path=os.path.join(evaluation_path, output_file_name + '.roc.png')))
        writers.append(PrecisionRecallPlotWriter(out_path=os.path.join(evaluation_path, output_file_name + '.pr.png')))

    writers.append(ReadmeWriter(out_path=os.path.join(output, 'README.txt'), root_path=output, writers=writers))

    if not no_classifier:
        for classifier in classifiers:
            steps.append(DeepBGCClassifier(classifier=classifier, score_threshold=classifier_score))

    # Create temp and evaluation dir
    if not os.path.exists(tmp_path):
        os.mkdir(tmp_path)
    if is_evaluation and not os.path.exists(evaluation_path):
        os.mkdir(evaluation_path)

    record_idx = 0
    for i, input_path in enumerate(inputs):
        logging.info('Processing input file %s/%s: %s', i + 1, len(inputs), input_path)
        with util.SequenceParser(input_path, protein=protein) as parser:
            for record in parser.parse():
                if limit_to_record and record.id not in limit_to_record:
                    logging.debug('Skipping record %s not matching filter %s', record.id, limit_to_record)
                    continue
                record_idx += 1
                logging.info('=' * 80)
                logging.info('Processing record #%s: %s', record_idx, record.id)
                # Run every pipeline step on the record, then save it with every writer
                for step in steps:
                    step.run(record)

                logging.info('Saving processed record %s', record.id)
                for writer in writers:
                    writer.write(record)

    logging.info('=' * 80)
    for step in steps:
        step.print_summary()

    for writer in writers:
        writer.close()

    logging.info('=' * 80)
    logging.info('Saved DeepBGC result to: {}'.format(output))
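
# Every writer above is used through the same duck-typed interface: constructed
# with out_path, fed records via write(record), then closed with close().
# A minimal sketch of that contract, inferred from the calls above
# (hypothetical illustration, not an actual deepbgc base class):
class RecordWriterSketch:
    def __init__(self, out_path):
        self.out_path = out_path
        self.num_records = 0

    def write(self, record):
        # A concrete writer would serialize the record (GenBank, TSV, JSON, plot);
        # here we only keep a count.
        self.num_records += 1

    def close(self):
        # Flush output and release any underlying file handles.
        logging.debug('Wrote %s record(s) to %s', self.num_records, self.out_path)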