def test_unit_train_classify(tmpdir):
    """Train a classifier through the ``train`` command and check its predictions.

    The trained model is written to a pickle file in ``tmpdir``, loaded back
    and applied to the training domains grouped by ``sequence_id``.
    """
    model_path = os.path.join(str(tmpdir), 'model.pkl')
    train_args = [
        'train',
        '--model', get_test_file('random_forest_test.json'),
        '--classes', get_test_file('BGC0000015.classes.csv'),
        '--output', model_path,
        get_test_file('BGC0000015.pfam.csv'),
    ]
    run(train_args)
    assert os.path.exists(model_path)

    classifier = SequenceModelWrapper.load(model_path)
    domain_table = pd.read_csv(get_test_file('BGC0000015.pfam.csv'))
    # One sample (DataFrame of domains) per sequence
    samples = [group for _, group in domain_table.groupby('sequence_id')]
    classes = classifier.predict(samples)

    assert isinstance(classes, pd.DataFrame)
    assert list(classes.columns) == ['class1', 'class2', 'class3', 'class4']
    assert len(classes.index) == 2

    # Expected "score above 0.5" pattern for each predicted row, in row order
    expected_flags = [
        [True, False, True, False],
        [False, True, False, True],
    ]
    for row_position, flags in enumerate(expected_flags):
        assert list(classes.iloc[row_position] > 0.5) == flags
def test_unit_train_detect(model, tmpdir):
    """Train a detector through the ``train`` command and check its predictions.

    ``model`` is the name of the model config test file (resolved through
    ``get_test_file``). The trained detector must score the positive sample
    above 0.5 on average and the negative sample below 0.5 on average.
    """
    model_path = os.path.join(str(tmpdir), 'model.pkl')
    train_args = [
        'train',
        '--model', get_test_file(model),
        '--config', 'PFAM2VEC', get_test_file('pfam2vec.test.tsv'),
        '--output', model_path,
        get_test_file('BGC0000015.pfam.csv'),
        get_test_file('negative.pfam.csv'),
    ]
    run(train_args)
    assert os.path.exists(model_path)

    detector = SequenceModelWrapper.load(model_path)

    positive_domains = pd.read_csv(get_test_file('BGC0000015.pfam.csv'))
    negative_domains = pd.read_csv(get_test_file('negative.pfam.csv'))

    positive_scores = detector.predict(positive_domains)
    negative_scores = detector.predict(negative_domains)

    # Predictions are Series...
    assert isinstance(positive_scores, pd.Series)
    assert isinstance(negative_scores, pd.Series)

    # ...indexed the same way as the input domain tables
    assert positive_scores.index.equals(positive_domains.index)
    assert negative_scores.index.equals(negative_domains.index)

    assert positive_scores.mean() > 0.5
    assert negative_scores.mean() < 0.5
def test_integration_train_detect_fail_fasta():
    """Training on a raw FASTA file must raise ``NotImplementedError``."""
    train_args = [
        'train',
        '--model', get_test_file('clusterfinder_geneborder_test.json'),
        '--output', 'bar.pkl',
        get_test_file('BGC0000015.fa'),
    ]
    # Should fail due to unprocessed input sequence
    with pytest.raises(NotImplementedError):
        run(train_args)
def test_integration_pipeline_labelled(tmpdir):
    """Run the pipeline on a labelled GenBank file and check the evaluation plots exist."""
    report_dir = os.path.join(str(tmpdir), 'report')
    run(['pipeline', '--output', report_dir, get_test_file('labelled.gbk')])

    evaluation_files = os.listdir(os.path.join(report_dir, 'evaluation'))
    # Print the directory listing so it shows up in the captured test output
    for file_name in evaluation_files:
        print(file_name)

    expected_plots = (
        'report.bgc.png',
        'report.score.png',
        'report.roc.png',
        'report.pr.png',
    )
    for plot_name in expected_plots:
        assert plot_name in evaluation_files
def test_integration_prepare_default(tmpdir):
    """Run ``prepare`` on a FASTA file and check the GenBank and TSV outputs.

    Both outputs must contain two sequences with the expected numbers of
    proteins and Pfam domains.
    """
    out_dir = str(tmpdir)
    gbk_path = os.path.join(out_dir, 'outfile.gbk')
    tsv_path = os.path.join(out_dir, 'outfile.tsv')
    prepare_args = [
        'prepare',
        '--output-gbk', gbk_path,
        '--output-tsv', tsv_path,
        get_test_file('BGC0000015.fa'),
    ]
    run(prepare_args)

    # --- GenBank output ---
    gbk_records = list(SeqIO.parse(gbk_path, 'genbank'))
    assert len(gbk_records) == 2

    first_record = gbk_records[0]
    assert_sorted_features(first_record)
    first_proteins = util.get_protein_features(first_record)
    first_pfams = util.get_pfam_features(first_record)
    assert len(first_proteins) == 18
    print([util.get_protein_id(feature) for feature in first_proteins])
    assert len(first_pfams) == 111

    second_record = gbk_records[1]
    assert_sorted_features(second_record)
    second_proteins = util.get_protein_features(second_record)
    second_pfams = util.get_pfam_features(second_record)
    assert len(second_proteins) == 27
    assert len(second_pfams) == 36

    # --- TSV output ---
    domain_table = pd.read_csv(tsv_path, sep='\t')
    by_sequence = domain_table.groupby('sequence_id')
    assert len(by_sequence) == 2

    first_rows = by_sequence.get_group('BGC0000015.1')
    first_protein_ids = first_rows['protein_id'].unique()
    print(first_protein_ids)
    # some of the proteins do not have any Pfam domains so they are not present
    assert len(first_protein_ids) == 17
    assert len(first_rows) == 111

    second_rows = by_sequence.get_group('BGC0000015.2')
    second_protein_ids = second_rows['protein_id'].unique()
    # some of the proteins do not have any Pfam domains so they are not present
    assert len(second_protein_ids) == 11
    assert len(second_rows) == 36
def test_integration_pipeline_default(tmpdir, input_file):
    """Run the pipeline with default options and check the generated report.

    ``input_file`` is the name of the input test file (resolved through
    ``get_test_file``). The report directory must contain the expected output
    files, and each of the two output records must carry at least one cluster
    feature.
    """
    report_dir = os.path.join(str(tmpdir), 'report')
    run(['pipeline', '--output', report_dir, get_test_file(input_file)])

    report_files = os.listdir(report_dir)
    for file_name in report_files:
        print(file_name)
    expected_reports = (
        'README.txt',
        'report.bgc.gbk',
        'report.bgc.tsv',
        'report.full.gbk',
        'report.pfam.tsv',
    )
    for expected_name in expected_reports:
        assert expected_name in report_files

    evaluation_files = os.listdir(os.path.join(report_dir, 'evaluation'))
    for file_name in evaluation_files:
        print(file_name)
    for expected_name in ('report.bgc.png', 'report.score.png'):
        assert expected_name in evaluation_files

    full_gbk_path = os.path.join(report_dir, 'report.full.gbk')
    full_records = list(SeqIO.parse(full_gbk_path, 'genbank'))
    assert len(full_records) == 2
    # Exactly two records at this point; each needs at least one cluster feature
    for full_record in full_records:
        assert len(util.get_cluster_features(full_record)) >= 1

    bgc_gbk_path = os.path.join(report_dir, 'report.bgc.gbk')
    cluster_records = list(SeqIO.parse(bgc_gbk_path, 'genbank'))
    assert len(cluster_records) >= 2
def test_unit_pipeline_default(tmpdir, mocker):
    """Check that the ``pipeline`` command forwards CLI options to its steps and writers.

    ``os.mkdir``, ``logging.FileHandler`` and every collaborator referenced
    through ``deepbgc.command.pipeline`` (sequence parser, annotator, detector,
    classifier and writers) are replaced by mocks. The assertions inspect the
    calls recorded on those mocks.
    """
    tmpdir = str(tmpdir)
    mock_mkdir = mocker.patch('os.mkdir')
    mocker.patch('deepbgc.command.pipeline.logging.FileHandler')
    mock_seqio = mocker.patch(
        'deepbgc.command.pipeline.deepbgc.util.SequenceParser')

    record1 = SeqRecord('ABC')
    record2 = SeqRecord('DEF')

    # The parser mock returns itself from __enter__ and yields two records from parse()
    mock_seqio_instance = mock_seqio.return_value
    mock_seqio_instance.__enter__.return_value = mock_seqio_instance
    mock_seqio_instance.parse.return_value = [record1, record2]

    mock_annotator = mocker.patch('deepbgc.command.pipeline.DeepBGCAnnotator')
    mock_classifier = mocker.patch(
        'deepbgc.command.pipeline.DeepBGCClassifier')
    mock_detector = mocker.patch('deepbgc.command.pipeline.DeepBGCDetector')

    # Note: We are mocking classes imported in deepbgc.command.pipeline, not at their original location!
    writer_paths = [
        'deepbgc.command.pipeline.BGCRegionPlotWriter',
        'deepbgc.command.pipeline.ClusterTSVWriter',
        'deepbgc.command.pipeline.PfamScorePlotWriter',
        'deepbgc.command.pipeline.PfamTSVWriter',
        'deepbgc.command.pipeline.GenbankWriter',
        'deepbgc.command.pipeline.BGCGenbankWriter',
        'deepbgc.command.pipeline.ReadmeWriter',
    ]
    writers = [mocker.patch(path) for path in writer_paths]

    report_dir = os.path.join(tmpdir, 'report')
    report_tmp_dir = os.path.join(report_dir, 'tmp')

    try:
        run([
            'pipeline',
            '--output', report_dir,
            '--detector', 'mydetector',
            '--label', 'mylabel',
            '--score', '0.1',
            '--merge-max-protein-gap', '8',
            '--merge-max-nucl-gap', '9',
            '--min-nucl', '10',
            '--min-proteins', '20',
            '--min-domains', '30',
            '--min-bio-domains', '40',
            '--classifier', 'myclassifier1',
            '--classifier', 'myclassifier2',
            '--classifier-score', '0.2',
            'mySequence.gbk',
        ])
    finally:
        # Remove logging handlers to avoid affecting other tests. Done in a
        # ``finally`` so the cleanup also happens when run() raises.
        logger = logging.getLogger('')
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)

    mock_mkdir.assert_any_call(report_dir)
    mock_mkdir.assert_any_call(report_tmp_dir)

    mock_annotator.assert_called_with(tmp_dir_path=report_tmp_dir,
                                      prodigal_meta_mode=False)
    mock_classifier.assert_any_call(classifier='myclassifier1',
                                    score_threshold=0.2)
    mock_classifier.assert_any_call(classifier='myclassifier2',
                                    score_threshold=0.2)
    mock_detector.assert_called_with(detector='mydetector',
                                     label='mylabel',
                                     score_threshold=0.1,
                                     merge_max_protein_gap=8,
                                     merge_max_nucl_gap=9,
                                     min_nucl=10,
                                     min_proteins=20,
                                     min_domains=30,
                                     min_bio_domains=40)

    assert mock_annotator.return_value.run.call_count == 2  # Two records
    assert mock_detector.return_value.run.call_count == 2  # Two records
    # Two records for each of the two classifiers
    assert mock_classifier.return_value.run.call_count == 4

    mock_annotator.return_value.print_summary.assert_called_once_with()
    mock_detector.return_value.print_summary.assert_called_once_with()
    # For each of the two classifiers
    assert mock_classifier.return_value.print_summary.call_count == 2

    for writer in writers:
        assert writer.return_value.write.call_count == 2  # Two records
        writer.return_value.close.assert_called_once_with()
def test_unit_main_help():
    """Top-level ``--help`` must exit with status code 0."""
    cli_args = ['--help']
    with pytest.raises(SystemExit) as exit_info:
        run(cli_args)
    exit_code = exit_info.value.code
    assert exit_code == 0
def test_unit_main_invalid_command():
    """An unknown sub-command must exit with status code 2."""
    cli_args = ['invalid']
    with pytest.raises(SystemExit) as exit_info:
        run(cli_args)
    exit_code = exit_info.value.code
    assert exit_code == 2
def test_unit_prepare_help():
    """``prepare --help`` must exit with status code 0."""
    cli_args = ['prepare', '--help']
    with pytest.raises(SystemExit) as exit_info:
        run(cli_args)
    exit_code = exit_info.value.code
    assert exit_code == 0