Ejemplo n.º 1
0
def test_unit_train_classify(tmpdir):
    """Train a classifier through the ``train`` command and check its output.

    The model described by ``random_forest_test.json`` is trained on the
    BGC0000015 Pfam table with the matching class labels, written to
    ``model.pkl`` inside ``tmpdir``, loaded back and used to predict classes
    for every ``sequence_id`` group of the same Pfam table.
    """
    model_path = os.path.join(str(tmpdir), 'model.pkl')
    train_args = [
        'train',
        '--model', get_test_file('random_forest_test.json'),
        '--classes', get_test_file('BGC0000015.classes.csv'),
        '--output', model_path,
        get_test_file('BGC0000015.pfam.csv'),
    ]
    run(train_args)

    # The command must have produced the pickled model file.
    assert os.path.exists(model_path)

    classifier = SequenceModelWrapper.load(model_path)
    pfam_table = pd.read_csv(get_test_file('BGC0000015.pfam.csv'))

    # One sample (sub-frame) per sequence, in groupby order.
    samples = []
    for _, sequence_domains in pfam_table.groupby('sequence_id'):
        samples.append(sequence_domains)
    predicted = classifier.predict(samples)

    assert isinstance(predicted, pd.DataFrame)
    assert list(predicted.columns) == ['class1', 'class2', 'class3', 'class4']

    assert len(predicted.index) == 2

    # Expected per-class decisions at the 0.5 threshold, one row per sequence.
    expected_flags = [
        [True, False, True, False],
        [False, True, False, True],
    ]
    for row, flags in enumerate(expected_flags):
        assert list(predicted.iloc[row] > 0.5) == flags
Ejemplo n.º 2
0
def test_unit_train_detect(model, tmpdir):
    """Train a detector through the ``train`` command and check its scores.

    ``model`` is the name of a model config test file; it is supplied from
    outside this function (presumably by parametrization - the decorator is
    not visible here). The trained model is written to ``model.pkl`` inside
    ``tmpdir``, loaded back and applied to the positive and negative Pfam
    tables it was trained on.
    """
    model_path = os.path.join(str(tmpdir), 'model.pkl')
    train_args = [
        'train',
        '--model', get_test_file(model),
        '--config', 'PFAM2VEC', get_test_file('pfam2vec.test.tsv'),
        '--output', model_path,
        get_test_file('BGC0000015.pfam.csv'),
        get_test_file('negative.pfam.csv'),
    ]
    run(train_args)

    # The command must have produced the pickled model file.
    assert os.path.exists(model_path)

    detector = SequenceModelWrapper.load(model_path)

    positives = pd.read_csv(get_test_file('BGC0000015.pfam.csv'))
    negatives = pd.read_csv(get_test_file('negative.pfam.csv'))

    positive_scores = detector.predict(positives)
    negative_scores = detector.predict(negatives)

    # Predictions come back as one Series per input table...
    assert isinstance(positive_scores, pd.Series)
    assert isinstance(negative_scores, pd.Series)

    # ...indexed exactly like the input rows.
    assert positive_scores.index.equals(positives.index)
    assert negative_scores.index.equals(negatives.index)

    # On average the positive table scores above 0.5 and the negative below.
    assert positive_scores.mean() > 0.5
    assert negative_scores.mean() < 0.5
Ejemplo n.º 3
0
def test_integration_train_detect_fail_fasta():
    """Check that ``train`` raises NotImplementedError for a raw FASTA input."""
    # Should fail due to unprocessed input sequence
    with pytest.raises(NotImplementedError):
        train_args = [
            'train',
            '--model', get_test_file('clusterfinder_geneborder_test.json'),
            '--output', 'bar.pkl',
            get_test_file('BGC0000015.fa'),
        ]
        run(train_args)
Ejemplo n.º 4
0
def test_integration_pipeline_labelled(tmpdir):
    """Run ``pipeline`` on ``labelled.gbk`` and check the evaluation plots.

    The report is written to ``report`` inside ``tmpdir``; its ``evaluation``
    sub-directory must contain the BGC, score, ROC and PR plot files.
    """
    output_dir = os.path.join(str(tmpdir), 'report')
    run(['pipeline', '--output', output_dir, get_test_file('labelled.gbk')])

    produced = os.listdir(os.path.join(output_dir, 'evaluation'))
    # Print the directory listing so it shows up in the captured test output.
    for name in produced:
        print(name)

    expected_plots = (
        'report.bgc.png',
        'report.score.png',
        'report.roc.png',
        'report.pr.png',
    )
    for plot_name in expected_plots:
        assert plot_name in produced
def test_integration_prepare_default(tmpdir):
    """Run ``prepare`` on ``BGC0000015.fa`` and check both output files.

    The command writes a GenBank file and a TSV file into ``tmpdir``. The
    GenBank output is checked for the number of records and of protein / Pfam
    features per record; the TSV output is checked for the number of rows and
    distinct proteins per ``sequence_id``.
    """
    out_dir = str(tmpdir)
    gbk_path = os.path.join(out_dir, 'outfile.gbk')
    tsv_path = os.path.join(out_dir, 'outfile.tsv')
    prepare_args = [
        'prepare',
        '--output-gbk', gbk_path,
        '--output-tsv', tsv_path,
        get_test_file('BGC0000015.fa'),
    ]
    run(prepare_args)

    # --- GenBank output ---
    gbk_records = list(SeqIO.parse(gbk_path, 'genbank'))
    assert len(gbk_records) == 2

    first_record = gbk_records[0]
    assert_sorted_features(first_record)
    first_proteins = util.get_protein_features(first_record)
    first_pfams = util.get_pfam_features(first_record)

    assert len(first_proteins) == 18
    # Print the protein IDs so they show up in the captured test output.
    print([util.get_protein_id(feature) for feature in first_proteins])
    assert len(first_pfams) == 111

    second_record = gbk_records[1]
    assert_sorted_features(second_record)
    second_proteins = util.get_protein_features(second_record)
    second_pfams = util.get_pfam_features(second_record)

    assert len(second_proteins) == 27
    assert len(second_pfams) == 36

    # --- TSV output ---
    pfam_table = pd.read_csv(tsv_path, sep='\t')
    by_sequence = pfam_table.groupby('sequence_id')

    assert len(by_sequence) == 2

    first_rows = by_sequence.get_group('BGC0000015.1')
    first_protein_ids = first_rows['protein_id'].unique()
    print(first_protein_ids)
    # some of the proteins do not have any Pfam domains so they are not present
    assert len(first_protein_ids) == 17
    assert len(first_rows) == 111

    second_rows = by_sequence.get_group('BGC0000015.2')
    second_protein_ids = second_rows['protein_id'].unique()
    # some of the proteins do not have any Pfam domains so they are not present
    assert len(second_protein_ids) == 11
    assert len(second_rows) == 36
Ejemplo n.º 6
0
def test_integration_pipeline_default(tmpdir, input_file):
    """Run ``pipeline`` with default options and check the generated report.

    ``input_file`` is the name of a test input file; it is supplied from
    outside this function (presumably by parametrization - the decorator is
    not visible here). The report is written to ``report`` inside ``tmpdir``.
    """
    output_dir = os.path.join(str(tmpdir), 'report')
    run(['pipeline', '--output', output_dir, get_test_file(input_file)])

    # Top-level report files.
    report_files = os.listdir(output_dir)
    for name in report_files:
        print(name)

    expected_reports = (
        'README.txt',
        'report.bgc.gbk',
        'report.bgc.tsv',
        'report.full.gbk',
        'report.pfam.tsv',
    )
    for report_name in expected_reports:
        assert report_name in report_files

    # Plots in the evaluation sub-directory.
    plot_files = os.listdir(os.path.join(output_dir, 'evaluation'))
    for name in plot_files:
        print(name)

    for plot_name in ('report.bgc.png', 'report.score.png'):
        assert plot_name in plot_files

    # The full GenBank output holds two records, each with a cluster feature.
    full_gbk_path = os.path.join(output_dir, 'report.full.gbk')
    full_records = list(SeqIO.parse(full_gbk_path, 'genbank'))
    assert len(full_records) == 2

    for position in (0, 1):
        clusters = util.get_cluster_features(full_records[position])
        assert len(clusters) >= 1

    # The BGC-only GenBank output holds at least two cluster records.
    bgc_gbk_path = os.path.join(output_dir, 'report.bgc.gbk')
    bgc_records = list(SeqIO.parse(bgc_gbk_path, 'genbank'))
    assert len(bgc_records) >= 2
Ejemplo n.º 7
0
def test_unit_pipeline_default(tmpdir, mocker):
    """Run ``pipeline`` with every collaborator mocked and verify the wiring.

    Directory creation, the log file handler, the sequence parser, the
    annotator / detector / classifier steps and all writers are patched in
    ``deepbgc.command.pipeline``. The test then checks that the command line
    options are forwarded to the step constructors and that each step and
    writer is invoked once per parsed record.
    """
    tmpdir = str(tmpdir)
    mocker.patch('os.mkdir')
    mocker.patch('deepbgc.command.pipeline.logging.FileHandler')
    mock_seqio = mocker.patch(
        'deepbgc.command.pipeline.deepbgc.util.SequenceParser')

    # The mocked parser acts as its own context manager and yields two records.
    record1 = SeqRecord('ABC')
    record2 = SeqRecord('DEF')
    mock_seqio_instance = mock_seqio.return_value
    mock_seqio_instance.__enter__.return_value = mock_seqio_instance
    mock_seqio_instance.parse.return_value = [record1, record2]

    mock_annotator = mocker.patch('deepbgc.command.pipeline.DeepBGCAnnotator')
    mock_classifier = mocker.patch(
        'deepbgc.command.pipeline.DeepBGCClassifier')
    mock_detector = mocker.patch('deepbgc.command.pipeline.DeepBGCDetector')

    writer_paths = [
        'deepbgc.command.pipeline.BGCRegionPlotWriter',
        'deepbgc.command.pipeline.ClusterTSVWriter',
        'deepbgc.command.pipeline.PfamScorePlotWriter',
        'deepbgc.command.pipeline.PfamTSVWriter',
        'deepbgc.command.pipeline.GenbankWriter',
        'deepbgc.command.pipeline.BGCGenbankWriter',
        'deepbgc.command.pipeline.ReadmeWriter'
        # Note: We are mocking classes imported in deepbgc.command.pipeline, not at their original location!
    ]
    writers = [mocker.patch(path) for path in writer_paths]

    report_dir = os.path.join(tmpdir, 'report')
    report_tmp_dir = os.path.join(report_dir, 'tmp')
    try:
        run([
            'pipeline', '--output', report_dir, '--detector', 'mydetector',
            '--label', 'mylabel', '--score', '0.1', '--merge-max-protein-gap', '8',
            '--merge-max-nucl-gap', '9', '--min-nucl', '10', '--min-proteins',
            '20', '--min-domains', '30', '--min-bio-domains', '40', '--classifier',
            'myclassifier1', '--classifier', 'myclassifier2', '--classifier-score',
            '0.2', 'mySequence.gbk'
        ])
    finally:
        # Remove logging handlers to avoid affecting other tests. This runs in
        # a ``finally`` so the handlers are also removed when ``run`` raises.
        logger = logging.getLogger('')
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)

    # Both the report directory and its tmp sub-directory must be created.
    os.mkdir.assert_any_call(report_dir)
    os.mkdir.assert_any_call(report_tmp_dir)

    # Command line options must be forwarded to the step constructors.
    mock_annotator.assert_called_with(tmp_dir_path=report_tmp_dir,
                                      prodigal_meta_mode=False)
    mock_classifier.assert_any_call(classifier='myclassifier1',
                                    score_threshold=0.2)
    mock_classifier.assert_any_call(classifier='myclassifier2',
                                    score_threshold=0.2)
    mock_detector.assert_called_with(detector='mydetector',
                                     label='mylabel',
                                     score_threshold=0.1,
                                     merge_max_protein_gap=8,
                                     merge_max_nucl_gap=9,
                                     min_nucl=10,
                                     min_proteins=20,
                                     min_domains=30,
                                     min_bio_domains=40)

    assert mock_annotator.return_value.run.call_count == 2  # Two records
    assert mock_detector.return_value.run.call_count == 2  # Two records
    assert mock_classifier.return_value.run.call_count == 4  # Two records for each of the two classifiers

    mock_annotator.return_value.print_summary.assert_called_once_with()
    mock_detector.return_value.print_summary.assert_called_once_with()
    assert mock_classifier.return_value.print_summary.call_count == 2  # For each of the two classifiers

    for writer in writers:
        assert writer.return_value.write.call_count == 2  # Two records
        writer.return_value.close.assert_called_once_with()
Ejemplo n.º 8
0
def test_unit_main_help():
    """Check that ``--help`` exits with status code 0."""
    with pytest.raises(SystemExit) as exit_info:
        run(['--help'])
    exit_code = exit_info.value.code
    assert exit_code == 0
Ejemplo n.º 9
0
def test_unit_main_invalid_command():
    """Check that an unknown sub-command exits with status code 2."""
    with pytest.raises(SystemExit) as exit_info:
        run(['invalid'])
    exit_code = exit_info.value.code
    assert exit_code == 2
Ejemplo n.º 10
0
def test_unit_prepare_help():
    """Check that ``prepare --help`` exits with status code 0."""
    with pytest.raises(SystemExit) as exit_info:
        run(['prepare', '--help'])
    exit_code = exit_info.value.code
    assert exit_code == 0