Esempio n. 1
0
def test_prepare(capsys):
    """FeatureExtractor.prepare must yield identical results whether the
    documents are passed in memory or loaded from a pickle file, with and
    without a synonyms file, in both training and non-training mode."""
    text_field = 'text field'
    class_field = 'class field'
    quantity = 2
    fields = {text_field: 'Teste value.', class_field: 'c1'}
    # Pre-analyzed sentences; the PUNCT token is expected to be dropped.
    analyzed_sentences = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * quantity
    }
    docs1 = [
        Document(index=0, fields=fields,
                 analyzed_sentences=analyzed_sentences),
        Document(index=1, fields=fields, analyzed_sentences=None),
    ]
    synonyms_files = [None, 'contopt_0.1_r2_c0.0.txt']
    # With the ContoPT synonyms file, 'teste' is expected to become 'prova'.
    expected_corpus_str = [[' '.join(['teste value'] * quantity), ''],
                           [' '.join(['prova value'] * quantity), '']]
    expected_classifications = [[fields[class_field]] * quantity
                                ] * len(synonyms_files)
    # Document 1 has no analyzed sentences, so it must be flagged for removal.
    expected_idxs_to_remove = [[1]] * len(synonyms_files)
    expected_corpus = [[['teste', 'value'] * quantity, []],
                       [['prova', 'value'] * quantity, []]]
    # Obtain the temporary filename BEFORE entering the try block: if this
    # call raised inside it, the finally clause would hit a NameError on the
    # unassigned name and mask the original exception.
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        for i, synonyms_file in enumerate(synonyms_files):
            ft = FeatureExtractor(synonyms_file=synonyms_file)
            for training_mode in [True, False]:
                corpus_str1, classifications1, idxs_to_remove1, corpus1 = ft.prepare(
                    text_field, class_field, None, docs1, training_mode)
                corpus_str2, classifications2, idxs_to_remove2, corpus2 = ft.prepare(
                    text_field, class_field, filename, None, training_mode)
                # In-memory documents and pickled documents must agree.
                assert (corpus_str1, classifications1, idxs_to_remove1,
                        corpus1) == (corpus_str2, classifications2,
                                     idxs_to_remove2, corpus2)
                assert corpus_str1 == expected_corpus_str[i]
                assert classifications1 == expected_classifications[i]
                assert idxs_to_remove1 == expected_idxs_to_remove[i]
                assert corpus1 == expected_corpus[i]
                captured = capsys.readouterr()
                assert captured.out == ''
                # Only the progress bar should be written, and only to stderr.
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Preparing to create classification: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
            if synonyms_file is not None:
                remove_and_check(synonyms_file)
    finally:
        remove_and_check(filename)
Esempio n. 2
0
def test__generate_file():
    """pickle_manager._generate_file must create a file on disk whose
    creation time is (approximately) the current time."""
    # Create the file BEFORE the try block: if _generate_file raised inside
    # it, there would be nothing to clean up and the finally clause would
    # raise NameError on the unassigned name, masking the original error.
    filename = pickle_manager._generate_file()
    try:
        assert exists(filename)
        assert pytest.approx(getctime(filename)) == pytest.approx(time())
    finally:
        remove_and_check(filename)
Esempio n. 3
0
def test_dump_documents(capsys):
    """dump_documents must refuse to overwrite an existing file, and must
    round-trip documents through a new file (with a single 'total' metadata
    entry) while reporting progress on stderr only."""
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    # Create the temporary file BEFORE the try block so the finally clause
    # cannot raise NameError if the creation itself fails.
    path = create_temporary_file(content=None, text=False)
    try:
        # Dumping over an already-existing file must raise.
        with pytest.raises(Exception):
            pickle_manager.dump_documents(docs1, path)
    finally:
        remove_and_check(path)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
        docs2 = list(pickle_manager.get_documents(filename))
        assert len(metadata) == 1
        assert metadata['total'] == len(docs1)
        # zip_longest also detects a length mismatch (extra item pairs with
        # None, whose repr cannot match).
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
    finally:
        remove_and_check(filename)
    captured = capsys.readouterr()
    assert captured.out == ''
    assert captured.err[captured.err.rfind('\r') +
                        1:].startswith('Storing documents: 100%|')
    assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
        's/doc]\n')
Esempio n. 4
0
def test_check_data(capsys):
    """check_data must accept a valid documents file and raise an
    AssertionError for files whose contents disagree with their metadata."""
    data_frame = read_excel(example_excel_file)
    documents = data_frame_to_document_list(data_frame)
    filename = generate_available_filename()
    try:
        # A freshly dumped file passes the check and only prints a progress
        # bar to stderr.
        pickle_manager.dump_documents(documents, filename)
        pickle_manager.check_data(filename)
        captured = capsys.readouterr()
        assert captured.out == ''
        progress_line = captured.err[captured.err.rfind('\r') + 1:]
        assert progress_line.startswith('Checking data: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
            's/doc]\n')
        # Overwrite the file with non-Document payloads: must be rejected.
        n_items = 10
        appender = pickle_manager.PickleDumpAppend({'total': n_items},
                                                   filename)
        for item in range(n_items):
            appender.dump_append(item)
        appender.close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
        # A negative document count in the metadata must also be rejected.
        pickle_manager.PickleDumpAppend({'total': -1}, filename).close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
    finally:
        remove_and_check(filename)
def test_generate_report():
    """generate_report must append one row per call to the report data frame
    and keep the Excel file on disk consistent with the returned frame."""
    execution_info = pd.DataFrame.from_dict({
        'Start': [functions.get_local_time_str()],
        'End': [functions.get_local_time_str()],
    })
    parameters_dict = Parameters(utils.config_file).__dict__
    predictions_dict = {
        'y_true': ['label1'],
        'classifier_key': [{'label1': 0.0, 'label2': 1.0}],
    }
    # Build the expected single report row with a scalar value, then restore
    # the set form used by generate_report itself.
    parameters_dict['set_num_accepted_probs'] = 1
    expected_df_row0 = pd.concat([
        execution_info,
        functions.parameters_to_data_frame(parameters_dict),
        functions.predictions_to_data_frame(predictions_dict, 1),
    ], axis=1)
    parameters_dict['set_num_accepted_probs'] = {1}
    excel_file1 = utils.generate_available_filename(ext='.xlsx')
    excel_file2 = utils.generate_available_filename(ext='.xlsx')
    expected_df = pd.DataFrame()
    try:
        # First iteration creates the report file; second appends to it.
        for i, file_exists in enumerate([False, True]):
            assert exists(excel_file1) is file_exists
            df = functions.generate_report(execution_info, parameters_dict, predictions_dict, excel_file1)
            df.to_excel(excel_file2, index=False)
            assert df.shape == (i + 1, 44)
            expected_df = pd.concat([expected_df, expected_df_row0])
            # pd.testing replaces the deprecated (and, in pandas >= 2.0,
            # removed) pd.util.testing module.
            pd.testing.assert_frame_equal(df, expected_df)
            pd.testing.assert_frame_equal(pd.read_excel(excel_file1), pd.read_excel(excel_file2))
    finally:
        utils.remove_and_check(excel_file1)
        utils.remove_and_check(excel_file2)
Esempio n. 6
0
def test_dump_and_load():
    """pickle_manager.dump followed by load must round-trip an object."""
    obj1 = random()
    # Create the temporary file BEFORE the try block so the finally clause
    # cannot raise NameError if the creation itself fails.
    path = create_temporary_file(content=None, text=False)
    try:
        pickle_manager.dump(obj1, path)
        obj2 = pickle_manager.load(path)
    finally:
        remove_and_check(path)
    assert obj1 == obj2
Esempio n. 7
0
def test_main(monkeypatch):
    """End-to-end check of prediction_server.main: port validation, module
    globals populated from the configuration, signal-handler bookkeeping,
    and server shutdown on stop signals.

    The monkeypatched serve_forever prevents main() from blocking.
    """
    parameters = Parameters(utils.config_file)
    # Port 1024 must abort with SystemExit — presumably 1025 is the lowest
    # accepted port (TODO confirm against prediction_server.main).
    with pytest.raises(SystemExit):
        prediction_server.main(parameters, 1024)
    with monkeypatch.context() as m:
        # Stub out the blocking call so main() returns immediately.
        m.setattr("gevent.pywsgi.WSGIServer.serve_forever",
                  lambda stop_timeout: None)
        try:
            vectorizer_file = 'vectorizer.pkl'
            dump(
                FeatureExtractor(vectorizer_name='TfidfVectorizer').vectorizer,
                vectorizer_file)
            # Pre-conditions: clean module-level state.
            assert prediction_server._old_handlers == dict()
            assert prediction_server.logger.disabled is False
            prediction_server.main(parameters, 1025)
            # main() disables its logger and fills the module globals from
            # the configuration file.
            assert prediction_server.logger.disabled is True
            assert prediction_server._text_field == 'Example column'
            assert prediction_server._class_field == 'Classification column'
            assert prediction_server._preprocessor.mosestokenizer_language_code == 'en'
            assert prediction_server._preprocessor.store_data is False
            assert prediction_server._preprocessor.spell_checker is None
            #assert prediction_server._preprocessor.spell_checker.hunspell.max_threads == cpu_count()
            assert len(prediction_server._feature_extractor.stop_words) > 0
            assert prediction_server._feature_extractor.feature_reduction is None
            assert prediction_server._feature_extractor.document_adjustment_code.__file__ == abspath(
                'text_categorizer/document_updater.py')
            assert prediction_server._feature_extractor.synonyms is None
            assert prediction_server._feature_extractor.vectorizer_file == vectorizer_file
            assert prediction_server._feature_extractor.n_jobs == cpu_count()
            # Handlers are restored after main() returns...
            assert prediction_server._old_handlers == dict()
            # ...unless restoration is disabled, in which case the installed
            # handler must remain recorded.
            m.setattr(
                "text_categorizer.prediction_server._reset_signal_handlers",
                lambda: None)
            prediction_server.main(parameters, 1025)
            assert len(prediction_server._old_handlers) == 1
            prediction_server._old_handlers.clear()
            assert type(prediction_server.app.wsgi_app) is WSGIServer
            assert prediction_server.app.wsgi_app.started is False
            assert prediction_server.app.wsgi_app.closed is True
            # Every configured stop signal must close the running server.
            for sig in constants.stop_signals:
                prediction_server.app.wsgi_app.start()
                assert prediction_server.app.wsgi_app.started is True
                prediction_server._signal_handler(sig=sig, frame=None)
                assert prediction_server.app.wsgi_app.closed is True
            # Repeated stop signals must behave the same way.
            for sig in constants.stop_signals * 2:
                prediction_server.app.wsgi_app.start()
                assert prediction_server.app.wsgi_app.started is True
                prediction_server._signal_handler(sig=sig, frame=None)
                assert prediction_server.app.wsgi_app.closed is True
            # A non-stop signal (SIGILL) must leave the server running.
            for sig in [signal.SIGILL]:
                assert sig not in constants.stop_signals
                prediction_server.app.wsgi_app.start()
                assert prediction_server.app.wsgi_app.started is True
                prediction_server._signal_handler(sig=sig, frame=None)
                assert prediction_server.app.wsgi_app.closed is False
        finally:
            utils.remove_and_check(vectorizer_file)
Esempio n. 8
0
def test_dump_json():
    """trainer.dump_json must write JSON that json.load reads back equal."""
    d1 = {'test_random_values': [np.random.random()]}
    filename = utils.generate_available_filename()
    try:
        trainer.dump_json(d1, filename)
        # A context manager guarantees the handle is closed and removes the
        # NameError the old 'finally: f.close()' raised when dump_json
        # failed before the file was ever opened.
        with open(filename, 'r') as f:
            d2 = json.load(f)
    finally:
        utils.remove_and_check(filename)
    assert d1 == d2
Esempio n. 9
0
def test_get_docs_metadata():
    """get_docs_metadata must return a dict whose single entry is the number
    of stored documents."""
    data_frame = read_excel(example_excel_file)
    documents = data_frame_to_document_list(data_frame)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(documents, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
    finally:
        remove_and_check(filename)
    assert type(metadata) is dict
    assert len(metadata) == 1
    assert metadata['total'] == len(documents)
Esempio n. 10
0
def test_generate_roc_plot():
    """generate_roc_plot must create the target image for binary and
    multi-class problems, for every classifier factory in clfs."""
    filename = '%s.png' % (generate_available_filename())
    for n_class in [2, 10]:
        X_test, y_test = load_digits(n_class=n_class, return_X_y=True)
        for factory in clfs:
            model = factory(n_jobs=1, class_weight=None)
            model.fit(X_test, y_test)
            # The plot file must not exist before and must exist after.
            assert not exists(filename)
            try:
                classifiers.generate_roc_plot(model, X_test, y_test, filename)
                assert exists(filename)
            finally:
                remove_and_check(filename)
Esempio n. 11
0
def test_PickleDumpAppend_close():
    """close() must close and delete the temporary file, whether or not the
    destination file already exists."""
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        # First pass: destination absent; second pass: destination present.
        for should_exist in [False, True]:
            assert exists(filename) == should_exist
            appender = pickle_manager.PickleDumpAppend(metadata, filename)
            assert not appender.file.closed
            assert exists(appender.file.name)
            appender.close()
            assert appender.file.closed
            assert not exists(appender.file.name)
    finally:
        remove_and_check(filename)
def test___init__():
    """Constructing a ContoPTParser must create the synonyms file and expose
    the same mapping that _load_synonyms reads back from it."""
    path = generate_available_filename()
    try:
        assert not exists(path)
        parser = ContoPTParser(path)
        assert exists(path)
        loaded = ContoPTParser._load_synonyms(path)
    finally:
        remove_and_check(path)
    assert type(parser.synonyms) is dict
    assert len(parser.synonyms) > 0
    assert parser.synonyms == loaded
    # Spot-check a few known ContoPT synonym mappings.
    assert loaded['adjudicatário'] == 'adjudicante'
    assert loaded['melancolia'] == 'misantropia'
    assert loaded['tristeza'] == 'misantropia'
Esempio n. 13
0
def test_PickleDumpAppend_dump_append():
    """dump_append must store values after the metadata, in insertion order,
    each as a separate pickle record."""
    count = 10
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        pda = pickle_manager.PickleDumpAppend(metadata, filename)
        for i in range(count):
            pda.dump_append(i)
        pda.close()
        # A context manager guarantees the handle is closed and removes the
        # NameError the old 'finally: input_file.close()' raised when an
        # earlier statement failed before the file was opened.
        with open(filename, 'rb') as input_file:
            assert load(input_file) == metadata
            for i in range(count):
                data = load(input_file)
                assert data == i
    finally:
        remove_and_check(filename)
Esempio n. 14
0
def test_main():
    """End-to-end check of trainer.main inside a scratch working directory:
    invalid input files abort, a normal run produces the preprocessed data,
    predictions and report files, the 20newsgroups path exports its Excel
    file, and final-training mode produces no predictions/report."""
    old_dir = os.getcwd()
    new_dir = utils.generate_available_filename()
    base_parameters = Parameters(utils.config_file)
    # Keep only the basename so outputs land in the scratch directory.
    base_parameters.preprocessed_data_file = os.path.basename(base_parameters.preprocessed_data_file)
    try:
        os.makedirs(new_dir, exist_ok=False)
        os.chdir(new_dir)
        # Invalid input files must abort with SystemExit.
        parameters = deepcopy(base_parameters)
        parameters.excel_file = "invalid_excel_file"
        parameters.preprocessed_data_file = "invalid_data_file"
        with pytest.raises(SystemExit):
            trainer.main(parameters)
        # A normal run must produce the three output files.
        parameters = deepcopy(base_parameters)
        assert not os.path.exists(parameters.preprocessed_data_file)
        try:
            trainer.main(parameters)
            assert os.path.exists(parameters.preprocessed_data_file)
            assert os.path.exists("predictions.json")
            assert os.path.exists("report.xlsx")
        finally:
            utils.remove_and_check(parameters.preprocessed_data_file)
            utils.remove_and_check("predictions.json")
            utils.remove_and_check("report.xlsx")
        # The 20newsgroups path exports its Excel file even though the run
        # afterwards aborts (preprocess_data is disabled).
        parameters.excel_file = os.path.abspath("20newsgroups")
        parameters.preprocess_data = False
        excel_file_20newsgroups = "20newsgroups.xlsx"
        assert not os.path.exists(excel_file_20newsgroups)
        try:
            trainer.main(parameters)
            pytest.fail()
        except SystemExit:
            assert os.path.exists(excel_file_20newsgroups)
        finally:
            utils.remove_and_check(excel_file_20newsgroups)
        # Final-training mode must not generate predictions or a report.
        parameters = deepcopy(base_parameters)
        parameters.final_training = True
        try:
            trainer.main(parameters)
        finally:
            assert not os.path.exists("predictions.json")
            assert not os.path.exists("report.xlsx")
            utils.remove_and_check(parameters.preprocessed_data_file)
    finally:
        os.chdir(old_dir)
        rmtree(new_dir)
Esempio n. 15
0
def test_get_documents(capsys):
    """get_documents must round-trip documents and label its progress bar
    with the optional description."""
    data_frame = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(data_frame)
    filename = generate_available_filename()
    try:
        dump_documents(docs1, filename)
        cases = [(None, '100%|'),
                 ('Loading documents', 'Loading documents: 100%|')]
        for description, expected_prefix in cases:
            docs2 = list(get_documents(filename, description=description))
            for stored, reloaded in zip_longest(docs1, docs2):
                assert repr(stored) == repr(reloaded)
            captured = capsys.readouterr()
            assert captured.out == ''
            # Only the final tqdm line (after the last carriage return) is
            # checked against the expected prefix.
            last_line = captured.err[captured.err.rfind('\r') + 1:]
            assert last_line.startswith(expected_prefix)
            assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
                's/doc]\n')
    finally:
        remove_and_check(filename)
Esempio n. 16
0
def test_PickleDumpAppend___init__():
    """PickleDumpAppend must require a dict metadata and a str filename,
    write the metadata first, and replace its temporary file with the target
    file on close."""
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        # Wrong argument types must be rejected up front.
        bad_metadata = 'test_str'
        bad_filename = -1
        for args in [[bad_metadata, filename], [metadata, bad_filename]]:
            with pytest.raises(AssertionError):
                pda = pickle_manager.PickleDumpAppend(args[0], args[1])
        pda = pickle_manager.PickleDumpAppend(metadata, filename)
        assert pda.filename_upon_completion == filename
        assert exists(pda.file.name)
        pda.close()
        # close() promotes the temporary file to the final filename.
        assert pickle_manager.load(filename) == metadata
        assert not exists(pda.file.name)
        assert exists(filename)
    finally:
        remove_and_check(filename)
def test_predictions_to_data_frame():
    """predictions_to_data_frame must compute per-classifier metrics from a
    predictions dict, counting a prediction as correct when the true label is
    among the n most probable labels."""
    predictions_dict1 = {
        'y_true': ['0', '1', '0', '1', '0'],
        'RandomForestClassifier': [{'0': 1., '1': 0.}, {'0': 1., '1': 0.}, \
            {'0': 0., '1': 1.}, {'0': 0., '1': 1.}, {'0': 1., '1': 0.}],
        'LinearSVC': [{'0': 1., '1': 0.}, {'0': 0., '1': 1.}, \
            {'0': 1., '1': 0.}, {'0': 0., '1': 1.}, {'0': 0., '1': 1.}]
    }
    columns = [
        '%s %s %s' % (metric, clf, label)
        for metric in ['f1-score', 'precision', 'recall', 'support']
        for clf in ['LinearSVC', 'RandomForestClassifier']
        for label in [0, 1, 'macro avg', 'micro avg', 'weighted avg']
    ]
    # Expected metric values for n=1 (exact top-1 match required).
    data1 = [
        0.8, 0.8, 0.8, 0.8000000000000002, 0.8,
        0.6666666666666666, 0.5, 0.5833333333333333, 0.6, 0.6,
        1.0, 0.6666666666666666, 0.8333333333333333, 0.8, 0.8666666666666666,
        0.6666666666666666, 0.5, 0.5833333333333333, 0.6, 0.6,
        0.6666666666666666, 1.0, 0.8333333333333333, 0.8, 0.8,
        0.6666666666666666, 0.5, 0.5833333333333333, 0.6, 0.6,
        3, 2, 5, 5, 5,
        3, 2, 5, 5, 5
    ]
    # With n=2 (both labels accepted) every score becomes perfect; only the
    # support columns keep their original values.
    data2 = data1.copy()
    data2[0:30] = [1.] * 30
    expected_df1 = pd.DataFrame(data=[data1], columns=columns)
    expected_df2 = pd.DataFrame(data=[data2], columns=columns)
    # Create the temporary file BEFORE the try block so the finally clause
    # cannot raise NameError if the creation itself fails.
    path = utils.create_temporary_file(content=None, text=True)
    try:
        # Round-trip through JSON to prove the dict serializes losslessly.
        trainer.dump_json(predictions_dict1, path)
        with open(path, 'r') as f:
            predictions_dict2 = json.load(f)
    finally:
        utils.remove_and_check(path)
    df1 = functions.predictions_to_data_frame(predictions_dict2, 1)
    df2 = functions.predictions_to_data_frame(predictions_dict2, 2)
    assert predictions_dict1 == predictions_dict2
    # pd.testing replaces the deprecated (and, in pandas >= 2.0, removed)
    # pd.util.testing module.
    pd.testing.assert_frame_equal(df1, expected_df1)
    pd.testing.assert_frame_equal(df2, expected_df2)
Esempio n. 18
0
def test_load_20newsgroups():
    """load_20newsgroups must export the dataset to an Excel file once and
    return an updated copy of the parameters, reusing the file afterwards."""
    params = Parameters(utils.config_file)
    params.excel_file = '20newsgroups'
    excel_file = utils.generate_available_filename('.xlsx')
    try:
        updated = trainer.load_20newsgroups(params, excel_file)
        # A modified copy is returned; the input object is not reused.
        assert params is not updated
        assert params != updated
        assert updated.excel_column_with_text_data == 'data'
        assert updated.excel_column_with_classification_data == 'target'
        assert os.path.exists(excel_file)
        exported = pd.read_excel(excel_file)
        assert exported.shape == (18846, 3)
        assert list(exported.keys()) == ['Unnamed: 0', 'data', 'target']
        # A second call must reuse the existing file instead of rewriting it.
        expected_mtime = os.path.getmtime(excel_file)
        again = trainer.load_20newsgroups(params, excel_file)
        assert os.path.getmtime(excel_file) == expected_mtime
        assert again.__dict__ == updated.__dict__
    finally:
        utils.remove_and_check('20news-bydate_py3.pkz')
        utils.remove_and_check(excel_file)
Esempio n. 19
0
def test_set_docs_metadata(capsys):
    """set_docs_metadata must replace the metadata block without altering
    the stored documents."""
    data_frame = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(data_frame)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        original_metadata = pickle_manager.get_docs_metadata(filename)
        new_metadata = original_metadata.copy()
        new_metadata['new_field'] = 'test_field_value'
        assert original_metadata != new_metadata
        pickle_manager.set_docs_metadata(new_metadata, filename)
        assert pickle_manager.get_docs_metadata(filename) == new_metadata
        # The documents themselves must be untouched by the metadata update.
        docs2 = list(pickle_manager.get_documents(filename))
        for stored, reloaded in zip_longest(docs1, docs2):
            assert repr(stored) == repr(reloaded)
        captured = capsys.readouterr()
        assert captured.out == ''
        last_line = captured.err[captured.err.rfind('\r') + 1:]
        assert last_line == 'Storing subsets: 0MB [00:00, ?MB/s]\n'
    finally:
        remove_and_check(filename)
Esempio n. 20
0
def test_LatentDirichletAllocation():
    """LatentDirichletAllocation must reduce X to 10 topic features, keep y
    unchanged, and reuse a previously saved model instead of refitting."""
    X = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    y = np.asarray([0, 1, 2])
    # Each row concentrates almost all probability mass on topic 1, with the
    # remainder spread evenly across the other nine topics.
    expected_new_X = np.asarray(
        [[low, high] + [low] * 8
         for low, high in [(0.01428573, 0.87142845),
                           (0.00625001, 0.94374995),
                           (0.00400000, 0.96399997)]])
    filename = generate_available_filename()
    assert not exists(filename)
    try:
        new_X1, new_y1 = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        assert exists(filename)
        assert new_X1.shape == (X.shape[0], 10)
        assert np.allclose(expected_new_X, new_X1)
        assert np.array_equal(y, new_y1)
        # A second call must load the stored model: the file is not
        # rewritten and the outputs are identical.
        mtime = getmtime(filename)
        new_X2, new_y2 = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        assert getmtime(filename) == mtime
        assert np.array_equal(new_X1, new_X2)
        assert np.array_equal(new_y1, new_y2)
    finally:
        remove_and_check(filename)
Esempio n. 21
0
def test_get_documents():
    """get_documents must round-trip the stored documents and raise when the
    file contains more pickled objects than its metadata declares."""
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        docs2 = list(pickle_manager.get_documents(filename))
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
        try:
            # Append a spurious pickled object so the file no longer matches
            # its metadata; the context manager guarantees the handle is
            # closed even if the pickling itself fails.
            with open(filename, 'ab') as f:
                dump(obj=0, file=f, protocol=pickle_manager._pickle_protocol)
            docs2 = list(pickle_manager.get_documents(filename))
            for doc1, doc2 in zip_longest(docs1, docs2):
                assert repr(doc1) == repr(doc2)
            pytest.fail()
        except Exception as e:
            # The exact error message pins down the failure mode.
            assert len(e.args) == 1
            assert e.args[
                0] == "The file '%s' has more documents than indicated in the metadata." % (
                    filename)
    finally:
        remove_and_check(filename)
Esempio n. 22
0
def test_generate_X_y(capsys):
    """generate_X_y must vectorize the corpus under every supported
    vectorizer / feature-reduction combination, persisting the fitted
    vectorizer to disk only for the vectorizers that support it, and must
    reject an unknown feature-reduction name."""
    quantity = 2
    corpus = ['Test lemma 1 . ' * quantity, 'Test lemma 2 . ' * quantity]
    classifications = [1, 2]
    filename = generate_available_filename()
    # Expected stdout emitted by the DocumentPoolEmbeddings runs.
    dpe_out = 'Please, ignore the message above indicating that the sentence is too long. The problem has been solved.\n' * 6
    # (vectorizer name, feature reduction, vectorizer file expected on disk,
    # expected stdout).
    combinations = [
        ('CountVectorizer', None, True, ''),
        ('CountVectorizer', 'LDA', True, ''),
        ('CountVectorizer', 'MDS', True, ''),
        ('HashingVectorizer', None, True, ''),
        ('HashingVectorizer', 'MDS', True, ''),
        ('TfidfVectorizer', None, True, ''),
        ('TfidfVectorizer', 'LDA', True, ''),
        ('TfidfVectorizer', 'MDS', True, ''),
        ('DocumentPoolEmbeddings', None, False, dpe_out),
        ('DocumentPoolEmbeddings', 'MDS', False, dpe_out),
    ]
    for vectorizer, fr, expect_file, expected_out in combinations:
        try:
            ft = FeatureExtractor(vectorizer_name=vectorizer,
                                  feature_reduction=fr,
                                  vectorizer_file=filename)
            for training_mode in [True, False]:
                # Training mode creates the vectorizer file (when supported);
                # prediction mode loads the one created just before.
                assert exists(filename) is (not training_mode and expect_file)
                _X, y = ft.generate_X_y(corpus, classifications, training_mode)
                assert exists(filename) is expect_file
                assert y == classifications
                captured = capsys.readouterr()
                assert captured.out == expected_out
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Extracting features: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
        finally:
            if expect_file:
                remove_and_check(filename)
            if fr == 'LDA':
                remove_and_check('LatentDirichletAllocation.pkl')
    # An unknown feature reduction must be rejected.
    with pytest.raises(ValueError):
        FeatureExtractor(feature_reduction='invalid',
                         vectorizer_file=filename).generate_X_y(
                             corpus, classifications)
    remove_and_check(filename)
Esempio n. 23
0
def test_Pipeline_start():
    """classifiers.Pipeline.start must train every classifier, persist the
    fitted models and their probability predictions, honor the n_jobs /
    class_weight / ROC-plot options, tolerate a failing classifier, and
    validate its X/y arguments."""
    # Expected per-classifier probabilities for the first two test samples.
    # Two reference sets are kept because floating-point results differ
    # slightly between Linux and Windows builds.
    predict_probas_linux = {
        'RandomForestClassifier': [
            [1.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
        ],
        'BernoulliNB': [
            [1.0, 5.9253907982022474e-18, 9.24592247679012e-21],
            [5.086117678607322e-14, 0.9999999417850541, 5.821489476394197e-08],
        ],
        'MultinomialNB': [
            [1.0, 3.987155612430403e-87, 1.9843977254102716e-103],
            [1.1638109881136655e-141, 1.0, 4.902906597402722e-42],
        ],
        'ComplementNB': [
            [1.0, 1.244018908413837e-57, 2.372151728763692e-55],
            [1.2983800585685595e-35, 1.0, 3.836692075297123e-24],
        ],
        'KNeighborsClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'MLPClassifier': [
            [0.9999992330465266, 2.108350674827178e-08, 7.458699665987544e-07],
            [6.949799904570786e-10, 0.9999171940556058, 8.280524941418183e-05],
        ],
        'LinearSVC': [
            [0.8995782143576087, 0.02511044323694783, 0.07531134240544347],
            [0.03561932795252063, 0.9407083426933305, 0.023672329354149018],
        ],
        'DecisionTreeClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'ExtraTreeClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'DummyClassifier': [[0, 0, 1], [1, 0, 0]],
        'SGDClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'BaggingClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
    }
    # Only the classifiers whose Windows results diverge are listed here.
    predict_probas_windows = {
        'ComplementNB': [
            [1.0, 1.2440189084141198e-57, 2.3721517287642315e-55],
            [1.2983800585685595e-35, 1.0, 3.836692075297123e-24],
        ],
        'MLPClassifier': [
            [0.9999992330465266, 2.108350674827178e-08, 7.458699665987557e-07],
            [6.949799904570761e-10, 0.9999171940556058, 8.280524941418183e-05],
        ],
        'LinearSVC': [
            [0.8995692949536029, 0.025113499736912265, 0.07531720530948487],
            [0.0356197780956943, 0.9407082394988142, 0.02367198240549154],
        ],
    }
    p = classifiers.Pipeline(clfs)
    clfs_names = [f.__name__ for f in p.classifiers]
    clfs_files = ['%s.pkl' % (clf_name) for clf_name in clfs_names]
    roc_files = ['ROC_%s.png' % (clf_name) for clf_name in clfs_names]
    X, y = load_digits(n_class=3, return_X_y=True)
    y = y.tolist()
    assert all([not exists(clf_name) for clf_name in clfs_names])
    try:
        # NOTE(review): {1, 2, 2, 4} is a set literal, so the duplicate 2
        # collapses and start() receives {1, 2, 4} — confirm this is the
        # intended set of accepted-probability counts.
        predictions = p.start(X, y, X, y, -1, {1, 2, 2, 4})
        for clf_name, clf_file in zip_longest(clfs_names, clfs_files):
            predict_proba = [list(d.values()) for d in predictions[clf_name][0:2]]
            assert np.array_equal(predict_probas_linux[clf_name], predict_proba) \
                    or np.array_equal(predict_probas_windows[clf_name], predict_proba)
            assert exists(clf_file)
            clf = pickle_manager.load(clf_file)
            if 'n_jobs' in dir(clf):
                assert clf.n_jobs == -1
            if 'class_weights' in dir(clf):
                assert clf.class_weights is None
        assert all([not exists(roc_file) for roc_file in roc_files])
        # class_weight='balanced' must be forwarded to the stored models.
        p.start(X, y, X, y, -1, {1, 2, 2, 4}, 'balanced')
        for clf_file in clfs_files:
            clf = pickle_manager.load(clf_file)
            if 'class_weights' in dir(clf):
                assert clf.class_weights == 'balanced'
        # The ROC-plot flag must create one plot per classifier.
        p.start(X, y, X, y, -1, {1, 2, 2, 4}, None, True)
        assert all([exists(roc_file) for roc_file in roc_files])
        # A classifier that raises must not break the pipeline.
        classifiers.Pipeline([FailClassifier]).start(X, y, X, y)
        # Without a test set there is nothing to predict.
        predictions = p.start(X, y)
        assert predictions == {'y_true': []}
        # Mismatched X/y lengths must be rejected.
        with pytest.raises(AssertionError):
            p.start(X, y, X, [])
    finally:
        for clf_file in clfs_files:
            remove_and_check(clf_file)
        for roc_file in roc_files:
            remove_and_check(roc_file)
Esempio n. 24
0
def test_predict(client):
    """The prediction endpoint must reject malformed requests (bad text, bad
    or path-traversing classifier names, missing model file) and, for every
    trained classifier, return feature weights and per-class probabilities.

    Module-level prediction_server state is set up here and restored in the
    finally block so other tests see a clean module.
    """
    df = read_excel(utils.example_excel_file)
    docs = data_frame_to_document_list(df)
    prediction_server._text_field = 'Example column'
    prediction_server._class_field = 'Classification column'
    clfs_filenames = []
    try:
        vectorizer_path = utils.create_temporary_file(content=None, text=False)
        # Train a vectorizer on the example corpus to back the server.
        p = Preprocessor()
        p.preprocess(text_field=prediction_server._text_field,
                     preprocessed_data_file=None,
                     docs=docs)
        ft = FeatureExtractor(training_mode=True,
                              vectorizer_file=vectorizer_path)
        corpus, classifications, _, _ = ft.prepare(
            text_field=prediction_server._text_field,
            class_field=prediction_server._class_field,
            preprocessed_data_file=None,
            docs=docs,
            training_mode=True)
        X, y = ft.generate_X_y(corpus, classifications, training_mode=True)
        prediction_server._preprocessor = Preprocessor()
        prediction_server._feature_extractor = FeatureExtractor(
            training_mode=False, vectorizer_file=vectorizer_path)
        # No JSON body at all.
        res = client.post('/', headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        # Non-string text.
        res = client.post('/',
                          json={
                              'text': 1,
                              'classifier': 'LinearSVC'
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid text</p>\n')
        # Non-string classifier.
        res = client.post('/',
                          json={
                              'text': 'Test text.',
                              'classifier': 1
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        # Path traversal in the classifier name must be rejected.
        res = client.post('/',
                          json={
                              'text': 'Test text.',
                              'classifier': '../LinearSVC'
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        # Valid name, but no model file on disk yet.
        res = client.post('/',
                          json={
                              'text': 'Test text.',
                              'classifier': 'LinearSVC'
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(
            res.data).endswith('<p>Invalid classifier model</p>\n')
        # With a trained model on disk, each classifier must answer 200 and
        # be cached in prediction_server._classifiers.
        for f in clfs:
            clf_filename_base = utils.generate_available_filename()
            clf_filename = '%s.pkl' % (clf_filename_base)
            clfs_filenames.append(clf_filename)
            clf = f(n_jobs=1, class_weight=None)
            clf.fit(X, y)
            dump(clf, clf_filename)
            res = client.post('/',
                              json={
                                  'text': 'Test text.',
                                  'classifier': clf_filename_base
                              },
                              headers=valid_headers)
            assert res.status_code == 200
            assert repr(
                prediction_server._classifiers[clf_filename_base]) == repr(clf)
            # Shape check only: values are zeroed before comparison.  Some
            # classifiers expose no per-class feature weights, hence the two
            # accepted shapes.
            assert replace_final_dict_values(res.json, value=0) in [{
                'feature_weights': {
                    'I': {},
                    'II': {},
                    'III': {}
                },
                'probabilities': {
                    'I': 0,
                    'II': 0,
                    'III': 0
                }
            }, {
                'feature_weights': {},
                'probabilities': {
                    'I': 0,
                    'II': 0,
                    'III': 0
                }
            }]
    finally:
        # Remove artifacts and reset all module-level server state.
        utils.remove_and_check(vectorizer_path)
        for clf_filename in clfs_filenames:
            utils.remove_and_check(clf_filename)
        prediction_server._text_field = None
        prediction_server._class_field = None
        prediction_server._preprocessor = None
        prediction_server._feature_extractor = None
        prediction_server._feature_weights = dict()
        prediction_server._classifiers = dict()
        prediction_server._old_handlers = dict()
Esempio n. 25
0
def test_train_test_split():
    """Verify that train_test_split stores and reuses split metadata.

    Three scenarios are checked against the pickled documents file:
    1. Repeated splits (forced or not) reproduce identical metadata.
    2. Metadata with any split field set to None is regenerated to the
       expected values even without forcing.
    3. Metadata whose fields disagree with the requested split is kept
       as-is when force is False.
    """
    text_field = 'Example column'
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    preprocessor = Preprocessor()
    preprocessor.preprocess(text_field, None, docs)
    ft = FeatureExtractor()
    corpus, classifications, _, _ = ft.prepare(
        text_field=text_field,
        class_field='Classification column',
        preprocessed_data_file=None,
        docs=docs,
        training_mode=False)
    test_size = 0.3
    preprocessed_data_file = generate_available_filename()
    force = False
    idxs_to_remove = [5]
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        # Freshly dumped documents carry only the total count.
        assert pickle_manager.get_docs_metadata(preprocessed_data_file) == {
            'total': 10
        }
        desired = {
            'total': 10,
            'test_size': test_size,
            'training_set_indexes': np.array([6, 1, 0, 2, 8, 3]),
            'test_set_indexes': np.array([7, 9, 4])
        }
        # Scenario 1: splitting twice (with and without force) must yield
        # the same stored metadata.
        for my_force in [False, True]:
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, my_force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        # Scenario 2: a None value for any split field must trigger
        # regeneration of the full desired metadata.
        for key in ['test_size', 'training_set_indexes', 'test_set_indexes']:
            m = desired.copy()
            m[key] = None
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        # Scenario 3: mismatched (non-None) metadata is preserved when
        # force is False.
        for key, value in [('test_size', 0.2),
                           ('training_set_indexes', np.array([1, 0, 2, 8,
                                                              3]))]:
            m = desired.copy()
            m[key] = value
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file), m)
    finally:
        remove_and_check(preprocessed_data_file)
Esempio n. 26
0
def test___init__():
    """Exercise every FeatureExtractor constructor option.

    Covers: default attribute values, NLTK stop-word loading, the three
    pickle-backed sklearn vectorizers (training and non-training modes),
    the flair DocumentPoolEmbeddings vectorizer (which must not create a
    vectorizer file), invalid vectorizer/synonyms inputs, adjective
    removal, synonyms loading via ContoPT, and the n_jobs option.
    """
    # Default construction: verify every public attribute's default.
    ft1 = FeatureExtractor()
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        # The constructor is expected to download the stopwords corpus.
        pytest.fail()
    assert ft1.stop_words == set()
    assert ft1.vectorizer_file == 'vectorizer.pkl'
    assert type(ft1.vectorizer) is feature_extraction.text.TfidfVectorizer
    assert ft1.feature_reduction is None
    assert 'initial_code_to_run_on_document' in dir(
        ft1.document_adjustment_code)
    assert ft1.upostags_to_ignore == ['PUNCT']
    assert ft1.synonyms is None
    assert ft1.n_jobs == 1
    # Requesting an NLTK stop-word package populates stop_words.
    ft2 = FeatureExtractor(nltk_stop_words_package='english')
    assert ft2.stop_words == set(nltk.corpus.stopwords.words('english'))
    for vectorizer_name in [
            'CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer'
    ]:
        # Non-training mode requires an existing pickled vectorizer file.
        with pytest.raises(FileNotFoundError):
            FeatureExtractor(vectorizer_name=vectorizer_name,
                             training_mode=False,
                             vectorizer_file=generate_available_filename())
        try:
            # With a valid pickled vectorizer, non-training mode loads it.
            path = create_temporary_file(content=None, text=False)
            pickle_manager.dump(
                FeatureExtractor(vectorizer_name=vectorizer_name).vectorizer,
                path)
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=False,
                                  vectorizer_file=path)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
        finally:
            remove_and_check(path)
        # Training mode builds a fresh vectorizer of the requested class.
        ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                              training_mode=True)
        assert ft.vectorizer.__class__.__name__ == vectorizer_name
    for vectorizer_name in ['DocumentPoolEmbeddings']:
        # Embeddings-based vectorizers are never pickled to disk.
        for training_mode in [True, False]:
            vectorizer_file = generate_available_filename()
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=training_mode,
                                  vectorizer_file=vectorizer_file)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
            assert not exists(vectorizer_file)
    with pytest.raises(ValueError):
        FeatureExtractor(vectorizer_name='invalid_vectorizer',
                         training_mode=True)
    # remove_adjectives extends the ignored POS tags.
    ft3 = FeatureExtractor(remove_adjectives=True)
    assert ft3.upostags_to_ignore == ['PUNCT', 'ADJ']
    # A valid synonyms file is parsed into the same mapping ContoPTParser
    # produces; the downloaded file is cleaned up afterwards.
    synonyms_file = 'contopt_0.1_r2_c0.0.txt'
    filename = generate_available_filename()
    try:
        ft4 = FeatureExtractor(synonyms_file=synonyms_file)
        contoPTParser = ContoPTParser(filename)
        assert ft4.synonyms == contoPTParser.synonyms
    finally:
        remove_and_check(synonyms_file)
        remove_and_check(filename)
    with pytest.raises(ValueError):
        FeatureExtractor(synonyms_file='invalid_file.txt')
    ft5 = FeatureExtractor(n_jobs=2)
    assert ft5.n_jobs == 2
Esempio n. 27
0
def test_preprocess(capsys):
    """Exercise Preprocessor.preprocess in three configurations.

    Covers: in-memory documents without spell checking, in-memory
    documents with an en_US spell checker (which corrects 'Teste' and
    'tikens'), the stop flag raising SystemExit, and documents stored in
    a pickled data file with store_data enabled.
    """
    text_field = 'Test field'
    index = -1
    fields = {text_field: 'Teste\r\nvalue with\ra\nfew tikens. ' * 2}

    def build_analysis(pairs):
        # One analyzed sentence ending in punctuation, duplicated because
        # the raw text repeats twice.
        tokens = [{'form': form, 'lemma': lemma, 'upostag': None}
                  for form, lemma in pairs]
        tokens.append({'form': '.', 'lemma': '.', 'upostag': 'PUNCT'})
        return {text_field: [tokens] * 2}

    analyzed_sentences1 = build_analysis([('Teste', 'teste'),
                                          ('value', 'value'),
                                          ('with', 'with'), ('a', 'a'),
                                          ('few', 'few'),
                                          ('tikens', 'tikens')])
    analyzed_sentences2 = build_analysis([('Test', 'test'),
                                          ('value', 'value'),
                                          ('with', 'with'), ('a', 'a'),
                                          ('few', 'few'),
                                          ('tokens', 'token')])
    cases = [(None, analyzed_sentences1), ('en_US', analyzed_sentences2)]
    for spell_checker_lang, expected_analysis in cases:
        doc = Document(index=index, fields=fields, analyzed_sentences=dict())
        preproc = Preprocessor(spell_checker_lang=spell_checker_lang)
        assert preproc.stop is False
        preproc.preprocess(text_field=text_field,
                           preprocessed_data_file=None,
                           docs=[doc] * 2)
        assert preproc.stop is False
        assert doc.index == index
        assert doc.fields == fields
        assert doc.analyzed_sentences == expected_analysis
        captured = capsys.readouterr()
        assert captured.out == ''
        # Only the final line (after the last carriage return) of the
        # progress bar is inspected.
        final_progress_line = captured.err[captured.err.rfind('\r') + 1:]
        assert final_progress_line.startswith('Preprocessing: 100%|')
        assert captured.err.endswith(('doc/s]\n', 's/doc]\n'))
        # Setting the stop flag must abort the next preprocessing run.
        preproc.stop = True
        with pytest.raises(SystemExit):
            preproc.preprocess(text_field=text_field,
                               preprocessed_data_file=None,
                               docs=[doc] * 2)
        del preproc
        if spell_checker_lang is not None:
            # The spell checker downloads its dictionaries here.
            rmtree('./hunspell')
    docs = [
        Document(index=i, fields=fields, analyzed_sentences=dict())
        for i in range(2)
    ]
    preprocessed_data_file = utils.generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        pickle_manager.check_data(preprocessed_data_file)
        preproc = Preprocessor(store_data=True)
        assert all(doc.analyzed_sentences == dict() for doc in
                   pickle_manager.get_documents(preprocessed_data_file))
        preproc.preprocess(text_field, preprocessed_data_file, None)
        assert all(doc.analyzed_sentences == analyzed_sentences1 for doc in
                   pickle_manager.get_documents(preprocessed_data_file))
        pickle_manager.check_data(preprocessed_data_file)
    finally:
        utils.remove_and_check(preprocessed_data_file)