Example #1
def test_prepare(capsys):
    text_field = 'text field'
    class_field = 'class field'
    quantity = 2
    fields = {text_field: 'Teste value.', class_field: 'c1'}
    analyzed_sentences = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * quantity
    }
    docs1 = [
        Document(index=0, fields=fields,
                 analyzed_sentences=analyzed_sentences),
        Document(index=1, fields=fields, analyzed_sentences=None),
    ]
    synonyms_files = [None, 'contopt_0.1_r2_c0.0.txt']
    expected_corpus_str = [[' '.join(['teste value'] * quantity), ''],
                           [' '.join(['prova value'] * quantity), '']]
    expected_classifications = [[fields[class_field]] * quantity
                                ] * len(synonyms_files)
    expected_idxs_to_remove = [[1]] * len(synonyms_files)
    expected_corpus = [[['teste', 'value'] * quantity, []],
                       [['prova', 'value'] * quantity, []]]
    try:
        filename = generate_available_filename()
        pickle_manager.dump_documents(docs1, filename)
        for i, synonyms_file in enumerate(synonyms_files):
            ft = FeatureExtractor(synonyms_file=synonyms_file)
            for training_mode in [True, False]:
                corpus_str1, classifications1, idxs_to_remove1, corpus1 = ft.prepare(
                    text_field, class_field, None, docs1, training_mode)
                corpus_str2, classifications2, idxs_to_remove2, corpus2 = ft.prepare(
                    text_field, class_field, filename, None, training_mode)
                assert (corpus_str1, classifications1, idxs_to_remove1,
                        corpus1) == (corpus_str2, classifications2,
                                     idxs_to_remove2, corpus2)
                assert corpus_str1 == expected_corpus_str[i]
                assert classifications1 == expected_classifications[i]
                assert idxs_to_remove1 == expected_idxs_to_remove[i]
                assert corpus1 == expected_corpus[i]
                captured = capsys.readouterr()
                assert captured.out == ''
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Preparing to create classification: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
            if synonyms_file is not None:
                remove_and_check(synonyms_file)
    finally:
        remove_and_check(filename)
Example #2
def test__filter():
    text_field = 'text field'
    fields = {text_field: 'Teste value.', 'class field': 'c1'}
    doc = Document(index=-1, fields=fields, analyzed_sentences=None)
    upostags_to_ignore = ['PUNCT']
    assert FeatureExtractor._filter(doc, text_field, upostags_to_ignore) == []
    doc.analyzed_sentences = {
        text_field: [[{
            'form': 'Test',
            'lemma': 'test',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    assert FeatureExtractor._filter(
        doc, text_field, upostags_to_ignore) == ['test', 'value'] * 2
    upostags_to_ignore.clear()
    assert FeatureExtractor._filter(
        doc, text_field, upostags_to_ignore) == ['test', 'value', '.'] * 2
Example #3
def test_MDS():
    X = csr_matrix(np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
    y = np.asarray([0, 1, 2])
    expected_new_X = np.asarray([[0.48395794, 5.12192566],
                                 [1.00066606, -0.1960522],
                                 [-1.484624, -4.92587346]])
    new_X1, new_y1 = FeatureExtractor.MDS(X, y)
    assert new_X1.shape == (X.shape[0], 2)
    assert np.allclose(expected_new_X, new_X1)
    assert np.array_equal(y, new_y1)
    new_X2, new_y2 = FeatureExtractor.MDS(X, y)
    assert np.array_equal(new_X1, new_X2)
    assert np.array_equal(new_y1, new_y2)
Example #4
def test_load_feature_weights():
    corpus = [['Test corpus 1.', 'Test corpus 2.'],
              ['Test corpus 1.', 'Test corpus 2.', 'Test corpus 3.']]
    classifications = [[1, 2], [1, 2, 3]]
    value = 0
    lemmas_sets = [{('corpus', value), ('test', value), ('1.', value),
                    ('2.', value)},
                   {('corpus', value), ('test', value), ('1.', value),
                    ('2.', value), ('3.', value)}]
    classes_dicts = [{
        0: lemmas_sets[0]
    }, {
        1: lemmas_sets[1],
        2: lemmas_sets[1],
        3: lemmas_sets[1]
    }]
    fe_hashing = FeatureExtractor(vectorizer_name='HashingVectorizer')
    fe_tfidf = FeatureExtractor(vectorizer_name='TfidfVectorizer')
    for i in range(2):
        X = fe_tfidf.vectorizer.fit_transform(corpus[i])
        lemmas_set = lemmas_sets[i]
        classes_dict = classes_dicts[i]
        expected_values = {
            'RandomForestClassifier': lemmas_set,
            'BernoulliNB': classes_dict,
            'MultinomialNB': classes_dict,
            'ComplementNB': classes_dict,
            'KNeighborsClassifier': set(),
            'MLPClassifier': set(),
            'LinearSVC_proba': classes_dict,
            'DecisionTreeClassifier': lemmas_set,
            'ExtraTreeClassifier': lemmas_set,
            'DummyClassifier': set(),
            'SGDClassifier': classes_dict,
            'BaggingClassifier': set()
        }
        assert len(clfs) == len(expected_values)
        for f in clfs:
            clf = f(n_jobs=1, class_weight=None)
            clf_name = clf.__class__.__name__
            clf.fit(X, classifications[i])
            prediction_server._feature_extractor = fe_hashing
            assert prediction_server.load_feature_weights(clf) == set()
            clf.fit(X, classifications[i])
            prediction_server._feature_extractor = fe_tfidf
            assert replace_tuples_values(
                prediction_server.load_feature_weights(clf),
                value=value) == expected_values[clf_name]
Example #5
def test_main(monkeypatch):
    parameters = Parameters(utils.config_file)
    with pytest.raises(SystemExit):
        prediction_server.main(parameters, 1024)
    with monkeypatch.context() as m:
        m.setattr("gevent.pywsgi.WSGIServer.serve_forever",
                  lambda stop_timeout: None)
        try:
            vectorizer_file = 'vectorizer.pkl'
            dump(
                FeatureExtractor(vectorizer_name='TfidfVectorizer').vectorizer,
                vectorizer_file)
            assert prediction_server._old_handlers == dict()
            assert prediction_server.logger.disabled is False
            prediction_server.main(parameters, 1025)
            assert prediction_server.logger.disabled is True
            assert prediction_server._text_field == 'Example column'
            assert prediction_server._class_field == 'Classification column'
            assert prediction_server._preprocessor.mosestokenizer_language_code == 'en'
            assert prediction_server._preprocessor.store_data is False
            assert prediction_server._preprocessor.spell_checker is None
            #assert prediction_server._preprocessor.spell_checker.hunspell.max_threads == cpu_count()
            assert len(prediction_server._feature_extractor.stop_words) > 0
            assert prediction_server._feature_extractor.feature_reduction is None
            assert prediction_server._feature_extractor.document_adjustment_code.__file__ == abspath(
                'text_categorizer/document_updater.py')
            assert prediction_server._feature_extractor.synonyms is None
            assert prediction_server._feature_extractor.vectorizer_file == vectorizer_file
            assert prediction_server._feature_extractor.n_jobs == cpu_count()
            assert prediction_server._old_handlers == dict()
            m.setattr(
                "text_categorizer.prediction_server._reset_signal_handlers",
                lambda: None)
            prediction_server.main(parameters, 1025)
            assert len(prediction_server._old_handlers) == 1
            prediction_server._old_handlers.clear()
            assert type(prediction_server.app.wsgi_app) is WSGIServer
            assert prediction_server.app.wsgi_app.started is False
            assert prediction_server.app.wsgi_app.closed is True
            for sig in constants.stop_signals:
                prediction_server.app.wsgi_app.start()
                assert prediction_server.app.wsgi_app.started is True
                prediction_server._signal_handler(sig=sig, frame=None)
                assert prediction_server.app.wsgi_app.closed is True
            for sig in constants.stop_signals * 2:
                prediction_server.app.wsgi_app.start()
                assert prediction_server.app.wsgi_app.started is True
                prediction_server._signal_handler(sig=sig, frame=None)
                assert prediction_server.app.wsgi_app.closed is True
            for sig in [signal.SIGILL]:
                assert sig not in constants.stop_signals
                prediction_server.app.wsgi_app.start()
                assert prediction_server.app.wsgi_app.started is True
                prediction_server._signal_handler(sig=sig, frame=None)
                assert prediction_server.app.wsgi_app.closed is False
        finally:
            utils.remove_and_check(vectorizer_file)
Example #6
def test_generate_X_y(capsys):
    quantity = 2
    corpus = ['Test lemma 1 . ' * quantity, 'Test lemma 2 . ' * quantity]
    classifications = [1, 2]
    filename = generate_available_filename()
    dpe_out = 'Please, ignore the message above indicating that the sentence is too long. The problem has been solved.\n' * 6
    combinations = [
        ('CountVectorizer', None, True, ''),
        ('CountVectorizer', 'LDA', True, ''),
        ('CountVectorizer', 'MDS', True, ''),
        ('HashingVectorizer', None, True, ''),
        ('HashingVectorizer', 'MDS', True, ''),
        ('TfidfVectorizer', None, True, ''),
        ('TfidfVectorizer', 'LDA', True, ''),
        ('TfidfVectorizer', 'MDS', True, ''),
        ('DocumentPoolEmbeddings', None, False, dpe_out),
        ('DocumentPoolEmbeddings', 'MDS', False, dpe_out),
    ]
    for vectorizer, fr, expect_file, expected_out in combinations:
        try:
            ft = FeatureExtractor(vectorizer_name=vectorizer,
                                  feature_reduction=fr,
                                  vectorizer_file=filename)
            for training_mode in [True, False]:
                assert exists(filename) is (not training_mode and expect_file)
                _X, y = ft.generate_X_y(corpus, classifications, training_mode)
                assert exists(filename) is expect_file
                assert y == classifications
                captured = capsys.readouterr()
                assert captured.out == expected_out
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Extracting features: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
        finally:
            if expect_file:
                remove_and_check(filename)
            if fr == 'LDA':
                remove_and_check('LatentDirichletAllocation.pkl')
    with pytest.raises(ValueError):
        FeatureExtractor(feature_reduction='invalid',
                         vectorizer_file=filename).generate_X_y(
                             corpus, classifications)
    remove_and_check(filename)
Example #7
def test_get_feature_weights():
    corpus = ['Test corpus 1.', 'Test corpus 2.', 'Test corpus 3.']
    classifications = [1, 2, 3]
    fe = FeatureExtractor()
    X = fe.vectorizer.fit_transform(corpus)
    prediction_server._feature_extractor = fe
    lemmas = ['test', 'corpus', '1.']
    value = 0
    lemmas_dict = {'1.': value, 'corpus': value, 'test': value}
    classes_dict = {1: lemmas_dict, 2: lemmas_dict, 3: lemmas_dict}
    expected_values_1 = {
        'RandomForestClassifier': lemmas_dict,
        'BernoulliNB': classes_dict,
        'MultinomialNB': classes_dict,
        'ComplementNB': classes_dict,
        'KNeighborsClassifier': dict(),
        'MLPClassifier': dict(),
        'LinearSVC_proba': classes_dict,
        'DecisionTreeClassifier': lemmas_dict,
        'ExtraTreeClassifier': lemmas_dict,
        'DummyClassifier': dict(),
        'SGDClassifier': classes_dict,
        'BaggingClassifier': dict()
    }
    lemmas_set = {('corpus', value), ('test', value), ('1.', value),
                  ('2.', value), ('3.', value)}
    classes_dict = {1: lemmas_set, 2: lemmas_set, 3: lemmas_set}
    expected_values_2 = {
        'RandomForestClassifier': lemmas_set,
        'BernoulliNB': classes_dict,
        'MultinomialNB': classes_dict,
        'ComplementNB': classes_dict,
        'KNeighborsClassifier': set(),
        'MLPClassifier': set(),
        'LinearSVC_proba': classes_dict,
        'DecisionTreeClassifier': lemmas_set,
        'ExtraTreeClassifier': lemmas_set,
        'DummyClassifier': set(),
        'SGDClassifier': classes_dict,
        'BaggingClassifier': set()
    }
    assert len(clfs) == len(expected_values_1)
    assert len(clfs) == len(expected_values_2)
    for f in clfs:
        clf = f(n_jobs=1, class_weight=None)
        clf_name = clf.__class__.__name__
        clf.fit(X, classifications)
        expected_value_1 = expected_values_1[clf_name]
        expected_value_2 = expected_values_2[clf_name]
        assert prediction_server._feature_weights.get(clf_name) is None
        fw1 = prediction_server.get_feature_weights(clf,
                                                    lemmas)['feature_weights']
        assert replace_final_dict_values(fw1, value=value) == expected_value_1
        fw2 = prediction_server._feature_weights[clf_name]
        assert replace_tuples_values(fw2, value=value) == expected_value_2
Example #8
def main(parameters, port):
    global _text_field, _class_field, _preprocessor, _feature_extractor
    limit_port = 1024
    if port <= limit_port:
        print("Please, indicate a port higher than %s." % (limit_port))
        quit()
    logger.disabled = True
    _text_field = parameters.excel_column_with_text_data
    _class_field = parameters.excel_column_with_classification_data
    _preprocessor = Preprocessor(
        mosestokenizer_language_code=parameters.mosestokenizer_language_code,
        store_data=False,
        spell_checker_lang=parameters.spell_checker_lang,
        n_jobs=parameters.number_of_jobs)
    _feature_extractor = FeatureExtractor(
        nltk_stop_words_package=parameters.nltk_stop_words_package,
        vectorizer_name=parameters.vectorizer,
        training_mode=False,
        feature_reduction=parameters.feature_reduction,
        document_adjustment_code=parameters.document_adjustment_code,
        remove_adjectives=parameters.remove_adjectives,
        synonyms_file=parameters.synonyms_file,
        n_jobs=parameters.number_of_jobs)
    # '0.0.0.0' allows access from any network.
    app.wsgi_app = WSGIServer(('0.0.0.0', port), app.wsgi_app,
                              spawn=Pool(size=None))
    _set_signal_handlers()
    app.wsgi_app.serve_forever()
    _reset_signal_handlers()
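For reference, below is a minimal client-side sketch of querying the server started by main above, based on the request shape exercised in test_predict (Example #15). The URL, port, and classifier name are illustrative assumptions, and the extra headers the tests pass (valid_headers) are not shown in this listing, so a real deployment may require them.

import requests

# Hypothetical request; 'text' and 'classifier' are the JSON keys used by
# test_predict (Example #15), and port 1025 matches the value used in test_main.
response = requests.post('http://localhost:1025/',
                         json={'text': 'Test text.', 'classifier': 'LinearSVC'})
if response.status_code == 200:
    # On success the response carries 'feature_weights' and 'probabilities'
    # entries, as asserted in test_predict.
    print(response.json())
else:
    print(response.status_code, response.text)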
Example #9
def test_LatentDirichletAllocation():
    X = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    y = np.asarray([0, 1, 2])
    expected_new_X = np.asarray([[
        0.01428573, 0.87142845, 0.01428573, 0.01428573, 0.01428573, 0.01428573,
        0.01428573, 0.01428573, 0.01428573, 0.01428573
    ],
                                 [
                                     0.00625001, 0.94374995, 0.00625001,
                                     0.00625001, 0.00625001, 0.00625001,
                                     0.00625001, 0.00625001, 0.00625001,
                                     0.00625001
                                 ],
                                 [
                                     0.00400000, 0.96399997, 0.00400000,
                                     0.00400000, 0.00400000, 0.00400000,
                                     0.00400000, 0.00400000, 0.00400000,
                                     0.00400000
                                 ]])
    filename = generate_available_filename()
    assert not exists(filename)
    try:
        new_X1, new_y1 = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        assert exists(filename)
        assert new_X1.shape == (X.shape[0], 10)
        assert np.allclose(expected_new_X, new_X1)
        assert np.array_equal(y, new_y1)
        mtime = getmtime(filename)
        new_X2, new_y2 = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        assert getmtime(filename) == mtime
        assert np.array_equal(new_X1, new_X2)
        assert np.array_equal(new_y1, new_y2)
    finally:
        remove_and_check(filename)
Example #10
def main(parameters):
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [functions.get_local_time_str()]
    logger.debug("Starting execution.")
    if basename(parameters.excel_file) == '20newsgroups':
        parameters = load_20newsgroups(parameters)
    if parameters.preprocess_data:
        if not isfile(parameters.excel_file) and not isfile(parameters.preprocessed_data_file):
            logger.error("Please, provide a valid Excel file or a valid preprocessed data file.")
            quit()
        if not isfile(parameters.preprocessed_data_file) and isfile(parameters.excel_file):
            logger.info("Loading Excel file.")
            data_frame = pd.read_excel(parameters.excel_file)
            data_frame = data_frame.fillna("NaN")
            logger.info("Creating documents.")
            docs = functions.data_frame_to_document_list(data_frame)
            logger.info("Storing generated documents.")
            pickle_manager.dump_documents(docs, parameters.preprocessed_data_file)
        logger.info("Preprocessing documents.")
        preprocessor = Preprocessor(
            mosestokenizer_language_code=parameters.mosestokenizer_language_code,
            store_data=True,
            spell_checker_lang=parameters.spell_checker_lang,
            n_jobs=parameters.number_of_jobs)
        preprocessor.preprocess(
            text_field=parameters.excel_column_with_text_data,
            preprocessed_data_file=parameters.preprocessed_data_file)
        logger.info("Checking generated data.")
        pickle_manager.check_data(parameters.preprocessed_data_file)
    else:
        if not isfile(parameters.preprocessed_data_file):
            logger.error("The indicated preprocessed data file does not exist.")
            quit()
    logger.info("Extracting features and splitting dataset into training and test subsets.")
    feature_extractor = FeatureExtractor(
        nltk_stop_words_package=parameters.nltk_stop_words_package,
        vectorizer_name=parameters.vectorizer,
        training_mode=True,
        feature_reduction=parameters.feature_reduction,
        document_adjustment_code=parameters.document_adjustment_code,
        remove_adjectives=parameters.remove_adjectives,
        synonyms_file=parameters.synonyms_file,
        n_jobs=parameters.number_of_jobs)
    corpus, classifications, idxs_to_remove, _docs_lemmas = feature_extractor.prepare(
        text_field=parameters.excel_column_with_text_data,
        class_field=parameters.excel_column_with_classification_data,
        preprocessed_data_file=parameters.preprocessed_data_file)
    if parameters.final_training:
        X_train, y_train = feature_extractor.generate_X_y(corpus, classifications, training_mode=True)
    else:
        corpus_train, corpus_test, classifications_train, classifications_test = train_test_split(
            corpus, classifications, parameters.test_subset_size,
            parameters.preprocessed_data_file,
            parameters.force_subsets_regeneration, idxs_to_remove)
        X_train, y_train = feature_extractor.generate_X_y(corpus_train, classifications_train, training_mode=True)
        X_test, y_test = feature_extractor.generate_X_y(corpus_test, classifications_test, training_mode=False)
    X_train, y_train = resample(parameters.resampling, X_train, y_train)
    logger.info("Running classifiers.")
    p = classifiers.Pipeline(parameters.classifiers)
    logger.info("Accuracies:")
    if parameters.final_training:
        p.start(X_train, y_train,
                n_jobs=parameters.number_of_jobs,
                set_n_accepted_probs=parameters.set_num_accepted_probs,
                class_weight=parameters.class_weights,
                generate_roc_plots=parameters.generate_roc_plots)
    else:
        predictions_dict = p.start(X_train, y_train, X_test, y_test,
                                   parameters.number_of_jobs,
                                   parameters.set_num_accepted_probs,
                                   parameters.class_weights,
                                   parameters.generate_roc_plots)
        dump_json(predictions_dict, 'predictions.json')
    execution_info['End date'] = [functions.get_local_time_str()]
    logger.debug("Execution completed.")
    if not parameters.final_training:
        functions.generate_report(execution_info, parameters.__dict__, predictions_dict)
Example #11
def test_train_test_split():
    text_field = 'Example column'
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    preprocessor = Preprocessor()
    preprocessor.preprocess(text_field, None, docs)
    ft = FeatureExtractor()
    corpus, classifications, _, _ = ft.prepare(
        text_field=text_field,
        class_field='Classification column',
        preprocessed_data_file=None,
        docs=docs,
        training_mode=False)
    test_size = 0.3
    preprocessed_data_file = generate_available_filename()
    force = False
    idxs_to_remove = [5]
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        assert pickle_manager.get_docs_metadata(preprocessed_data_file) == {
            'total': 10
        }
        desired = {
            'total': 10,
            'test_size': test_size,
            'training_set_indexes': np.array([6, 1, 0, 2, 8, 3]),
            'test_set_indexes': np.array([7, 9, 4])
        }
        for my_force in [False, True]:
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, my_force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        for key in ['test_size', 'training_set_indexes', 'test_set_indexes']:
            m = desired.copy()
            m[key] = None
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        for key, value in [('test_size', 0.2),
                           ('training_set_indexes', np.array([1, 0, 2, 8,
                                                              3]))]:
            m = desired.copy()
            m[key] = value
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file), m)
    finally:
        remove_and_check(preprocessed_data_file)
Example #12
def test__find_incompatible_data_indexes():
    corpus = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    classifications = [20, 20, 22, 22, 24, 24, 26, 27, 28, 28]
    idxs_to_remove = FeatureExtractor._find_incompatible_data_indexes(
        corpus, classifications)
    assert idxs_to_remove == [6, 7]
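A plausible reading of the expected result above: classes 26 and 27 each occur only once in classifications, and the documents at indexes 6 and 7 are the ones flagged for removal. The sketch below re-implements that interpretation for illustration only; it is an assumption about what _find_incompatible_data_indexes checks, not the project's actual code.

from collections import Counter

def find_single_occurrence_indexes(corpus, classifications):
    # Hypothetical helper: flag the positions whose classification value
    # appears exactly once, mirroring the behaviour asserted above.
    counts = Counter(classifications)
    return [i for i, c in enumerate(classifications) if counts[c] == 1]

corpus = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
classifications = [20, 20, 22, 22, 24, 24, 26, 27, 28, 28]
print(find_single_occurrence_indexes(corpus, classifications))  # [6, 7]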
Example #13
def test__generate_corpus():
    lemmas = ['lemma1', 'lemma2']
    corpus = FeatureExtractor._generate_corpus(lemmas)
    assert corpus == ' '.join(lemmas)
Example #14
def test___init__():
    ft1 = FeatureExtractor()
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        pytest.fail()
    assert ft1.stop_words == set()
    assert ft1.vectorizer_file == 'vectorizer.pkl'
    assert type(ft1.vectorizer) is feature_extraction.text.TfidfVectorizer
    assert ft1.feature_reduction is None
    assert 'initial_code_to_run_on_document' in dir(
        ft1.document_adjustment_code)
    assert ft1.upostags_to_ignore == ['PUNCT']
    assert ft1.synonyms is None
    assert ft1.n_jobs == 1
    ft2 = FeatureExtractor(nltk_stop_words_package='english')
    assert ft2.stop_words == set(nltk.corpus.stopwords.words('english'))
    for vectorizer_name in [
            'CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer'
    ]:
        with pytest.raises(FileNotFoundError):
            FeatureExtractor(vectorizer_name=vectorizer_name,
                             training_mode=False,
                             vectorizer_file=generate_available_filename())
        try:
            path = create_temporary_file(content=None, text=False)
            pickle_manager.dump(
                FeatureExtractor(vectorizer_name=vectorizer_name).vectorizer,
                path)
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=False,
                                  vectorizer_file=path)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
        finally:
            remove_and_check(path)
        ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                              training_mode=True)
        assert ft.vectorizer.__class__.__name__ == vectorizer_name
    for vectorizer_name in ['DocumentPoolEmbeddings']:
        for training_mode in [True, False]:
            vectorizer_file = generate_available_filename()
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=training_mode,
                                  vectorizer_file=vectorizer_file)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
            assert not exists(vectorizer_file)
    with pytest.raises(ValueError):
        FeatureExtractor(vectorizer_name='invalid_vectorizer',
                         training_mode=True)
    ft3 = FeatureExtractor(remove_adjectives=True)
    assert ft3.upostags_to_ignore == ['PUNCT', 'ADJ']
    synonyms_file = 'contopt_0.1_r2_c0.0.txt'
    filename = generate_available_filename()
    try:
        ft4 = FeatureExtractor(synonyms_file=synonyms_file)
        contoPTParser = ContoPTParser(filename)
        assert ft4.synonyms == contoPTParser.synonyms
    finally:
        remove_and_check(synonyms_file)
        remove_and_check(filename)
    with pytest.raises(ValueError):
        FeatureExtractor(synonyms_file='invalid_file.txt')
    ft5 = FeatureExtractor(n_jobs=2)
    assert ft5.n_jobs == 2
Example #15
def test_predict(client):
    df = read_excel(utils.example_excel_file)
    docs = data_frame_to_document_list(df)
    prediction_server._text_field = 'Example column'
    prediction_server._class_field = 'Classification column'
    clfs_filenames = []
    try:
        vectorizer_path = utils.create_temporary_file(content=None, text=False)
        p = Preprocessor()
        p.preprocess(text_field=prediction_server._text_field,
                     preprocessed_data_file=None,
                     docs=docs)
        ft = FeatureExtractor(training_mode=True,
                              vectorizer_file=vectorizer_path)
        corpus, classifications, _, _ = ft.prepare(
            text_field=prediction_server._text_field,
            class_field=prediction_server._class_field,
            preprocessed_data_file=None,
            docs=docs,
            training_mode=True)
        X, y = ft.generate_X_y(corpus, classifications, training_mode=True)
        prediction_server._preprocessor = Preprocessor()
        prediction_server._feature_extractor = FeatureExtractor(
            training_mode=False, vectorizer_file=vectorizer_path)
        res = client.post('/', headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        res = client.post('/',
                          json={
                              'text': 1,
                              'classifier': 'LinearSVC'
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid text</p>\n')
        res = client.post('/',
                          json={
                              'text': 'Test text.',
                              'classifier': 1
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        res = client.post('/',
                          json={
                              'text': 'Test text.',
                              'classifier': '../LinearSVC'
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        res = client.post('/',
                          json={
                              'text': 'Test text.',
                              'classifier': 'LinearSVC'
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(
            res.data).endswith('<p>Invalid classifier model</p>\n')
        for f in clfs:
            clf_filename_base = utils.generate_available_filename()
            clf_filename = '%s.pkl' % (clf_filename_base)
            clfs_filenames.append(clf_filename)
            clf = f(n_jobs=1, class_weight=None)
            clf.fit(X, y)
            dump(clf, clf_filename)
            res = client.post('/',
                              json={
                                  'text': 'Test text.',
                                  'classifier': clf_filename_base
                              },
                              headers=valid_headers)
            assert res.status_code == 200
            assert repr(
                prediction_server._classifiers[clf_filename_base]) == repr(clf)
            assert replace_final_dict_values(res.json, value=0) in [{
                'feature_weights': {
                    'I': {},
                    'II': {},
                    'III': {}
                },
                'probabilities': {
                    'I': 0,
                    'II': 0,
                    'III': 0
                }
            }, {
                'feature_weights': {},
                'probabilities': {
                    'I': 0,
                    'II': 0,
                    'III': 0
                }
            }]
    finally:
        utils.remove_and_check(vectorizer_path)
        for clf_filename in clfs_filenames:
            utils.remove_and_check(clf_filename)
        prediction_server._text_field = None
        prediction_server._class_field = None
        prediction_server._preprocessor = None
        prediction_server._feature_extractor = None
        prediction_server._feature_weights = dict()
        prediction_server._classifiers = dict()
        prediction_server._old_handlers = dict()