def test_prepare(capsys):
    """Check FeatureExtractor.prepare() with and without a synonyms file.

    Verifies that preparing from an in-memory document list and from a
    pickled documents file yield identical results, that documents without
    analyzed sentences are flagged for removal, and that progress output
    goes only to stderr.
    """
    text_field = 'text field'
    class_field = 'class field'
    quantity = 2
    fields = {text_field: 'Teste value.', class_field: 'c1'}
    analyzed_sentences = {
        text_field: [[{
            'form': 'Teste', 'lemma': 'teste', 'upostag': None
        }, {
            'form': 'value', 'lemma': 'value', 'upostag': None
        }, {
            'form': '.', 'lemma': '.', 'upostag': 'PUNCT'
        }]] * quantity
    }
    docs1 = [
        Document(index=0, fields=fields, analyzed_sentences=analyzed_sentences),
        # Document without analysis: expected to be reported in idxs_to_remove.
        Document(index=1, fields=fields, analyzed_sentences=None),
    ]
    synonyms_files = [None, 'contopt_0.1_r2_c0.0.txt']
    # With the ContoPT synonyms file, 'teste' is expected to be replaced by 'prova'.
    expected_corpus_str = [[' '.join(['teste value'] * quantity), ''],
                           [' '.join(['prova value'] * quantity), '']]
    expected_classifications = [[fields[class_field]] * quantity] * len(synonyms_files)
    expected_idxs_to_remove = [[1]] * len(synonyms_files)
    expected_corpus = [[['teste', 'value'] * quantity, []],
                       [['prova', 'value'] * quantity, []]]
    # Create the pickle file name before the try block so the finally clause
    # cannot raise NameError if generate_available_filename() fails.
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        for i, synonyms_file in enumerate(synonyms_files):
            ft = FeatureExtractor(synonyms_file=synonyms_file)
            for training_mode in [True, False]:
                # Prepare once from the in-memory docs and once from the file.
                corpus_str1, classifications1, idxs_to_remove1, corpus1 = ft.prepare(
                    text_field, class_field, None, docs1, training_mode)
                corpus_str2, classifications2, idxs_to_remove2, corpus2 = ft.prepare(
                    text_field, class_field, filename, None, training_mode)
                assert (corpus_str1, classifications1, idxs_to_remove1, corpus1) == \
                    (corpus_str2, classifications2, idxs_to_remove2, corpus2)
                assert corpus_str1 == expected_corpus_str[i]
                assert classifications1 == expected_classifications[i]
                assert idxs_to_remove1 == expected_idxs_to_remove[i]
                assert corpus1 == expected_corpus[i]
                captured = capsys.readouterr()
                assert captured.out == ''
                # Progress bar is written to stderr; inspect text after the last '\r'.
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Preparing to create classification: 100%|')
                assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
            if synonyms_file is not None:
                remove_and_check(synonyms_file)
    finally:
        remove_and_check(filename)
def test__generate_file():
    """Check that pickle_manager._generate_file() creates a fresh file.

    The creation time of the generated file must be (approximately) "now".
    """
    # Initialize to None so the finally clause cannot raise NameError when
    # _generate_file() itself fails before assigning `filename`.
    filename = None
    try:
        filename = pickle_manager._generate_file()
        assert exists(filename)
        assert pytest.approx(getctime(filename)) == pytest.approx(time())
    finally:
        if filename is not None:
            remove_and_check(filename)
def test_dump_documents(capsys):
    """Check pickle_manager.dump_documents() error path and round trip.

    Dumping to an already-existing file must raise; dumping to a fresh file
    must store metadata plus all documents, with progress only on stderr.
    """
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    # Create the temp file before the try block so the finally clause cannot
    # raise NameError if create_temporary_file() fails.
    path = create_temporary_file(content=None, text=False)
    try:
        # Dumping over an existing file is expected to fail.
        with pytest.raises(Exception):
            pickle_manager.dump_documents(docs1, path)
    finally:
        remove_and_check(path)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
        docs2 = list(pickle_manager.get_documents(filename))
        assert len(metadata) == 1
        assert metadata['total'] == len(docs1)
        # zip_longest catches length mismatches (an extra doc pairs with None).
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
    finally:
        remove_and_check(filename)
    captured = capsys.readouterr()
    assert captured.out == ''
    assert captured.err[captured.err.rfind('\r') + 1:].startswith('Storing documents: 100%|')
    assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
def test_check_data(capsys):
    """Check pickle_manager.check_data() on valid and corrupted data files.

    A freshly dumped documents file must pass the check; a file whose
    payload is not Document instances, or whose metadata total is wrong,
    must raise AssertionError.
    """
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, filename)
        pickle_manager.check_data(filename)
        captured = capsys.readouterr()
        assert captured.out == ''
        # Progress bar goes to stderr; inspect the final line after the last '\r'.
        assert captured.err[captured.err.rfind('\r') + 1:].startswith('Checking data: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
        count = 10
        metadata1 = {'total': count}
        # Overwrite the file with ints instead of Document objects.
        pda1 = pickle_manager.PickleDumpAppend(metadata1, filename)
        for not_Document in range(count):
            pda1.dump_append(not_Document)
        pda1.close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
        # Metadata claiming a negative total must also fail the check.
        metadata2 = {'total': -1}
        pickle_manager.PickleDumpAppend(metadata2, filename).close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
    finally:
        remove_and_check(filename)
def test_generate_report():
    """Check functions.generate_report() creating and appending to a report.

    Runs twice: first with no existing report file, then appending a second
    row, verifying the accumulated DataFrame and the written Excel files.
    """
    execution_info = pd.DataFrame.from_dict({
        'Start': [functions.get_local_time_str()],
        'End': [functions.get_local_time_str()],
    })
    parameters_dict = Parameters(utils.config_file).__dict__
    predictions_dict = {
        'y_true': ['label1'],
        'classifier_key': [{'label1': 0.0, 'label2': 1.0}],
    }
    # Build the expected single-row frame with a scalar value first...
    parameters_dict['set_num_accepted_probs'] = 1
    expected_df_row0 = pd.concat([
        execution_info,
        functions.parameters_to_data_frame(parameters_dict),
        functions.predictions_to_data_frame(predictions_dict, 1),
    ], axis=1)
    # ...then restore the set form used by generate_report().
    parameters_dict['set_num_accepted_probs'] = {1}
    excel_file1 = utils.generate_available_filename(ext='.xlsx')
    excel_file2 = utils.generate_available_filename(ext='.xlsx')
    expected_df = pd.DataFrame()
    try:
        for i, file_exists in enumerate([False, True]):
            assert exists(excel_file1) is file_exists
            df = functions.generate_report(execution_info, parameters_dict,
                                           predictions_dict, excel_file1)
            df.to_excel(excel_file2, index=False)
            assert df.shape == (i + 1, 44)
            # Each run appends one more copy of the expected row.
            expected_df = pd.concat([expected_df, expected_df_row0])
            # NOTE(review): pd.util.testing is deprecated in modern pandas;
            # pd.testing.assert_frame_equal is the supported path — confirm
            # the pinned pandas version before migrating.
            pd.util.testing.assert_frame_equal(df, expected_df)
            pd.util.testing.assert_frame_equal(pd.read_excel(excel_file1),
                                               pd.read_excel(excel_file2))
    finally:
        utils.remove_and_check(excel_file1)
        utils.remove_and_check(excel_file2)
def test_dump_and_load():
    """Check that pickle_manager.dump()/load() round-trips a value."""
    obj1 = random()
    # Create the temp file before the try block so the finally clause cannot
    # raise NameError if create_temporary_file() fails.
    path = create_temporary_file(content=None, text=False)
    try:
        pickle_manager.dump(obj1, path)
        obj2 = pickle_manager.load(path)
    finally:
        remove_and_check(path)
    assert obj1 == obj2
def test_main(monkeypatch):
    """Check prediction_server.main(): startup, module state, signal handling.

    Port 1024 is expected to be rejected (SystemExit). With serve_forever
    patched out, main() on port 1025 must populate the module-level state,
    and the signal handler must close the WSGI server only for the
    configured stop signals.
    """
    parameters = Parameters(utils.config_file)
    with pytest.raises(SystemExit):
        prediction_server.main(parameters, 1024)
    with monkeypatch.context() as m:
        # Prevent the server from actually blocking on serve_forever().
        m.setattr("gevent.pywsgi.WSGIServer.serve_forever",
                  lambda stop_timeout: None)
        try:
            vectorizer_file = 'vectorizer.pkl'
            dump(FeatureExtractor(vectorizer_name='TfidfVectorizer').vectorizer,
                 vectorizer_file)
            assert prediction_server._old_handlers == dict()
            assert prediction_server.logger.disabled is False
            prediction_server.main(parameters, 1025)
            # main() must have configured the module-level globals.
            assert prediction_server.logger.disabled is True
            assert prediction_server._text_field == 'Example column'
            assert prediction_server._class_field == 'Classification column'
            assert prediction_server._preprocessor.mosestokenizer_language_code == 'en'
            assert prediction_server._preprocessor.store_data is False
            assert prediction_server._preprocessor.spell_checker is None
            #assert prediction_server._preprocessor.spell_checker.hunspell.max_threads == cpu_count()
            assert len(prediction_server._feature_extractor.stop_words) > 0
            assert prediction_server._feature_extractor.feature_reduction is None
            assert prediction_server._feature_extractor.document_adjustment_code.__file__ == abspath(
                'text_categorizer/document_updater.py')
            assert prediction_server._feature_extractor.synonyms is None
            assert prediction_server._feature_extractor.vectorizer_file == vectorizer_file
            assert prediction_server._feature_extractor.n_jobs == cpu_count()
            assert prediction_server._old_handlers == dict()
            # With handler restoration disabled, the installed handlers remain.
            m.setattr("text_categorizer.prediction_server._reset_signal_handlers",
                      lambda: None)
            prediction_server.main(parameters, 1025)
            assert len(prediction_server._old_handlers) == 1
            prediction_server._old_handlers.clear()
            assert type(prediction_server.app.wsgi_app) is WSGIServer
            assert prediction_server.app.wsgi_app.started is False
            assert prediction_server.app.wsgi_app.closed is True
            # Every configured stop signal must close the server.
            for sig in constants.stop_signals:
                prediction_server.app.wsgi_app.start()
                assert prediction_server.app.wsgi_app.started is True
                prediction_server._signal_handler(sig=sig, frame=None)
                assert prediction_server.app.wsgi_app.closed is True
            for sig in constants.stop_signals * 2:
                prediction_server.app.wsgi_app.start()
                assert prediction_server.app.wsgi_app.started is True
                prediction_server._signal_handler(sig=sig, frame=None)
                assert prediction_server.app.wsgi_app.closed is True
            # A non-stop signal (SIGILL) must leave the server running.
            for sig in [signal.SIGILL]:
                assert sig not in constants.stop_signals
                prediction_server.app.wsgi_app.start()
                assert prediction_server.app.wsgi_app.started is True
                prediction_server._signal_handler(sig=sig, frame=None)
                assert prediction_server.app.wsgi_app.closed is False
        finally:
            utils.remove_and_check(vectorizer_file)
def test_dump_json():
    """Check that trainer.dump_json() writes JSON that loads back equal."""
    d1 = {'test_random_values': [np.random.random()]}
    filename = utils.generate_available_filename()
    try:
        trainer.dump_json(d1, filename)
        # Use a context manager: the original closed `f` in the finally
        # clause, which raised NameError if dump_json() failed before open().
        with open(filename, 'r') as f:
            d2 = json.load(f)
    finally:
        utils.remove_and_check(filename)
    assert d1 == d2
def test_get_docs_metadata():
    """Check that get_docs_metadata() returns the stored metadata dict."""
    documents = data_frame_to_document_list(read_excel(example_excel_file))
    pickle_file = generate_available_filename()
    try:
        pickle_manager.dump_documents(documents, pickle_file)
        metadata = pickle_manager.get_docs_metadata(pickle_file)
    finally:
        remove_and_check(pickle_file)
    # The metadata must be a plain dict with a single 'total' entry that
    # matches the number of dumped documents.
    assert type(metadata) is dict
    assert len(metadata) == 1
    assert metadata['total'] == len(documents)
def test_generate_roc_plot():
    """Check that generate_roc_plot() writes a PNG for every classifier.

    Covers the binary (2-class) and multiclass (10-class) digit datasets.
    """
    plot_path = f'{generate_available_filename()}.png'
    for num_classes in [2, 10]:
        X_test, y_test = load_digits(n_class=num_classes, return_X_y=True)
        for make_clf in clfs:
            model = make_clf(n_jobs=1, class_weight=None)
            model.fit(X_test, y_test)
            # The plot file must only exist after generate_roc_plot() runs.
            assert not exists(plot_path)
            try:
                classifiers.generate_roc_plot(model, X_test, y_test, plot_path)
                assert exists(plot_path)
            finally:
                remove_and_check(plot_path)
def test_PickleDumpAppend_close():
    """Check that PickleDumpAppend.close() promotes the temp file.

    close() must leave the temporary file removed and the target file in
    place; the second loop iteration verifies this also works when the
    target already exists.
    """
    header = {'total': 0}
    target = generate_available_filename()
    try:
        for should_exist in [False, True]:
            assert exists(target) == should_exist
            writer = pickle_manager.PickleDumpAppend(header, target)
            # While open, data lives in a temporary file, not the target.
            assert not writer.file.closed
            assert exists(writer.file.name)
            writer.close()
            assert writer.file.closed
            assert not exists(writer.file.name)
    finally:
        remove_and_check(target)
def test___init__():
    """Check ContoPTParser: downloads/creates the synonyms file and parses it.

    The parsed synonyms must match what _load_synonyms() reads back from
    the same file, and a few known ContoPT entries are spot-checked.
    """
    filename = generate_available_filename()
    try:
        assert not exists(filename)
        # Constructing the parser is expected to create the synonyms file.
        parser = ContoPTParser(filename)
        assert exists(filename)
        synonyms = ContoPTParser._load_synonyms(filename)
    finally:
        remove_and_check(filename)
    assert type(parser.synonyms) is dict
    assert len(parser.synonyms) > 0
    assert parser.synonyms == synonyms
    # Spot-check known synonym mappings from the ContoPT data.
    assert synonyms['adjudicatário'] == 'adjudicante'
    assert synonyms['melancolia'] == 'misantropia'
    assert synonyms['tristeza'] == 'misantropia'
def test_PickleDumpAppend_dump_append():
    """Check that dump_append() writes metadata followed by each value.

    The resulting file must contain the metadata dict first, then each
    appended value in order, as consecutive pickle records.
    """
    count = 10
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        pda = pickle_manager.PickleDumpAppend(metadata, filename)
        for i in range(count):
            pda.dump_append(i)
        pda.close()
        # Use a context manager: the original closed `input_file` in the
        # finally clause, which raised NameError if any earlier statement
        # failed before open() was reached.
        with open(filename, 'rb') as input_file:
            assert load(input_file) == metadata
            for i in range(count):
                assert load(input_file) == i
    finally:
        remove_and_check(filename)
def test_main():
    """Check trainer.main() end to end in a scratch working directory.

    Covers: invalid input file (SystemExit), a full run producing the
    preprocessed data, predictions and report files, the 20newsgroups
    download path, and final_training mode (which must not emit
    predictions/report).
    """
    old_dir = os.getcwd()
    new_dir = utils.generate_available_filename()
    base_parameters = Parameters(utils.config_file)
    # Make the data file path relative so it lands inside the scratch dir.
    base_parameters.preprocessed_data_file = os.path.basename(
        base_parameters.preprocessed_data_file)
    try:
        os.makedirs(new_dir, exist_ok=False)
        os.chdir(new_dir)
        # Invalid input files must abort with SystemExit.
        parameters = deepcopy(base_parameters)
        parameters.excel_file = "invalid_excel_file"
        parameters.preprocessed_data_file = "invalid_data_file"
        with pytest.raises(SystemExit):
            trainer.main(parameters)
        # Normal run: all three output files must be produced.
        parameters = deepcopy(base_parameters)
        assert not os.path.exists(parameters.preprocessed_data_file)
        try:
            trainer.main(parameters)
            assert os.path.exists(parameters.preprocessed_data_file)
            assert os.path.exists("predictions.json")
            assert os.path.exists("report.xlsx")
        finally:
            utils.remove_and_check(parameters.preprocessed_data_file)
            utils.remove_and_check("predictions.json")
            utils.remove_and_check("report.xlsx")
        # 20newsgroups path: the Excel dump is created even though the run
        # exits early (preprocess_data is False and there is no data file).
        parameters.excel_file = os.path.abspath("20newsgroups")
        parameters.preprocess_data = False
        excel_file_20newsgroups = "20newsgroups.xlsx"
        assert not os.path.exists(excel_file_20newsgroups)
        try:
            trainer.main(parameters)
            pytest.fail()
        except SystemExit:
            assert os.path.exists(excel_file_20newsgroups)
        finally:
            utils.remove_and_check(excel_file_20newsgroups)
        # final_training mode must not produce predictions or a report.
        parameters = deepcopy(base_parameters)
        parameters.final_training = True
        try:
            trainer.main(parameters)
        finally:
            assert not os.path.exists("predictions.json")
            assert not os.path.exists("report.xlsx")
            utils.remove_and_check(parameters.preprocessed_data_file)
    finally:
        os.chdir(old_dir)
        rmtree(new_dir)
def test_get_documents(capsys):
    """Check get_documents() round trip with default and custom progress text."""
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        dump_documents(docs1, filename)
        # (description argument, expected start of the final progress line)
        for d1, d2 in [(None, '100%|'),
                       ('Loading documents', 'Loading documents: 100%|')]:
            docs2 = list(get_documents(filename, description=d1))
            for doc1, doc2 in zip_longest(docs1, docs2):
                assert repr(doc1) == repr(doc2)
            captured = capsys.readouterr()
            assert captured.out == ''
            # Progress goes to stderr; inspect text after the last '\r'.
            assert captured.err[captured.err.rfind('\r') + 1:].startswith(d2)
            assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
    finally:
        remove_and_check(filename)
def test_PickleDumpAppend___init__():
    """Check PickleDumpAppend argument validation and temp-file lifecycle."""
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        # Wrong argument types must be rejected with AssertionError.
        not_dict = 'test_str'
        not_str = -1
        params = [[not_dict, filename], [metadata, not_str]]
        for m, f in params:
            with pytest.raises(AssertionError):
                pda = pickle_manager.PickleDumpAppend(m, f)
        pda = pickle_manager.PickleDumpAppend(metadata, filename)
        assert pda.filename_upon_completion == filename
        # Data is written to a temporary file until close() renames it.
        assert exists(pda.file.name)
        pda.close()
        assert pickle_manager.load(filename) == metadata
        assert not exists(pda.file.name)
        assert exists(filename)
    finally:
        remove_and_check(filename)
def test_predictions_to_data_frame():
    """Check functions.predictions_to_data_frame() metric columns and values.

    The predictions dict is round-tripped through trainer.dump_json() first,
    then converted with acceptance thresholds 1 and 2 (with 2 accepted
    probabilities every prediction counts as correct, so all score columns
    become 1.0).
    """
    predictions_dict1 = {
        'y_true': ['0', '1', '0', '1', '0'],
        'RandomForestClassifier': [{'0': 1., '1': 0.}, {'0': 1., '1': 0.}, \
            {'0': 0., '1': 1.}, {'0': 0., '1': 1.}, {'0': 1., '1': 0.}],
        'LinearSVC': [{'0': 1., '1': 0.}, {'0': 0., '1': 1.}, \
            {'0': 1., '1': 0.}, {'0': 0., '1': 1.}, {'0': 0., '1': 1.}]
    }
    columns = [
        '%s %s %s' % (metric, clf, label)
        for metric in ['f1-score', 'precision', 'recall', 'support']
        for clf in ['LinearSVC', 'RandomForestClassifier']
        for label in [0, 1, 'macro avg', 'micro avg', 'weighted avg']
    ]
    data1 = [
        0.8, 0.8, 0.8, 0.8000000000000002, 0.8, 0.6666666666666666, 0.5,
        0.5833333333333333, 0.6, 0.6, 1.0, 0.6666666666666666,
        0.8333333333333333, 0.8, 0.8666666666666666, 0.6666666666666666, 0.5,
        0.5833333333333333, 0.6, 0.6, 0.6666666666666666, 1.0,
        0.8333333333333333, 0.8, 0.8, 0.6666666666666666, 0.5,
        0.5833333333333333, 0.6, 0.6, 3, 2, 5, 5, 5, 3, 2, 5, 5, 5
    ]
    # With 2 accepted probabilities all f1/precision/recall columns are 1.0;
    # the support columns (last 10 entries) are unchanged.
    data2 = data1.copy()
    data2[0:30] = [1.] * 30
    expected_df1 = pd.DataFrame(data=[data1], columns=columns)
    expected_df2 = pd.DataFrame(data=[data2], columns=columns)
    # Create the temp file before the try block so the finally clause cannot
    # raise NameError if create_temporary_file() fails; the with-statement
    # guarantees the handle is closed even when json.load() raises (the
    # original leaked it on that path).
    path = utils.create_temporary_file(content=None, text=True)
    try:
        trainer.dump_json(predictions_dict1, path)
        with open(path, 'r') as f:
            predictions_dict2 = json.load(f)
    finally:
        utils.remove_and_check(path)
    df1 = functions.predictions_to_data_frame(predictions_dict2, 1)
    df2 = functions.predictions_to_data_frame(predictions_dict2, 2)
    # The JSON round trip must be lossless.
    assert predictions_dict1 == predictions_dict2
    pd.util.testing.assert_frame_equal(df1, expected_df1)
    pd.util.testing.assert_frame_equal(df2, expected_df2)
def test_load_20newsgroups():
    """Check trainer.load_20newsgroups(): dataset download, Excel dump, caching.

    A second call with the Excel file already present must not rewrite it
    (mtime unchanged) and must return equivalent parameters.
    """
    p1 = Parameters(utils.config_file)
    p1.excel_file = '20newsgroups'
    excel_file = utils.generate_available_filename('.xlsx')
    try:
        p2 = trainer.load_20newsgroups(p1, excel_file)
        # The returned Parameters is a modified copy, not the input object.
        assert p1 is not p2
        assert p1 != p2
        assert p2.excel_column_with_text_data == 'data'
        assert p2.excel_column_with_classification_data == 'target'
        assert os.path.exists(excel_file)
        df = pd.read_excel(excel_file)
        assert df.shape == (18846, 3)
        assert list(df.keys()) == ['Unnamed: 0', 'data', 'target']
        # A second call must reuse the existing Excel file unchanged.
        expected_mtime = os.path.getmtime(excel_file)
        p3 = trainer.load_20newsgroups(p1, excel_file)
        assert os.path.getmtime(excel_file) == expected_mtime
        assert p3.__dict__ == p2.__dict__
    finally:
        # Scikit-learn's cached download of the dataset.
        utils.remove_and_check('20news-bydate_py3.pkz')
        utils.remove_and_check(excel_file)
def test_set_docs_metadata(capsys):
    """Check set_docs_metadata(): metadata is replaced, documents untouched."""
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        metadata1 = pickle_manager.get_docs_metadata(filename)
        metadata2 = metadata1.copy()
        metadata2['new_field'] = 'test_field_value'
        assert metadata1 != metadata2
        pickle_manager.set_docs_metadata(metadata2, filename)
        assert pickle_manager.get_docs_metadata(filename) == metadata2
        # Rewriting the metadata must preserve every stored document.
        docs2 = list(pickle_manager.get_documents(filename))
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
        captured = capsys.readouterr()
        assert captured.out == ''
        # The rewrite reports a byte-level (MB) progress bar on stderr.
        assert captured.err[captured.err.rfind('\r') + 1:] == \
            'Storing subsets: 0MB [00:00, ?MB/s]\n'
    finally:
        remove_and_check(filename)
def test_LatentDirichletAllocation():
    """Check FeatureExtractor.LatentDirichletAllocation() transform and caching.

    The first call must fit, persist the model and reduce X to 10 topics;
    the second call must reuse the saved model (file mtime unchanged) and
    reproduce the same output.
    """
    X = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    y = np.asarray([0, 1, 2])
    # Topic distributions produced by the seeded LDA model for X.
    expected_new_X = np.asarray([[
        0.01428573, 0.87142845, 0.01428573, 0.01428573, 0.01428573,
        0.01428573, 0.01428573, 0.01428573, 0.01428573, 0.01428573
    ], [
        0.00625001, 0.94374995, 0.00625001, 0.00625001, 0.00625001,
        0.00625001, 0.00625001, 0.00625001, 0.00625001, 0.00625001
    ], [
        0.00400000, 0.96399997, 0.00400000, 0.00400000, 0.00400000,
        0.00400000, 0.00400000, 0.00400000, 0.00400000, 0.00400000
    ]])
    filename = generate_available_filename()
    assert not exists(filename)
    try:
        new_X1, new_y1 = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        assert exists(filename)
        assert new_X1.shape == (X.shape[0], 10)
        assert np.allclose(expected_new_X, new_X1)
        # y must pass through unchanged.
        assert np.array_equal(y, new_y1)
        mtime = getmtime(filename)
        new_X2, new_y2 = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        # The persisted model must be reused, not refitted/rewritten.
        assert getmtime(filename) == mtime
        assert np.array_equal(new_X1, new_X2)
        assert np.array_equal(new_y1, new_y2)
    finally:
        remove_and_check(filename)
def test_get_documents():
    """Check get_documents() round trip and detection of trailing extra data.

    Appending an extra pickle record beyond the metadata count must make
    iteration fail with a specific error message.
    """
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        docs2 = list(pickle_manager.get_documents(filename))
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
        try:
            # Append one extra pickled record past the declared total.
            f = open(filename, 'ab')
            dump(obj=0, file=f, protocol=pickle_manager._pickle_protocol)
            f.close()
            docs2 = list(pickle_manager.get_documents(filename))
            for doc1, doc2 in zip_longest(docs1, docs2):
                assert repr(doc1) == repr(doc2)
            pytest.fail()
        except Exception as e:
            # Iterating the corrupted file must raise with this exact message.
            assert len(e.args) == 1
            assert e.args[
                0] == "The file '%s' has more documents than indicated in the metadata." % (
                    filename)
    finally:
        remove_and_check(filename)
def test_generate_X_y(capsys):
    """Check FeatureExtractor.generate_X_y() for every vectorizer/reduction combo.

    Verifies vectorizer-file persistence (only for non-embedding
    vectorizers, and only in training mode), that y passes through
    unchanged, the expected stdout/stderr output, and that an invalid
    feature_reduction value raises ValueError.
    """
    quantity = 2
    corpus = ['Test lemma 1 . ' * quantity, 'Test lemma 2 . ' * quantity]
    classifications = [1, 2]
    filename = generate_available_filename()
    # Flair's DocumentPoolEmbeddings prints these warnings to stdout.
    dpe_out = 'Please, ignore the message above indicating that the sentence is too long. The problem has been solved.\n' * 6
    # (vectorizer, feature reduction, expects vectorizer file, expected stdout)
    combinations = [
        ('CountVectorizer', None, True, ''),
        ('CountVectorizer', 'LDA', True, ''),
        ('CountVectorizer', 'MDS', True, ''),
        ('HashingVectorizer', None, True, ''),
        ('HashingVectorizer', 'MDS', True, ''),
        ('TfidfVectorizer', None, True, ''),
        ('TfidfVectorizer', 'LDA', True, ''),
        ('TfidfVectorizer', 'MDS', True, ''),
        ('DocumentPoolEmbeddings', None, False, dpe_out),
        ('DocumentPoolEmbeddings', 'MDS', False, dpe_out),
    ]
    for vectorizer, fr, expect_file, expected_out in combinations:
        try:
            ft = FeatureExtractor(vectorizer_name=vectorizer,
                                  feature_reduction=fr,
                                  vectorizer_file=filename)
            for training_mode in [True, False]:
                # The vectorizer file only exists after the training pass.
                assert exists(filename) is (not training_mode and expect_file)
                _X, y = ft.generate_X_y(corpus, classifications, training_mode)
                assert exists(filename) is expect_file
                assert y == classifications
                captured = capsys.readouterr()
                assert captured.out == expected_out
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Extracting features: 100%|')
                assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
        finally:
            if expect_file:
                remove_and_check(filename)
            if fr == 'LDA':
                remove_and_check('LatentDirichletAllocation.pkl')
    with pytest.raises(ValueError):
        FeatureExtractor(feature_reduction='invalid',
                         vectorizer_file=filename).generate_X_y(
                             corpus, classifications)
    remove_and_check(filename)
def test_Pipeline_start():
    """Check classifiers.Pipeline.start(): predictions, model files, ROC plots.

    Expected probabilities are listed for both Linux and Windows because a
    few classifiers produce slightly different floats per platform.
    """
    predict_probas_linux = {
        'RandomForestClassifier': [
            [1.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
        ],
        'BernoulliNB': [
            [1.0, 5.9253907982022474e-18, 9.24592247679012e-21],
            [5.086117678607322e-14, 0.9999999417850541, 5.821489476394197e-08],
        ],
        'MultinomialNB': [
            [1.0, 3.987155612430403e-87, 1.9843977254102716e-103],
            [1.1638109881136655e-141, 1.0, 4.902906597402722e-42],
        ],
        'ComplementNB': [
            [1.0, 1.244018908413837e-57, 2.372151728763692e-55],
            [1.2983800585685595e-35, 1.0, 3.836692075297123e-24],
        ],
        'KNeighborsClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'MLPClassifier': [
            [0.9999992330465266, 2.108350674827178e-08, 7.458699665987544e-07],
            [6.949799904570786e-10, 0.9999171940556058, 8.280524941418183e-05],
        ],
        'LinearSVC': [
            [0.8995782143576087, 0.02511044323694783, 0.07531134240544347],
            [0.03561932795252063, 0.9407083426933305, 0.023672329354149018],
        ],
        'DecisionTreeClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'ExtraTreeClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'DummyClassifier': [[0, 0, 1], [1, 0, 0]],
        'SGDClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'BaggingClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
    }
    # Only classifiers whose floats differ from Linux need Windows entries.
    predict_probas_windows = {
        'ComplementNB': [
            [1.0, 1.2440189084141198e-57, 2.3721517287642315e-55],
            [1.2983800585685595e-35, 1.0, 3.836692075297123e-24],
        ],
        'MLPClassifier': [
            [0.9999992330465266, 2.108350674827178e-08, 7.458699665987557e-07],
            [6.949799904570761e-10, 0.9999171940556058, 8.280524941418183e-05],
        ],
        'LinearSVC': [
            [0.8995692949536029, 0.025113499736912265, 0.07531720530948487],
            [0.0356197780956943, 0.9407082394988142, 0.02367198240549154],
        ],
    }
    p = classifiers.Pipeline(clfs)
    clfs_names = [f.__name__ for f in p.classifiers]
    clfs_files = ['%s.pkl' % (clf_name) for clf_name in clfs_names]
    roc_files = ['ROC_%s.png' % (clf_name) for clf_name in clfs_names]
    X, y = load_digits(n_class=3, return_X_y=True)
    y = y.tolist()
    assert all([not exists(clf_name) for clf_name in clfs_names])
    try:
        # NOTE: {1, 2, 2, 4} is a set literal, so the duplicate collapses
        # to {1, 2, 4}.
        predictions = p.start(X, y, X, y, -1, {1, 2, 2, 4})
        for clf_name, clf_file in zip_longest(clfs_names, clfs_files):
            # Compare only the first two predicted probability vectors.
            predict_proba = [list(d.values()) for d in predictions[clf_name][0:2]]
            assert np.array_equal(predict_probas_linux[clf_name], predict_proba) \
                or np.array_equal(predict_probas_windows[clf_name], predict_proba)
            assert exists(clf_file)
            clf = pickle_manager.load(clf_file)
            if 'n_jobs' in dir(clf):
                assert clf.n_jobs == -1
            if 'class_weights' in dir(clf):
                assert clf.class_weights is None
        assert all([not exists(roc_file) for roc_file in roc_files])
        # Re-run with balanced class weights.
        p.start(X, y, X, y, -1, {1, 2, 2, 4}, 'balanced')
        for clf_file in clfs_files:
            clf = pickle_manager.load(clf_file)
            if 'class_weights' in dir(clf):
                assert clf.class_weights == 'balanced'
        # Re-run with ROC plot generation enabled.
        p.start(X, y, X, y, -1, {1, 2, 2, 4}, None, True)
        assert all([exists(roc_file) for roc_file in roc_files])
        # A failing classifier must not break the pipeline.
        classifiers.Pipeline([FailClassifier]).start(X, y, X, y)
        # With no test set there is nothing to predict.
        predictions = p.start(X, y)
        assert predictions == {'y_true': []}
        with pytest.raises(AssertionError):
            p.start(X, y, X, [])
    finally:
        for clf_file in clfs_files:
            remove_and_check(clf_file)
        for roc_file in roc_files:
            remove_and_check(roc_file)
def test_predict(client):
    """Check the prediction server's POST '/' endpoint.

    Covers bad requests (missing body, invalid text/classifier values,
    path-traversal classifier names, missing model files) and a successful
    prediction for every trained classifier. Module-level server globals
    are reset in the finally clause so other tests are unaffected.
    """
    df = read_excel(utils.example_excel_file)
    docs = data_frame_to_document_list(df)
    prediction_server._text_field = 'Example column'
    prediction_server._class_field = 'Classification column'
    clfs_filenames = []
    try:
        vectorizer_path = utils.create_temporary_file(content=None, text=False)
        # Train a vectorizer and feature matrix for the example data.
        p = Preprocessor()
        p.preprocess(text_field=prediction_server._text_field,
                     preprocessed_data_file=None, docs=docs)
        ft = FeatureExtractor(training_mode=True, vectorizer_file=vectorizer_path)
        corpus, classifications, _, _ = ft.prepare(
            text_field=prediction_server._text_field,
            class_field=prediction_server._class_field,
            preprocessed_data_file=None, docs=docs, training_mode=True)
        X, y = ft.generate_X_y(corpus, classifications, training_mode=True)
        prediction_server._preprocessor = Preprocessor()
        prediction_server._feature_extractor = FeatureExtractor(
            training_mode=False, vectorizer_file=vectorizer_path)
        # Missing JSON body.
        res = client.post('/', headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        # Non-string text.
        res = client.post('/', json={
            'text': 1, 'classifier': 'LinearSVC'
        }, headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid text</p>\n')
        # Non-string classifier.
        res = client.post('/', json={
            'text': 'Test text.', 'classifier': 1
        }, headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        # Path traversal in the classifier name must be rejected.
        res = client.post('/', json={
            'text': 'Test text.', 'classifier': '../LinearSVC'
        }, headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        # Valid name but no trained model file on disk.
        res = client.post('/', json={
            'text': 'Test text.', 'classifier': 'LinearSVC'
        }, headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier model</p>\n')
        # Train, dump and query every classifier in turn.
        for f in clfs:
            clf_filename_base = utils.generate_available_filename()
            clf_filename = '%s.pkl' % (clf_filename_base)
            clfs_filenames.append(clf_filename)
            clf = f(n_jobs=1, class_weight=None)
            clf.fit(X, y)
            dump(clf, clf_filename)
            res = client.post('/', json={
                'text': 'Test text.', 'classifier': clf_filename_base
            }, headers=valid_headers)
            assert res.status_code == 200
            assert repr(prediction_server._classifiers[clf_filename_base]) == repr(clf)
            # Only the response structure is checked: all leaf values are
            # replaced by 0 before comparing, since probabilities and
            # feature weights vary by classifier.
            assert replace_final_dict_values(res.json, value=0) in [{
                'feature_weights': {
                    'I': {}, 'II': {}, 'III': {}
                },
                'probabilities': {
                    'I': 0, 'II': 0, 'III': 0
                }
            }, {
                'feature_weights': {},
                'probabilities': {
                    'I': 0, 'II': 0, 'III': 0
                }
            }]
    finally:
        utils.remove_and_check(vectorizer_path)
        for clf_filename in clfs_filenames:
            utils.remove_and_check(clf_filename)
        # Reset the server's module-level state for subsequent tests.
        prediction_server._text_field = None
        prediction_server._class_field = None
        prediction_server._preprocessor = None
        prediction_server._feature_extractor = None
        prediction_server._feature_weights = dict()
        prediction_server._classifiers = dict()
        prediction_server._old_handlers = dict()
def test_train_test_split():
    """Check train_test_split(): metadata creation, reuse and recomputation.

    The split metadata must be recomputed when any split field is missing,
    and an existing (different) split must be kept when force is False.
    """
    text_field = 'Example column'
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    preprocessor = Preprocessor()
    preprocessor.preprocess(text_field, None, docs)
    ft = FeatureExtractor()
    corpus, classifications, _, _ = ft.prepare(
        text_field=text_field,
        class_field='Classification column',
        preprocessed_data_file=None,
        docs=docs,
        training_mode=False)
    test_size = 0.3
    preprocessed_data_file = generate_available_filename()
    force = False
    idxs_to_remove = [5]
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        assert pickle_manager.get_docs_metadata(preprocessed_data_file) == {
            'total': 10
        }
        # Expected split metadata for the seeded splitter (index 5 removed).
        desired = {
            'total': 10,
            'test_size': test_size,
            'training_set_indexes': np.array([6, 1, 0, 2, 8, 3]),
            'test_set_indexes': np.array([7, 9, 4])
        }
        # Same result whether forcing a new split or not.
        for my_force in [False, True]:
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file,
                                              my_force, idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        # Any missing split field must trigger recomputation of the split.
        for key in ['test_size', 'training_set_indexes', 'test_set_indexes']:
            m = desired.copy()
            m[key] = None
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        # With force=False an existing, different split must be preserved.
        for key, value in [('test_size', 0.2),
                           ('training_set_indexes', np.array([1, 0, 2, 8, 3]))]:
            m = desired.copy()
            m[key] = value
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file), m)
    finally:
        remove_and_check(preprocessed_data_file)
        pass
def test___init__():
    """Check FeatureExtractor.__init__() defaults and every configuration knob.

    Covers: default attributes, NLTK stop words, vectorizer loading per
    training mode, embeddings vectorizers, invalid vectorizer names,
    adjective removal, synonyms files (valid and invalid) and n_jobs.
    """
    ft1 = FeatureExtractor()
    # Constructing the extractor must have made the stopwords corpus available.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        pytest.fail()
    assert ft1.stop_words == set()
    assert ft1.vectorizer_file == 'vectorizer.pkl'
    assert type(ft1.vectorizer) is feature_extraction.text.TfidfVectorizer
    assert ft1.feature_reduction is None
    assert 'initial_code_to_run_on_document' in dir(ft1.document_adjustment_code)
    assert ft1.upostags_to_ignore == ['PUNCT']
    assert ft1.synonyms is None
    assert ft1.n_jobs == 1
    ft2 = FeatureExtractor(nltk_stop_words_package='english')
    assert ft2.stop_words == set(nltk.corpus.stopwords.words('english'))
    for vectorizer_name in [
            'CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer'
    ]:
        # In non-training mode the vectorizer file must already exist.
        with pytest.raises(FileNotFoundError):
            FeatureExtractor(vectorizer_name=vectorizer_name,
                             training_mode=False,
                             vectorizer_file=generate_available_filename())
        try:
            # NOTE(review): `path` is assigned inside the try block — if
            # create_temporary_file() raised, the finally clause would hit a
            # NameError; consider moving the assignment before the try.
            path = create_temporary_file(content=None, text=False)
            pickle_manager.dump(
                FeatureExtractor(vectorizer_name=vectorizer_name).vectorizer,
                path)
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=False,
                                  vectorizer_file=path)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
        finally:
            remove_and_check(path)
        ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                              training_mode=True)
        assert ft.vectorizer.__class__.__name__ == vectorizer_name
    # Embedding vectorizers never read or write the vectorizer file.
    for vectorizer_name in ['DocumentPoolEmbeddings']:
        for training_mode in [True, False]:
            vectorizer_file = generate_available_filename()
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=training_mode,
                                  vectorizer_file=vectorizer_file)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
            assert not exists(vectorizer_file)
    with pytest.raises(ValueError):
        FeatureExtractor(vectorizer_name='invalid_vectorizer',
                         training_mode=True)
    ft3 = FeatureExtractor(remove_adjectives=True)
    assert ft3.upostags_to_ignore == ['PUNCT', 'ADJ']
    synonyms_file = 'contopt_0.1_r2_c0.0.txt'
    filename = generate_available_filename()
    try:
        # The extractor's synonyms must match an independent parse of the file.
        ft4 = FeatureExtractor(synonyms_file=synonyms_file)
        contoPTParser = ContoPTParser(filename)
        assert ft4.synonyms == contoPTParser.synonyms
    finally:
        remove_and_check(synonyms_file)
        remove_and_check(filename)
    with pytest.raises(ValueError):
        FeatureExtractor(synonyms_file='invalid_file.txt')
    ft5 = FeatureExtractor(n_jobs=2)
    assert ft5.n_jobs == 2
def test_preprocess(capsys):
    """Check Preprocessor.preprocess() with and without spell checking.

    Without a spell checker the misspelled tokens pass through; with the
    en_US checker 'Teste'/'tikens' are corrected to 'Test'/'tokens'. Also
    checks the stop flag (SystemExit) and the store_data path that writes
    results back into the pickled documents file.
    """
    text_field = 'Test field'
    index = -1
    # Mixed \r\n / \r / \n line breaks exercise line-ending normalization.
    fields = {text_field: 'Teste\r\nvalue with\ra\nfew tikens. ' * 2}
    # Expected analysis without spell checking (misspellings preserved).
    analyzed_sentences1 = {
        text_field: [[{
            'form': 'Teste', 'lemma': 'teste', 'upostag': None
        }, {
            'form': 'value', 'lemma': 'value', 'upostag': None
        }, {
            'form': 'with', 'lemma': 'with', 'upostag': None
        }, {
            'form': 'a', 'lemma': 'a', 'upostag': None
        }, {
            'form': 'few', 'lemma': 'few', 'upostag': None
        }, {
            'form': 'tikens', 'lemma': 'tikens', 'upostag': None
        }, {
            'form': '.', 'lemma': '.', 'upostag': 'PUNCT'
        }]] * 2
    }
    # Expected analysis with the en_US spell checker (misspellings corrected).
    analyzed_sentences2 = {
        text_field: [[{
            'form': 'Test', 'lemma': 'test', 'upostag': None
        }, {
            'form': 'value', 'lemma': 'value', 'upostag': None
        }, {
            'form': 'with', 'lemma': 'with', 'upostag': None
        }, {
            'form': 'a', 'lemma': 'a', 'upostag': None
        }, {
            'form': 'few', 'lemma': 'few', 'upostag': None
        }, {
            'form': 'tokens', 'lemma': 'token', 'upostag': None
        }, {
            'form': '.', 'lemma': '.', 'upostag': 'PUNCT'
        }]] * 2
    }
    for spell_checker_lang, analyzed_sentences in [(None, analyzed_sentences1),
                                                   ('en_US', analyzed_sentences2)]:
        doc = Document(index=index, fields=fields, analyzed_sentences=dict())
        p = Preprocessor(spell_checker_lang=spell_checker_lang)
        assert p.stop is False
        p.preprocess(text_field=text_field, preprocessed_data_file=None,
                     docs=[doc] * 2)
        assert p.stop is False
        assert doc.index == index
        assert doc.fields == fields
        assert doc.analyzed_sentences == analyzed_sentences
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') + 1:].startswith('Preprocessing: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
        # Setting the stop flag must abort preprocessing with SystemExit.
        p.stop = True
        with pytest.raises(SystemExit):
            p.preprocess(text_field=text_field, preprocessed_data_file=None,
                         docs=[doc] * 2)
        del (p)
        if spell_checker_lang is not None:
            # Remove the dictionaries downloaded by the spell checker.
            rmtree('./hunspell')
    # store_data=True: results are read from and written back to the file.
    docs = [
        Document(index=index, fields=fields, analyzed_sentences=dict())
        for index in range(2)
    ]
    preprocessed_data_file = utils.generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        pickle_manager.check_data(preprocessed_data_file)
        p = Preprocessor(store_data=True)
        assert all([
            doc.analyzed_sentences == dict()
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        p.preprocess(text_field, preprocessed_data_file, None)
        assert all([
            doc.analyzed_sentences == analyzed_sentences1
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        pickle_manager.check_data(preprocessed_data_file)
    finally:
        utils.remove_and_check(preprocessed_data_file)