def test_generate_report():
    """generate_report appends one row per call and mirrors it to Excel.

    Calls generate_report twice against the same Excel file (first call
    creates it, second appends) and checks the returned DataFrame against an
    expected row built from the execution info, parameters and predictions.
    """
    execution_info = pd.DataFrame.from_dict({
        'Start': [functions.get_local_time_str()],
        'End': [functions.get_local_time_str()],
    })
    parameters_dict = Parameters(utils.config_file).__dict__
    predictions_dict = {
        'y_true': ['label1'],
        'classifier_key': [{'label1': 0.0, 'label2': 1.0}],
    }
    # Build the expected row with a scalar value first, then restore the set
    # form that generate_report itself expects.
    parameters_dict['set_num_accepted_probs'] = 1
    expected_df_row0 = pd.concat([
        execution_info,
        functions.parameters_to_data_frame(parameters_dict),
        functions.predictions_to_data_frame(predictions_dict, 1),
    ], axis=1)
    parameters_dict['set_num_accepted_probs'] = {1}
    excel_file1 = utils.generate_available_filename(ext='.xlsx')
    excel_file2 = utils.generate_available_filename(ext='.xlsx')
    expected_df = pd.DataFrame()
    try:
        for i, file_exists in enumerate([False, True]):
            # First iteration creates the report file, second appends to it.
            assert exists(excel_file1) is file_exists
            df = functions.generate_report(execution_info, parameters_dict,
                                           predictions_dict, excel_file1)
            df.to_excel(excel_file2, index=False)
            assert df.shape == (i + 1, 44)
            expected_df = pd.concat([expected_df, expected_df_row0])
            # pd.testing is the public, supported API; pd.util.testing was
            # deprecated and removed in pandas 2.0.
            pd.testing.assert_frame_equal(df, expected_df)
            pd.testing.assert_frame_equal(pd.read_excel(excel_file1),
                                          pd.read_excel(excel_file2))
    finally:
        utils.remove_and_check(excel_file1)
        utils.remove_and_check(excel_file2)
def test_dump_documents(capsys):
    """dump_documents refuses existing paths and round-trips documents.

    Also verifies that the only console output is the tqdm progress bar
    written to stderr.
    """
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    # Create the temporary file BEFORE the try block: if creation failed
    # inside it, the finally clause would raise NameError on the unbound
    # 'path', masking the real error.
    path = create_temporary_file(content=None, text=False)
    try:
        # Dumping onto an already existing file must be refused.
        with pytest.raises(Exception):
            pickle_manager.dump_documents(docs1, path)
    finally:
        remove_and_check(path)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
        docs2 = list(pickle_manager.get_documents(filename))
        assert len(metadata) == 1
        assert metadata['total'] == len(docs1)
        # zip_longest turns a length mismatch into a failing repr comparison.
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
    finally:
        remove_and_check(filename)
    captured = capsys.readouterr()
    assert captured.out == ''
    assert captured.err[captured.err.rfind('\r') + 1:].startswith('Storing documents: 100%|')
    assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
        's/doc]\n')
def test_prepare(capsys):
    """FeatureExtractor.prepare builds the corpus from docs or a pickled file.

    Checks that preparing from an in-memory document list and from a dumped
    data file produce identical results, with and without a synonyms file
    (ContoPT maps 'teste' to 'prova'), in both training and prediction mode.
    A document without analyzed sentences must be flagged for removal.
    """
    text_field = 'text field'
    class_field = 'class field'
    quantity = 2
    fields = {text_field: 'Teste value.', class_field: 'c1'}
    analyzed_sentences = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * quantity
    }
    docs1 = [
        Document(index=0, fields=fields, analyzed_sentences=analyzed_sentences),
        Document(index=1, fields=fields, analyzed_sentences=None),
    ]
    synonyms_files = [None, 'contopt_0.1_r2_c0.0.txt']
    expected_corpus_str = [[' '.join(['teste value'] * quantity), ''],
                           [' '.join(['prova value'] * quantity), '']]
    expected_classifications = [[fields[class_field]] * quantity
                                ] * len(synonyms_files)
    # Document index 1 has analyzed_sentences=None, so it is removed.
    expected_idxs_to_remove = [[1]] * len(synonyms_files)
    expected_corpus = [[['teste', 'value'] * quantity, []],
                       [['prova', 'value'] * quantity, []]]
    # Assign the filename BEFORE the try block so the finally clause cannot
    # fail with NameError if filename generation raises (consistent with the
    # other tests in this module).
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        for i, synonyms_file in enumerate(synonyms_files):
            ft = FeatureExtractor(synonyms_file=synonyms_file)
            for training_mode in [True, False]:
                corpus_str1, classifications1, idxs_to_remove1, corpus1 = ft.prepare(
                    text_field, class_field, None, docs1, training_mode)
                corpus_str2, classifications2, idxs_to_remove2, corpus2 = ft.prepare(
                    text_field, class_field, filename, None, training_mode)
                # Both input channels must be equivalent.
                assert (corpus_str1, classifications1, idxs_to_remove1,
                        corpus1) == (corpus_str2, classifications2,
                                     idxs_to_remove2, corpus2)
                assert corpus_str1 == expected_corpus_str[i]
                assert classifications1 == expected_classifications[i]
                assert idxs_to_remove1 == expected_idxs_to_remove[i]
                assert corpus1 == expected_corpus[i]
                captured = capsys.readouterr()
                assert captured.out == ''
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Preparing to create classification: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
            if synonyms_file is not None:
                # The synonyms file was downloaded by the FeatureExtractor.
                remove_and_check(synonyms_file)
    finally:
        remove_and_check(filename)
def test_check_data(capsys):
    """check_data accepts a valid dump and raises AssertionError on bad ones."""
    documents = data_frame_to_document_list(read_excel(example_excel_file))
    data_file = generate_available_filename()
    try:
        # A freshly dumped document list passes the check; the only console
        # output is the tqdm progress bar on stderr.
        pickle_manager.dump_documents(documents, data_file)
        pickle_manager.check_data(data_file)
        captured = capsys.readouterr()
        assert captured.out == ''
        final_line = captured.err[captured.err.rfind('\r') + 1:]
        assert final_line.startswith('Checking data: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
        # Overwrite the file with entries that are not Document instances:
        # the check must fail.
        total = 10
        writer = pickle_manager.PickleDumpAppend({'total': total}, data_file)
        for bogus_entry in range(total):
            writer.dump_append(bogus_entry)
        writer.close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(data_file)
        # Overwrite with a metadata count that cannot match the contents.
        pickle_manager.PickleDumpAppend({'total': -1}, data_file).close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(data_file)
    finally:
        remove_and_check(data_file)
def test_dump_json():
    """dump_json writes a dict that json.load reads back unchanged."""
    d1 = {'test_random_values': [np.random.random()]}
    filename = utils.generate_available_filename()
    try:
        trainer.dump_json(d1, filename)
        # 'with' replaces the manual open/close pair: the original code
        # called f.close() in the finally clause, which raised NameError
        # (masking the real failure) whenever dump_json itself raised
        # before 'f' was ever bound.
        with open(filename, 'r') as f:
            d2 = json.load(f)
    finally:
        utils.remove_and_check(filename)
    assert d1 == d2
def test_get_docs_metadata():
    """get_docs_metadata returns exactly what dump_documents stored."""
    documents = data_frame_to_document_list(read_excel(example_excel_file))
    data_file = generate_available_filename()
    try:
        pickle_manager.dump_documents(documents, data_file)
        metadata = pickle_manager.get_docs_metadata(data_file)
    finally:
        remove_and_check(data_file)
    # Metadata is a one-entry dict holding only the document count.
    assert type(metadata) is dict
    assert len(metadata) == 1
    assert metadata['total'] == len(documents)
def test_generate_roc_plot():
    """generate_roc_plot writes a PNG for binary and multi-class problems."""
    plot_path = '%s.png' % (generate_available_filename())
    for num_classes in [2, 10]:
        features, labels = load_digits(n_class=num_classes, return_X_y=True)
        for build_classifier in clfs:
            fitted = build_classifier(n_jobs=1, class_weight=None)
            fitted.fit(features, labels)
            # The plot file must be created by the call itself.
            assert not exists(plot_path)
            try:
                classifiers.generate_roc_plot(fitted, features, labels,
                                              plot_path)
                assert exists(plot_path)
            finally:
                remove_and_check(plot_path)
def test_PickleDumpAppend_close():
    """close() moves the temporary file onto the target and closes the handle."""
    meta = {'total': 0}
    target = generate_available_filename()
    try:
        # Second pass: the target already exists and gets overwritten.
        for target_should_exist in [False, True]:
            assert exists(target) == target_should_exist
            writer = pickle_manager.PickleDumpAppend(meta, target)
            assert not writer.file.closed
            assert exists(writer.file.name)
            writer.close()
            assert writer.file.closed
            # The temporary file disappears once its content reaches the target.
            assert not exists(writer.file.name)
    finally:
        remove_and_check(target)
def test___init__():
    """ContoPTParser creates the synonyms file and loads it into a dict."""
    synonyms_path = generate_available_filename()
    try:
        assert not exists(synonyms_path)
        parser = ContoPTParser(synonyms_path)
        # The constructor must have materialized the file on disk.
        assert exists(synonyms_path)
        synonyms = ContoPTParser._load_synonyms(synonyms_path)
    finally:
        remove_and_check(synonyms_path)
    assert type(parser.synonyms) is dict
    assert len(parser.synonyms) > 0
    # The static loader and the instance attribute must agree.
    assert parser.synonyms == synonyms
    # Spot-check known ContoPT synonym mappings.
    assert synonyms['adjudicatário'] == 'adjudicante'
    assert synonyms['melancolia'] == 'misantropia'
    assert synonyms['tristeza'] == 'misantropia'
def test_main():
    """End-to-end test of trainer.main inside a scratch working directory.

    Exercises: exit on invalid input files, a full run producing the
    preprocessed data file plus predictions/report, the 20newsgroups
    download-and-exit path, and final_training mode (which must not emit
    predictions or a report).
    """
    old_dir = os.getcwd()
    new_dir = utils.generate_available_filename()
    base_parameters = Parameters(utils.config_file)
    # Keep only the basename so the data file lands in the scratch directory.
    base_parameters.preprocessed_data_file = os.path.basename(base_parameters.preprocessed_data_file)
    try:
        os.makedirs(new_dir, exist_ok=False)
        os.chdir(new_dir)
        # Invalid excel/data files must abort with SystemExit.
        parameters = deepcopy(base_parameters)
        parameters.excel_file = "invalid_excel_file"
        parameters.preprocessed_data_file = "invalid_data_file"
        with pytest.raises(SystemExit):
            trainer.main(parameters)
        # A normal run creates the data file, predictions and report.
        parameters = deepcopy(base_parameters)
        assert not os.path.exists(parameters.preprocessed_data_file)
        try:
            trainer.main(parameters)
            assert os.path.exists(parameters.preprocessed_data_file)
            assert os.path.exists("predictions.json")
            assert os.path.exists("report.xlsx")
        finally:
            utils.remove_and_check(parameters.preprocessed_data_file)
            utils.remove_and_check("predictions.json")
            utils.remove_and_check("report.xlsx")
        # The '20newsgroups' magic name downloads the dataset to an Excel
        # file and then exits (preprocess_data=False stops the run there).
        parameters.excel_file = os.path.abspath("20newsgroups")
        parameters.preprocess_data = False
        excel_file_20newsgroups = "20newsgroups.xlsx"
        assert not os.path.exists(excel_file_20newsgroups)
        try:
            trainer.main(parameters)
            pytest.fail()
        except SystemExit:
            assert os.path.exists(excel_file_20newsgroups)
        finally:
            utils.remove_and_check(excel_file_20newsgroups)
        # final_training must not produce predictions or a report.
        parameters = deepcopy(base_parameters)
        parameters.final_training = True
        try:
            trainer.main(parameters)
        finally:
            assert not os.path.exists("predictions.json")
            assert not os.path.exists("report.xlsx")
            utils.remove_and_check(parameters.preprocessed_data_file)
    finally:
        # Always restore the CWD and delete the scratch directory.
        os.chdir(old_dir)
        rmtree(new_dir)
def test_PickleDumpAppend_dump_append():
    """dump_append pickles values after the metadata, in insertion order."""
    count = 10
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        pda = pickle_manager.PickleDumpAppend(metadata, filename)
        for i in range(count):
            pda.dump_append(i)
        pda.close()
        # 'with' replaces the manual open/close pair: the original closed
        # 'input_file' in the finally clause, which raised NameError when
        # any earlier statement failed before 'input_file' was bound.
        with open(filename, 'rb') as input_file:
            # The metadata dict is the first pickled object in the file.
            assert load(input_file) == metadata
            for i in range(count):
                data = load(input_file)
                assert data == i
    finally:
        remove_and_check(filename)
def test_get_documents(capsys):
    """get_documents yields stored documents and labels its progress bar."""
    originals = data_frame_to_document_list(read_excel(example_excel_file))
    data_file = generate_available_filename()
    try:
        dump_documents(originals, data_file)
        cases = [(None, '100%|'),
                 ('Loading documents', 'Loading documents: 100%|')]
        for description, expected_prefix in cases:
            loaded = list(get_documents(data_file, description=description))
            # zip_longest turns a length mismatch into a repr difference.
            for original, restored in zip_longest(originals, loaded):
                assert repr(original) == repr(restored)
            captured = capsys.readouterr()
            assert captured.out == ''
            final_line = captured.err[captured.err.rfind('\r') + 1:]
            assert final_line.startswith(expected_prefix)
            assert captured.err.endswith('doc/s]\n') or captured.err.endswith('s/doc]\n')
    finally:
        remove_and_check(data_file)
def test_generate_X_y(capsys):
    """generate_X_y vectorizes the corpus under every supported configuration.

    Each combination lists: vectorizer name, feature-reduction method,
    whether a vectorizer file is expected on disk after training, and the
    expected stdout (DocumentPoolEmbeddings prints an informational message).
    """
    quantity = 2
    corpus = ['Test lemma 1 . ' * quantity, 'Test lemma 2 . ' * quantity]
    classifications = [1, 2]
    filename = generate_available_filename()
    dpe_out = 'Please, ignore the message above indicating that the sentence is too long. The problem has been solved.\n' * 6
    combinations = [
        ('CountVectorizer', None, True, ''),
        ('CountVectorizer', 'LDA', True, ''),
        ('CountVectorizer', 'MDS', True, ''),
        ('HashingVectorizer', None, True, ''),
        ('HashingVectorizer', 'MDS', True, ''),
        ('TfidfVectorizer', None, True, ''),
        ('TfidfVectorizer', 'LDA', True, ''),
        ('TfidfVectorizer', 'MDS', True, ''),
        ('DocumentPoolEmbeddings', None, False, dpe_out),
        ('DocumentPoolEmbeddings', 'MDS', False, dpe_out),
    ]
    for vectorizer, fr, expect_file, expected_out in combinations:
        try:
            ft = FeatureExtractor(vectorizer_name=vectorizer,
                                  feature_reduction=fr,
                                  vectorizer_file=filename)
            for training_mode in [True, False]:
                # The vectorizer file only exists before the call when a
                # previous training iteration stored it.
                assert exists(filename) is (not training_mode and expect_file)
                _X, y = ft.generate_X_y(corpus, classifications, training_mode)
                assert exists(filename) is expect_file
                assert y == classifications
                captured = capsys.readouterr()
                assert captured.out == expected_out
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Extracting features: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
        finally:
            if expect_file:
                remove_and_check(filename)
            if fr == 'LDA':
                # LDA persists its own model file as a side effect.
                remove_and_check('LatentDirichletAllocation.pkl')
    # An unknown feature-reduction method must be rejected.
    with pytest.raises(ValueError):
        FeatureExtractor(feature_reduction='invalid',
                         vectorizer_file=filename).generate_X_y(
                             corpus, classifications)
    remove_and_check(filename)
def test_PickleDumpAppend___init__():
    """The constructor type-checks its arguments and writes metadata first."""
    meta = {'total': 0}
    target = generate_available_filename()
    try:
        # Metadata must be a dict and the filename must be a str; anything
        # else trips the constructor's assertions.
        bad_argument_pairs = [['test_str', target], [meta, -1]]
        for bad_meta, bad_name in bad_argument_pairs:
            with pytest.raises(AssertionError):
                writer = pickle_manager.PickleDumpAppend(bad_meta, bad_name)
        writer = pickle_manager.PickleDumpAppend(meta, target)
        assert writer.filename_upon_completion == target
        assert exists(writer.file.name)
        writer.close()
        # After close(): metadata is the first pickled object in the target,
        # the temporary file is gone, and the target exists.
        assert pickle_manager.load(target) == meta
        assert not exists(writer.file.name)
        assert exists(target)
    finally:
        remove_and_check(target)
def test_get_dict():
    """get_dict downloads Hunspell dictionaries once and reuses cached copies."""
    # Exercise both a pre-existing directory and one get_dict must create.
    for target_dir in [mkdtemp(), generate_available_filename()]:
        try:
            for language in ['en_US', 'pt_PT']:
                aff_path = os.path.join(target_dir, '%s.aff' % (language))
                dic_path = os.path.join(target_dir, '%s.dic' % (language))
                assert not os.path.exists(aff_path)
                assert not os.path.exists(dic_path)
                SpellChecker.get_dict(language, target_dir)
                assert os.path.exists(aff_path)
                assert os.path.exists(dic_path)
                # A second call must not re-download: mtimes stay unchanged.
                aff_modified = os.path.getmtime(aff_path)
                dic_modified = os.path.getmtime(dic_path)
                SpellChecker.get_dict(language, target_dir)
                assert aff_modified == os.path.getmtime(aff_path)
                assert dic_modified == os.path.getmtime(dic_path)
            # Unknown locales surface the HTTP error from the download.
            with pytest.raises(requests.HTTPError):
                SpellChecker.get_dict('pt_NotExists', target_dir)
        finally:
            rmtree(target_dir)
def test_set_docs_metadata(capsys):
    """set_docs_metadata replaces the metadata while keeping every document."""
    originals = data_frame_to_document_list(read_excel(example_excel_file))
    data_file = generate_available_filename()
    try:
        pickle_manager.dump_documents(originals, data_file)
        stored_metadata = pickle_manager.get_docs_metadata(data_file)
        updated_metadata = stored_metadata.copy()
        updated_metadata['new_field'] = 'test_field_value'
        assert stored_metadata != updated_metadata
        pickle_manager.set_docs_metadata(updated_metadata, data_file)
        assert pickle_manager.get_docs_metadata(data_file) == updated_metadata
        # The documents themselves must be untouched by the metadata rewrite.
        reloaded = list(pickle_manager.get_documents(data_file))
        for original, restored in zip_longest(originals, reloaded):
            assert repr(original) == repr(restored)
        captured = capsys.readouterr()
        assert captured.out == ''
        final_line = captured.err[captured.err.rfind('\r') + 1:]
        assert final_line == 'Storing subsets: 0MB [00:00, ?MB/s]\n'
    finally:
        remove_and_check(data_file)
def test_load_20newsgroups():
    """load_20newsgroups materializes the dataset as Excel exactly once."""
    base_params = Parameters(utils.config_file)
    base_params.excel_file = '20newsgroups'
    excel_path = utils.generate_available_filename('.xlsx')
    try:
        loaded_params = trainer.load_20newsgroups(base_params, excel_path)
        # A fresh Parameters object is returned; the input is not mutated.
        assert base_params is not loaded_params
        assert base_params != loaded_params
        assert loaded_params.excel_column_with_text_data == 'data'
        assert loaded_params.excel_column_with_classification_data == 'target'
        assert os.path.exists(excel_path)
        frame = pd.read_excel(excel_path)
        assert frame.shape == (18846, 3)
        assert list(frame.keys()) == ['Unnamed: 0', 'data', 'target']
        # A second call must reuse the spreadsheet instead of rebuilding it.
        modification_time = os.path.getmtime(excel_path)
        reloaded_params = trainer.load_20newsgroups(base_params, excel_path)
        assert os.path.getmtime(excel_path) == modification_time
        assert reloaded_params.__dict__ == loaded_params.__dict__
    finally:
        # The scikit-learn download cache is also cleaned up.
        utils.remove_and_check('20news-bydate_py3.pkz')
        utils.remove_and_check(excel_path)
def test_LatentDirichletAllocation():
    """LatentDirichletAllocation maps X to 10 topics and caches the model.

    The second call must load the pickled model instead of refitting (file
    mtime unchanged) and reproduce the first call's output exactly.
    """
    X = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    y = np.asarray([0, 1, 2])
    # Expected per-document topic distributions. The exact values imply a
    # deterministic fit — presumably a fixed random_state inside the
    # implementation; TODO confirm.
    expected_new_X = np.asarray([[
        0.01428573, 0.87142845, 0.01428573, 0.01428573, 0.01428573,
        0.01428573, 0.01428573, 0.01428573, 0.01428573, 0.01428573
    ], [
        0.00625001, 0.94374995, 0.00625001, 0.00625001, 0.00625001,
        0.00625001, 0.00625001, 0.00625001, 0.00625001, 0.00625001
    ], [
        0.00400000, 0.96399997, 0.00400000, 0.00400000, 0.00400000,
        0.00400000, 0.00400000, 0.00400000, 0.00400000, 0.00400000
    ]])
    filename = generate_available_filename()
    assert not exists(filename)
    try:
        new_X1, new_y1 = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        # First call stores the fitted model on disk.
        assert exists(filename)
        assert new_X1.shape == (X.shape[0], 10)
        assert np.allclose(expected_new_X, new_X1)
        assert np.array_equal(y, new_y1)
        mtime = getmtime(filename)
        new_X2, new_y2 = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        # Second call reloads the model: no rewrite, identical output.
        assert getmtime(filename) == mtime
        assert np.array_equal(new_X1, new_X2)
        assert np.array_equal(new_y1, new_y2)
    finally:
        remove_and_check(filename)
def test_get_documents():
    """get_documents round-trips documents and detects trailing extra data."""
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        docs2 = list(pickle_manager.get_documents(filename))
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
        try:
            # Append one extra pickled object beyond the metadata count.
            # 'with' guarantees the handle is closed even if dump raises
            # (the original left it open in that case).
            with open(filename, 'ab') as f:
                dump(obj=0, file=f, protocol=pickle_manager._pickle_protocol)
            docs2 = list(pickle_manager.get_documents(filename))
            for doc1, doc2 in zip_longest(docs1, docs2):
                assert repr(doc1) == repr(doc2)
            pytest.fail()
        except Exception as e:
            # get_documents must complain about the surplus document.
            assert len(e.args) == 1
            assert e.args[
                0] == "The file '%s' has more documents than indicated in the metadata." % (
                    filename)
    finally:
        remove_and_check(filename)
def test_predict(client):
    """POST '/' validates its payload and serves predictions from models.

    Trains one classifier per factory in ``clfs``, pickles it, and checks the
    rejection paths (missing body, non-string text/classifier, path
    traversal, missing model file) as well as the success path. The
    prediction_server module-level state is restored afterwards.
    """
    df = read_excel(utils.example_excel_file)
    docs = data_frame_to_document_list(df)
    prediction_server._text_field = 'Example column'
    prediction_server._class_field = 'Classification column'
    clfs_filenames = []
    # Create the vectorizer file BEFORE the try block: if creation failed
    # inside it, the finally clause would raise NameError on the unbound
    # 'vectorizer_path', masking the real error.
    vectorizer_path = utils.create_temporary_file(content=None, text=False)
    try:
        p = Preprocessor()
        p.preprocess(text_field=prediction_server._text_field,
                     preprocessed_data_file=None,
                     docs=docs)
        ft = FeatureExtractor(training_mode=True,
                              vectorizer_file=vectorizer_path)
        corpus, classifications, _, _ = ft.prepare(
            text_field=prediction_server._text_field,
            class_field=prediction_server._class_field,
            preprocessed_data_file=None,
            docs=docs,
            training_mode=True)
        X, y = ft.generate_X_y(corpus, classifications, training_mode=True)
        prediction_server._preprocessor = Preprocessor()
        prediction_server._feature_extractor = FeatureExtractor(
            training_mode=False, vectorizer_file=vectorizer_path)
        # No JSON body at all.
        res = client.post('/', headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        # 'text' must be a string.
        res = client.post('/', json={
            'text': 1,
            'classifier': 'LinearSVC'
        }, headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid text</p>\n')
        # 'classifier' must be a string.
        res = client.post('/', json={
            'text': 'Test text.',
            'classifier': 1
        }, headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        # Path traversal in the classifier name is rejected.
        res = client.post('/', json={
            'text': 'Test text.',
            'classifier': '../LinearSVC'
        }, headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        # Valid name but no pickled model file on disk yet.
        res = client.post('/', json={
            'text': 'Test text.',
            'classifier': 'LinearSVC'
        }, headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(
            res.data).endswith('<p>Invalid classifier model</p>\n')
        for f in clfs:
            clf_filename_base = utils.generate_available_filename()
            clf_filename = '%s.pkl' % (clf_filename_base)
            clfs_filenames.append(clf_filename)
            clf = f(n_jobs=1, class_weight=None)
            clf.fit(X, y)
            dump(clf, clf_filename)
            res = client.post('/', json={
                'text': 'Test text.',
                'classifier': clf_filename_base
            }, headers=valid_headers)
            assert res.status_code == 200
            # The server caches the loaded model under its base name.
            assert repr(
                prediction_server._classifiers[clf_filename_base]) == repr(clf)
            # Probabilities differ per classifier, so only the response shape
            # is checked (every leaf value replaced by 0).
            assert replace_final_dict_values(res.json, value=0) in [{
                'feature_weights': {
                    'I': {},
                    'II': {},
                    'III': {}
                },
                'probabilities': {
                    'I': 0,
                    'II': 0,
                    'III': 0
                }
            }, {
                'feature_weights': {},
                'probabilities': {
                    'I': 0,
                    'II': 0,
                    'III': 0
                }
            }]
    finally:
        utils.remove_and_check(vectorizer_path)
        for clf_filename in clfs_filenames:
            utils.remove_and_check(clf_filename)
        # Restore module-level state so other tests start clean.
        prediction_server._text_field = None
        prediction_server._class_field = None
        prediction_server._preprocessor = None
        prediction_server._feature_extractor = None
        prediction_server._feature_weights = dict()
        prediction_server._classifiers = dict()
        prediction_server._old_handlers = dict()
def test_preprocess(capsys):
    """Preprocessor.preprocess tokenizes docs, optionally spell-checking them.

    Runs once without a spell checker (typos preserved) and once with en_US
    ('Teste'->'Test', 'tikens'->'tokens'), verifies the stop flag aborts with
    SystemExit, then exercises the store_data path that rewrites a pickled
    data file in place.
    """
    text_field = 'Test field'
    index = -1
    # Mixed \r\n, \r and \n line breaks plus the typo 'tikens', twice.
    fields = {text_field: 'Teste\r\nvalue with\ra\nfew tikens. ' * 2}
    # Expected analysis without spell checking: forms kept verbatim.
    analyzed_sentences1 = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tikens',
            'lemma': 'tikens',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    # Expected analysis with the en_US spell checker applied.
    analyzed_sentences2 = {
        text_field: [[{
            'form': 'Test',
            'lemma': 'test',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tokens',
            'lemma': 'token',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    for spell_checker_lang, analyzed_sentences in [(None, analyzed_sentences1),
                                                   ('en_US', analyzed_sentences2)]:
        doc = Document(index=index, fields=fields, analyzed_sentences=dict())
        p = Preprocessor(spell_checker_lang=spell_checker_lang)
        assert p.stop is False
        p.preprocess(text_field=text_field,
                     preprocessed_data_file=None,
                     docs=[doc] * 2)
        assert p.stop is False
        # Only analyzed_sentences is filled in; index and fields are untouched.
        assert doc.index == index
        assert doc.fields == fields
        assert doc.analyzed_sentences == analyzed_sentences
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') +
                            1:].startswith('Preprocessing: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
            's/doc]\n')
        # Setting the stop flag makes preprocess abort with SystemExit.
        p.stop = True
        with pytest.raises(SystemExit):
            p.preprocess(text_field=text_field,
                         preprocessed_data_file=None,
                         docs=[doc] * 2)
        del (p)
        if spell_checker_lang is not None:
            # Remove the dictionaries downloaded by the spell checker.
            rmtree('./hunspell')
    # store_data path: documents are read from and written back to a file.
    docs = [
        Document(index=index, fields=fields, analyzed_sentences=dict())
        for index in range(2)
    ]
    preprocessed_data_file = utils.generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        pickle_manager.check_data(preprocessed_data_file)
        p = Preprocessor(store_data=True)
        assert all([
            doc.analyzed_sentences == dict()
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        p.preprocess(text_field, preprocessed_data_file, None)
        assert all([
            doc.analyzed_sentences == analyzed_sentences1
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        # The rewritten file must still pass the integrity check.
        pickle_manager.check_data(preprocessed_data_file)
    finally:
        utils.remove_and_check(preprocessed_data_file)
def test___init__():
    """FeatureExtractor.__init__ wires up all its configurable components.

    Covers defaults, NLTK stop words, every vectorizer in training and
    prediction mode, the remove_adjectives flag, synonyms-file loading and
    the n_jobs passthrough.
    """
    ft1 = FeatureExtractor()
    # Constructing a FeatureExtractor must make the stopwords corpus
    # available to NLTK.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        pytest.fail()
    # Defaults.
    assert ft1.stop_words == set()
    assert ft1.vectorizer_file == 'vectorizer.pkl'
    assert type(ft1.vectorizer) is feature_extraction.text.TfidfVectorizer
    assert ft1.feature_reduction is None
    assert 'initial_code_to_run_on_document' in dir(
        ft1.document_adjustment_code)
    assert ft1.upostags_to_ignore == ['PUNCT']
    assert ft1.synonyms is None
    assert ft1.n_jobs == 1
    ft2 = FeatureExtractor(nltk_stop_words_package='english')
    assert ft2.stop_words == set(nltk.corpus.stopwords.words('english'))
    for vectorizer_name in [
            'CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer'
    ]:
        # Prediction mode requires an existing vectorizer file.
        with pytest.raises(FileNotFoundError):
            FeatureExtractor(vectorizer_name=vectorizer_name,
                             training_mode=False,
                             vectorizer_file=generate_available_filename())
        try:
            path = create_temporary_file(content=None, text=False)
            pickle_manager.dump(
                FeatureExtractor(vectorizer_name=vectorizer_name).vectorizer,
                path)
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=False,
                                  vectorizer_file=path)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
        finally:
            remove_and_check(path)
        # Training mode builds a fresh vectorizer instead of loading one.
        ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                              training_mode=True)
        assert ft.vectorizer.__class__.__name__ == vectorizer_name
    for vectorizer_name in ['DocumentPoolEmbeddings']:
        # Embedding vectorizers never persist a vectorizer file.
        for training_mode in [True, False]:
            vectorizer_file = generate_available_filename()
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=training_mode,
                                  vectorizer_file=vectorizer_file)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
            assert not exists(vectorizer_file)
    with pytest.raises(ValueError):
        FeatureExtractor(vectorizer_name='invalid_vectorizer',
                         training_mode=True)
    ft3 = FeatureExtractor(remove_adjectives=True)
    assert ft3.upostags_to_ignore == ['PUNCT', 'ADJ']
    # Synonyms loaded by the extractor must match an independent parse.
    synonyms_file = 'contopt_0.1_r2_c0.0.txt'
    filename = generate_available_filename()
    try:
        ft4 = FeatureExtractor(synonyms_file=synonyms_file)
        contoPTParser = ContoPTParser(filename)
        assert ft4.synonyms == contoPTParser.synonyms
    finally:
        remove_and_check(synonyms_file)
        remove_and_check(filename)
    with pytest.raises(ValueError):
        FeatureExtractor(synonyms_file='invalid_file.txt')
    ft5 = FeatureExtractor(n_jobs=2)
    assert ft5.n_jobs == 2
def test_train_test_split():
    """train_test_split stores and maintains split metadata in the data file.

    Checks that repeated (and forced) splits are reproducible, that a split
    is regenerated when any metadata field is missing, and that without
    force an existing inconsistent split is left untouched.
    """
    text_field = 'Example column'
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    preprocessor = Preprocessor()
    preprocessor.preprocess(text_field, None, docs)
    ft = FeatureExtractor()
    corpus, classifications, _, _ = ft.prepare(
        text_field=text_field,
        class_field='Classification column',
        preprocessed_data_file=None,
        docs=docs,
        training_mode=False)
    test_size = 0.3
    preprocessed_data_file = generate_available_filename()
    force = False
    idxs_to_remove = [5]
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        assert pickle_manager.get_docs_metadata(preprocessed_data_file) == {
            'total': 10
        }
        # Expected split for this corpus; index 5 is excluded via
        # idxs_to_remove, leaving 9 documents (6 train / 3 test).
        desired = {
            'total': 10,
            'test_size': test_size,
            'training_set_indexes': np.array([6, 1, 0, 2, 8, 3]),
            'test_set_indexes': np.array([7, 9, 4])
        }
        # Both a fresh split and a forced re-split yield the same indexes.
        for my_force in [False, True]:
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file,
                                              my_force, idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        # Any missing metadata field triggers a fresh, identical split.
        for key in ['test_size', 'training_set_indexes', 'test_set_indexes']:
            m = desired.copy()
            m[key] = None
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        # Without force, existing (even inconsistent) metadata is preserved.
        for key, value in [('test_size', 0.2),
                           ('training_set_indexes', np.array([1, 0, 2, 8, 3]))]:
            m = desired.copy()
            m[key] = value
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file), m)
    finally:
        remove_and_check(preprocessed_data_file)