def test_generate_report():
    """generate_report() should append one row per invocation and keep the
    Excel file on disk in sync with the data frame it returns."""
    execution_info = pd.DataFrame.from_dict({
        'Start': [functions.get_local_time_str()],
        'End': [functions.get_local_time_str()],
    })
    parameters_dict = Parameters(utils.config_file).__dict__
    predictions_dict = {
        'y_true': ['label1'],
        'classifier_key': [{'label1': 0.0, 'label2': 1.0}],
    }
    # Build the expected single-row frame using a scalar accepted-probs value,
    # then restore the set form that generate_report() receives.
    parameters_dict['set_num_accepted_probs'] = 1
    expected_row = pd.concat([
        execution_info,
        functions.parameters_to_data_frame(parameters_dict),
        functions.predictions_to_data_frame(predictions_dict, 1),
    ], axis=1)
    parameters_dict['set_num_accepted_probs'] = {1}
    excel_file1 = utils.generate_available_filename(ext='.xlsx')
    excel_file2 = utils.generate_available_filename(ext='.xlsx')
    expected_df = pd.DataFrame()
    try:
        for iteration, should_exist in enumerate([False, True]):
            assert exists(excel_file1) is should_exist
            df = functions.generate_report(execution_info, parameters_dict,
                                           predictions_dict, excel_file1)
            df.to_excel(excel_file2, index=False)
            assert df.shape == (iteration + 1, 44)
            expected_df = pd.concat([expected_df, expected_row])
            pd.util.testing.assert_frame_equal(df, expected_df)
            pd.util.testing.assert_frame_equal(pd.read_excel(excel_file1),
                                               pd.read_excel(excel_file2))
    finally:
        utils.remove_and_check(excel_file1)
        utils.remove_and_check(excel_file2)
# Example #2
# 0
def test_dump_documents(capsys):
    """dump_documents() must raise when given the path of an existing
    temporary file, then store the documents so they round-trip through
    get_documents(), reporting its progress on stderr only."""
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    # Create the temporary file BEFORE entering the try block: with the
    # assignment inside it, a failure in create_temporary_file() made the
    # finally clause raise NameError on `path`, masking the real error.
    path = create_temporary_file(content=None, text=False)
    try:
        with pytest.raises(Exception):
            pickle_manager.dump_documents(docs1, path)
    finally:
        remove_and_check(path)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
        docs2 = list(pickle_manager.get_documents(filename))
        assert len(metadata) == 1
        assert metadata['total'] == len(docs1)
        # zip_longest so a length mismatch surfaces as a repr difference.
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
    finally:
        remove_and_check(filename)
    captured = capsys.readouterr()
    assert captured.out == ''
    assert captured.err[captured.err.rfind('\r') +
                        1:].startswith('Storing documents: 100%|')
    assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
        's/doc]\n')
# Example #3
# 0
def test_prepare(capsys):
    """FeatureExtractor.prepare() must yield identical results whether the
    documents come from memory or from a pickled data file, both with and
    without a synonyms-substitution file."""
    text_field = 'text field'
    class_field = 'class field'
    quantity = 2
    fields = {text_field: 'Teste value.', class_field: 'c1'}
    analyzed_sentences = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * quantity
    }
    # Document 1 has no analysis, so it is expected in idxs_to_remove.
    docs1 = [
        Document(index=0, fields=fields,
                 analyzed_sentences=analyzed_sentences),
        Document(index=1, fields=fields, analyzed_sentences=None),
    ]
    synonyms_files = [None, 'contopt_0.1_r2_c0.0.txt']
    # With the synonyms file, 'teste' is expected to be replaced by 'prova'.
    expected_corpus_str = [[' '.join(['teste value'] * quantity), ''],
                           [' '.join(['prova value'] * quantity), '']]
    expected_classifications = [[fields[class_field]] * quantity
                                ] * len(synonyms_files)
    expected_idxs_to_remove = [[1]] * len(synonyms_files)
    expected_corpus = [[['teste', 'value'] * quantity, []],
                       [['prova', 'value'] * quantity, []]]
    # Obtain the filename BEFORE the try block: with the assignment inside
    # it, a failure there made the finally clause raise NameError.
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        for i, synonyms_file in enumerate(synonyms_files):
            ft = FeatureExtractor(synonyms_file=synonyms_file)
            for training_mode in [True, False]:
                # Once from the in-memory docs, once from the pickled file.
                corpus_str1, classifications1, idxs_to_remove1, corpus1 = ft.prepare(
                    text_field, class_field, None, docs1, training_mode)
                corpus_str2, classifications2, idxs_to_remove2, corpus2 = ft.prepare(
                    text_field, class_field, filename, None, training_mode)
                assert (corpus_str1, classifications1, idxs_to_remove1,
                        corpus1) == (corpus_str2, classifications2,
                                     idxs_to_remove2, corpus2)
                assert corpus_str1 == expected_corpus_str[i]
                assert classifications1 == expected_classifications[i]
                assert idxs_to_remove1 == expected_idxs_to_remove[i]
                assert corpus1 == expected_corpus[i]
                captured = capsys.readouterr()
                assert captured.out == ''
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Preparing to create classification: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
            if synonyms_file is not None:
                remove_and_check(synonyms_file)
    finally:
        remove_and_check(filename)
# Example #4
# 0
def test_check_data(capsys):
    """check_data() must accept a valid dump and reject corrupted data:
    non-Document entries and a negative 'total' in the metadata."""
    docs = data_frame_to_document_list(read_excel(example_excel_file))
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, filename)
        pickle_manager.check_data(filename)
        captured = capsys.readouterr()
        assert captured.out == ''
        progress_line = captured.err[captured.err.rfind('\r') + 1:]
        assert progress_line.startswith('Checking data: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
            's/doc]\n')
        # Overwrite the file with plain integers instead of Documents;
        # check_data() must flag the mismatch.
        count = 10
        writer = pickle_manager.PickleDumpAppend({'total': count}, filename)
        for value in range(count):
            writer.dump_append(value)
        writer.close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
        # A negative total in the metadata must also be rejected.
        pickle_manager.PickleDumpAppend({'total': -1}, filename).close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
    finally:
        remove_and_check(filename)
# Example #5
# 0
def test_dump_json():
    """A dict written by trainer.dump_json() must round-trip unchanged
    through json.load()."""
    d1 = {'test_random_values': [np.random.random()]}
    filename = utils.generate_available_filename()
    try:
        trainer.dump_json(d1, filename)
        # Use a context manager: the original opened the file into `f`
        # inside the try block, so a failure in dump_json() made the
        # finally clause raise NameError on `f.close()`.
        with open(filename, 'r') as f:
            d2 = json.load(f)
    finally:
        utils.remove_and_check(filename)
    assert d1 == d2
# Example #6
# 0
def test_get_docs_metadata():
    """get_docs_metadata() must return a dict whose single entry counts the
    stored documents."""
    documents = data_frame_to_document_list(read_excel(example_excel_file))
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(documents, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
    finally:
        remove_and_check(filename)
    assert type(metadata) is dict
    assert len(metadata) == 1
    assert metadata['total'] == len(documents)
# Example #7
# 0
def test_generate_roc_plot():
    """generate_roc_plot() must create the target image for binary and
    multi-class (10-class) problems, for every classifier factory."""
    filename = '%s.png' % (generate_available_filename())
    for n_class in [2, 10]:
        X_test, y_test = load_digits(n_class=n_class, return_X_y=True)
        for build_classifier in clfs:
            model = build_classifier(n_jobs=1, class_weight=None)
            model.fit(X_test, y_test)
            assert not exists(filename)
            try:
                classifiers.generate_roc_plot(model, X_test, y_test, filename)
                assert exists(filename)
            finally:
                remove_and_check(filename)
# Example #8
# 0
def test_PickleDumpAppend_close():
    """close() must shut the staging file and remove it, promoting its
    contents to the final filename (which exists on the second pass)."""
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        for target_should_exist in [False, True]:
            assert exists(filename) == target_should_exist
            writer = pickle_manager.PickleDumpAppend(metadata, filename)
            # While open, the staging file is live and not yet closed.
            assert not writer.file.closed
            assert exists(writer.file.name)
            writer.close()
            # After close, the staging file is gone.
            assert writer.file.closed
            assert not exists(writer.file.name)
    finally:
        remove_and_check(filename)
def test___init__():
    """ContoPTParser's constructor must materialize the synonyms file and
    expose the same mapping that _load_synonyms() reads back."""
    filename = generate_available_filename()
    try:
        assert not exists(filename)
        parser = ContoPTParser(filename)
        assert exists(filename)
        loaded = ContoPTParser._load_synonyms(filename)
    finally:
        remove_and_check(filename)
    assert type(parser.synonyms) is dict
    assert len(parser.synonyms) > 0
    assert parser.synonyms == loaded
    # Spot-check a few known ContoPT synonym mappings.
    assert loaded['adjudicatário'] == 'adjudicante'
    assert loaded['melancolia'] == 'misantropia'
    assert loaded['tristeza'] == 'misantropia'
# Example #10
# 0
def test_main():
    """Exercise trainer.main() end to end inside a scratch working directory.

    Scenarios covered:
    1. invalid input files -> SystemExit;
    2. a normal run that must produce the preprocessed data file,
       predictions.json and report.xlsx;
    3. the 20newsgroups path, which writes the spreadsheet and then exits
       (SystemExit) with preprocessing disabled;
    4. a final-training run, which must NOT write predictions or a report.
    """
    old_dir = os.getcwd()
    new_dir = utils.generate_available_filename()
    base_parameters = Parameters(utils.config_file)
    # Strip the directory so the data file is created inside new_dir.
    base_parameters.preprocessed_data_file = os.path.basename(base_parameters.preprocessed_data_file)
    try:
        os.makedirs(new_dir, exist_ok=False)
        os.chdir(new_dir)
        # Scenario 1: both input files invalid -> main() must abort.
        parameters = deepcopy(base_parameters)
        parameters.excel_file = "invalid_excel_file"
        parameters.preprocessed_data_file = "invalid_data_file"
        with pytest.raises(SystemExit):
            trainer.main(parameters)
        # Scenario 2: normal run -> all three output files must appear.
        parameters = deepcopy(base_parameters)
        assert not os.path.exists(parameters.preprocessed_data_file)
        try:
            trainer.main(parameters)
            assert os.path.exists(parameters.preprocessed_data_file)
            assert os.path.exists("predictions.json")
            assert os.path.exists("report.xlsx")
        finally:
            utils.remove_and_check(parameters.preprocessed_data_file)
            utils.remove_and_check("predictions.json")
            utils.remove_and_check("report.xlsx")
        # Scenario 3: 20newsgroups input with preprocessing disabled; the
        # spreadsheet must exist once main() raises SystemExit.
        parameters.excel_file = os.path.abspath("20newsgroups")
        parameters.preprocess_data = False
        excel_file_20newsgroups = "20newsgroups.xlsx"
        assert not os.path.exists(excel_file_20newsgroups)
        try:
            trainer.main(parameters)
            pytest.fail()
        except SystemExit:
            assert os.path.exists(excel_file_20newsgroups)
        finally:
            utils.remove_and_check(excel_file_20newsgroups)
        # Scenario 4: final training -> no predictions/report files.
        parameters = deepcopy(base_parameters)
        parameters.final_training = True
        try:
            trainer.main(parameters)
        finally:
            assert not os.path.exists("predictions.json")
            assert not os.path.exists("report.xlsx")
            utils.remove_and_check(parameters.preprocessed_data_file)
    finally:
        # Always restore the working directory and delete the scratch dir.
        os.chdir(old_dir)
        rmtree(new_dir)
# Example #11
# 0
def test_PickleDumpAppend_dump_append():
    """dump_append() must serialize values after the metadata, in insertion
    order, so that sequential load() calls read them back."""
    count = 10
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        pda = pickle_manager.PickleDumpAppend(metadata, filename)
        for i in range(count):
            pda.dump_append(i)
        pda.close()
        # Use a context manager: the original referenced `input_file` in its
        # finally block, which raised NameError whenever an exception
        # occurred before the open() call was reached.
        with open(filename, 'rb') as input_file:
            # The first pickled object is the metadata dict, then the values.
            assert load(input_file) == metadata
            for i in range(count):
                assert load(input_file) == i
    finally:
        remove_and_check(filename)
def test_get_documents(capsys):
    """get_documents() must yield the stored documents unchanged both with
    the default progress description and with a custom one."""
    stored_docs = data_frame_to_document_list(read_excel(example_excel_file))
    filename = generate_available_filename()
    try:
        dump_documents(stored_docs, filename)
        cases = [(None, '100%|'),
                 ('Loading documents', 'Loading documents: 100%|')]
        for description, expected_prefix in cases:
            loaded_docs = list(get_documents(filename, description=description))
            for original, loaded in zip_longest(stored_docs, loaded_docs):
                assert repr(original) == repr(loaded)
            captured = capsys.readouterr()
            assert captured.out == ''
            final_line = captured.err[captured.err.rfind('\r') + 1:]
            assert final_line.startswith(expected_prefix)
            assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
                's/doc]\n')
    finally:
        remove_and_check(filename)
# Example #13
# 0
def test_generate_X_y(capsys):
    """generate_X_y() must extract features for every vectorizer/reduction
    combination, persisting the vectorizer file only for the sklearn-based
    vectorizers (not for DocumentPoolEmbeddings), and reporting progress on
    stderr."""
    quantity = 2
    corpus = ['Test lemma 1 . ' * quantity, 'Test lemma 2 . ' * quantity]
    classifications = [1, 2]
    filename = generate_available_filename()
    # stdout expected for the DocumentPoolEmbeddings runs (repeated once per
    # generated sentence across the combinations).
    dpe_out = 'Please, ignore the message above indicating that the sentence is too long. The problem has been solved.\n' * 6
    # (vectorizer name, feature reduction, expects vectorizer file, stdout)
    combinations = [
        ('CountVectorizer', None, True, ''),
        ('CountVectorizer', 'LDA', True, ''),
        ('CountVectorizer', 'MDS', True, ''),
        ('HashingVectorizer', None, True, ''),
        ('HashingVectorizer', 'MDS', True, ''),
        ('TfidfVectorizer', None, True, ''),
        ('TfidfVectorizer', 'LDA', True, ''),
        ('TfidfVectorizer', 'MDS', True, ''),
        ('DocumentPoolEmbeddings', None, False, dpe_out),
        ('DocumentPoolEmbeddings', 'MDS', False, dpe_out),
    ]
    for vectorizer, fr, expect_file, expected_out in combinations:
        try:
            ft = FeatureExtractor(vectorizer_name=vectorizer,
                                  feature_reduction=fr,
                                  vectorizer_file=filename)
            for training_mode in [True, False]:
                # The vectorizer file exists only after the training pass
                # (and only for vectorizers that persist one).
                assert exists(filename) is (not training_mode and expect_file)
                _X, y = ft.generate_X_y(corpus, classifications, training_mode)
                assert exists(filename) is expect_file
                assert y == classifications
                captured = capsys.readouterr()
                assert captured.out == expected_out
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Extracting features: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
        finally:
            # Clean up the per-combination artifacts so the next iteration
            # starts from a pristine state.
            if expect_file:
                remove_and_check(filename)
            if fr == 'LDA':
                remove_and_check('LatentDirichletAllocation.pkl')
    # An unknown feature-reduction name must be rejected.
    with pytest.raises(ValueError):
        FeatureExtractor(feature_reduction='invalid',
                         vectorizer_file=filename).generate_X_y(
                             corpus, classifications)
    remove_and_check(filename)
# Example #14
# 0
def test_PickleDumpAppend___init__():
    """The constructor must reject wrong argument types, stage output in a
    temporary file, and write the metadata as the first pickled object."""
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        bad_metadata = 'test_str'
        bad_filename = -1
        # Either a non-dict metadata or a non-str filename must be rejected.
        for meta_arg, file_arg in [[bad_metadata, filename],
                                   [metadata, bad_filename]]:
            with pytest.raises(AssertionError):
                pda = pickle_manager.PickleDumpAppend(meta_arg, file_arg)
        pda = pickle_manager.PickleDumpAppend(metadata, filename)
        assert pda.filename_upon_completion == filename
        assert exists(pda.file.name)
        pda.close()
        # After close, the staging file is promoted to the final name.
        assert pickle_manager.load(filename) == metadata
        assert not exists(pda.file.name)
        assert exists(filename)
    finally:
        remove_and_check(filename)
def test_get_dict():
    """SpellChecker.get_dict() must download the Hunspell .aff/.dic pair
    once (into an existing or a new directory) and reuse it afterwards."""
    for data_dir in [mkdtemp(), generate_available_filename()]:
        try:
            for lang in ['en_US', 'pt_PT']:
                aff_path = os.path.join(data_dir, '%s.aff' % (lang))
                dic_path = os.path.join(data_dir, '%s.dic' % (lang))
                assert not os.path.exists(aff_path)
                assert not os.path.exists(dic_path)
                SpellChecker.get_dict(lang, data_dir)
                assert os.path.exists(aff_path)
                assert os.path.exists(dic_path)
                # A second call must leave the files on disk untouched.
                aff_mtime = os.path.getmtime(aff_path)
                dic_mtime = os.path.getmtime(dic_path)
                SpellChecker.get_dict(lang, data_dir)
                assert aff_mtime == os.path.getmtime(aff_path)
                assert dic_mtime == os.path.getmtime(dic_path)
            # An unknown language code must surface the HTTP failure.
            with pytest.raises(requests.HTTPError):
                SpellChecker.get_dict('pt_NotExists', data_dir)
        finally:
            rmtree(data_dir)
# Example #16
# 0
def test_set_docs_metadata(capsys):
    """set_docs_metadata() must replace the stored metadata while leaving
    the documents themselves intact."""
    original_docs = data_frame_to_document_list(read_excel(example_excel_file))
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(original_docs, filename)
        old_metadata = pickle_manager.get_docs_metadata(filename)
        new_metadata = old_metadata.copy()
        new_metadata['new_field'] = 'test_field_value'
        assert old_metadata != new_metadata
        pickle_manager.set_docs_metadata(new_metadata, filename)
        assert pickle_manager.get_docs_metadata(filename) == new_metadata
        # Documents must survive the metadata rewrite untouched.
        reloaded_docs = list(pickle_manager.get_documents(filename))
        for before, after in zip_longest(original_docs, reloaded_docs):
            assert repr(before) == repr(after)
        captured = capsys.readouterr()
        assert captured.out == ''
        final_line = captured.err[captured.err.rfind('\r') + 1:]
        assert final_line == 'Storing subsets: 0MB [00:00, ?MB/s]\n'
    finally:
        remove_and_check(filename)
# Example #17
# 0
def test_load_20newsgroups():
    """load_20newsgroups() must build the 20newsgroups spreadsheet once,
    return adjusted parameters, and reuse the file on subsequent calls."""
    base_params = Parameters(utils.config_file)
    base_params.excel_file = '20newsgroups'
    excel_file = utils.generate_available_filename('.xlsx')
    try:
        loaded = trainer.load_20newsgroups(base_params, excel_file)
        # A new Parameters object is returned; the original is not mutated.
        assert base_params is not loaded
        assert base_params != loaded
        assert loaded.excel_column_with_text_data == 'data'
        assert loaded.excel_column_with_classification_data == 'target'
        assert os.path.exists(excel_file)
        frame = pd.read_excel(excel_file)
        assert frame.shape == (18846, 3)
        assert list(frame.keys()) == ['Unnamed: 0', 'data', 'target']
        # Calling again must reuse the spreadsheet without rewriting it.
        expected_mtime = os.path.getmtime(excel_file)
        reloaded = trainer.load_20newsgroups(base_params, excel_file)
        assert os.path.getmtime(excel_file) == expected_mtime
        assert reloaded.__dict__ == loaded.__dict__
    finally:
        utils.remove_and_check('20news-bydate_py3.pkz')
        utils.remove_and_check(excel_file)
# Example #18
# 0
def test_LatentDirichletAllocation():
    """LDA feature reduction must project to 10 topics, persist its model
    file, and reuse that file (reproducibly) on a second call."""
    X = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    y = np.asarray([0, 1, 2])
    # Each row concentrates nearly all mass on topic 1; the remaining nine
    # topics share a uniform residual.
    expected_new_X = np.asarray([
        [0.01428573, 0.87142845] + [0.01428573] * 8,
        [0.00625001, 0.94374995] + [0.00625001] * 8,
        [0.00400000, 0.96399997] + [0.00400000] * 8,
    ])
    filename = generate_available_filename()
    assert not exists(filename)
    try:
        reduced_X, reduced_y = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        assert exists(filename)
        assert reduced_X.shape == (X.shape[0], 10)
        assert np.allclose(expected_new_X, reduced_X)
        assert np.array_equal(y, reduced_y)
        # The second call must load the stored model instead of retraining:
        # same file mtime, bit-identical output.
        mtime = getmtime(filename)
        rerun_X, rerun_y = FeatureExtractor.LatentDirichletAllocation(
            X, y, filename)
        assert getmtime(filename) == mtime
        assert np.array_equal(reduced_X, rerun_X)
        assert np.array_equal(reduced_y, rerun_y)
    finally:
        remove_and_check(filename)
# Example #19
# 0
def test_get_documents():
    """get_documents() must yield the stored documents unchanged, and must
    raise with a specific message when the file contains more pickled
    objects than its metadata announces."""
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        docs2 = list(pickle_manager.get_documents(filename))
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
        try:
            # Append one extra pickled object so the file no longer matches
            # its metadata; iterating the documents must then raise.
            f = open(filename, 'ab')
            dump(obj=0, file=f, protocol=pickle_manager._pickle_protocol)
            f.close()
            docs2 = list(pickle_manager.get_documents(filename))
            for doc1, doc2 in zip_longest(docs1, docs2):
                assert repr(doc1) == repr(doc2)
            pytest.fail()
        except Exception as e:
            # The error must carry exactly one argument: the message naming
            # the offending file.
            assert len(e.args) == 1
            assert e.args[
                0] == "The file '%s' has more documents than indicated in the metadata." % (
                    filename)
    finally:
        remove_and_check(filename)
# Example #20
# 0
def test_predict(client):
    """End-to-end test of the prediction endpoint.

    Trains the feature pipeline on the example spreadsheet, then POSTs a
    series of invalid payloads (each expecting BAD_REQUEST with a specific
    error message) followed by valid requests against every classifier in
    `clfs` (expecting 200 and a feature_weights/probabilities JSON body).
    The finally block restores prediction_server's module-level state.
    """
    df = read_excel(utils.example_excel_file)
    docs = data_frame_to_document_list(df)
    prediction_server._text_field = 'Example column'
    prediction_server._class_field = 'Classification column'
    clfs_filenames = []
    try:
        # Train the vectorizer and produce X/y to fit the classifiers on.
        vectorizer_path = utils.create_temporary_file(content=None, text=False)
        p = Preprocessor()
        p.preprocess(text_field=prediction_server._text_field,
                     preprocessed_data_file=None,
                     docs=docs)
        ft = FeatureExtractor(training_mode=True,
                              vectorizer_file=vectorizer_path)
        corpus, classifications, _, _ = ft.prepare(
            text_field=prediction_server._text_field,
            class_field=prediction_server._class_field,
            preprocessed_data_file=None,
            docs=docs,
            training_mode=True)
        X, y = ft.generate_X_y(corpus, classifications, training_mode=True)
        prediction_server._preprocessor = Preprocessor()
        prediction_server._feature_extractor = FeatureExtractor(
            training_mode=False, vectorizer_file=vectorizer_path)
        # Missing JSON body -> BAD_REQUEST.
        res = client.post('/', headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        # Non-string text -> BAD_REQUEST with 'Invalid text'.
        res = client.post('/',
                          json={
                              'text': 1,
                              'classifier': 'LinearSVC'
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid text</p>\n')
        # Non-string classifier -> BAD_REQUEST with 'Invalid classifier'.
        res = client.post('/',
                          json={
                              'text': 'Test text.',
                              'classifier': 1
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        # Path traversal in the classifier name must also be rejected.
        res = client.post('/',
                          json={
                              'text': 'Test text.',
                              'classifier': '../LinearSVC'
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(res.data).endswith('<p>Invalid classifier</p>\n')
        # Valid name but no model file on disk yet -> invalid model.
        res = client.post('/',
                          json={
                              'text': 'Test text.',
                              'classifier': 'LinearSVC'
                          },
                          headers=valid_headers)
        assert res.status_code == prediction_server.BAD_REQUEST
        assert utils.decode(
            res.data).endswith('<p>Invalid classifier model</p>\n')
        # Every classifier factory: fit, dump to disk, and query via POST.
        for f in clfs:
            clf_filename_base = utils.generate_available_filename()
            clf_filename = '%s.pkl' % (clf_filename_base)
            clfs_filenames.append(clf_filename)
            clf = f(n_jobs=1, class_weight=None)
            clf.fit(X, y)
            dump(clf, clf_filename)
            res = client.post('/',
                              json={
                                  'text': 'Test text.',
                                  'classifier': clf_filename_base
                              },
                              headers=valid_headers)
            assert res.status_code == 200
            # The server must have cached the model it loaded from disk.
            assert repr(
                prediction_server._classifiers[clf_filename_base]) == repr(clf)
            # The response shape: probabilities for classes I/II/III, with
            # feature weights either per-class or empty (classifier-dependent).
            assert replace_final_dict_values(res.json, value=0) in [{
                'feature_weights': {
                    'I': {},
                    'II': {},
                    'III': {}
                },
                'probabilities': {
                    'I': 0,
                    'II': 0,
                    'III': 0
                }
            }, {
                'feature_weights': {},
                'probabilities': {
                    'I': 0,
                    'II': 0,
                    'III': 0
                }
            }]
    finally:
        # Remove artifacts and reset the server's module-level globals.
        utils.remove_and_check(vectorizer_path)
        for clf_filename in clfs_filenames:
            utils.remove_and_check(clf_filename)
        prediction_server._text_field = None
        prediction_server._class_field = None
        prediction_server._preprocessor = None
        prediction_server._feature_extractor = None
        prediction_server._feature_weights = dict()
        prediction_server._classifiers = dict()
        prediction_server._old_handlers = dict()
# Example #21
# 0
def test_preprocess(capsys):
    """Preprocessor.preprocess() must tokenize/lemmatize in place, apply
    spell checking when a language is given ('tikens' -> 'tokens'), honor
    the stop flag with SystemExit, and persist results when store_data is
    enabled and documents come from a pickled data file."""
    text_field = 'Test field'
    index = -1
    # Mixed \r\n, \r and \n line breaks plus the misspelling 'tikens'.
    fields = {text_field: 'Teste\r\nvalue with\ra\nfew tikens. ' * 2}
    # Expected analysis without spell checking: tokens kept as written.
    analyzed_sentences1 = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tikens',
            'lemma': 'tikens',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    # Expected analysis with en_US spell checking: 'Teste' -> 'Test',
    # 'tikens' -> 'tokens'.
    analyzed_sentences2 = {
        text_field: [[{
            'form': 'Test',
            'lemma': 'test',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tokens',
            'lemma': 'token',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    for spell_checker_lang, analyzed_sentences in [(None, analyzed_sentences1),
                                                   ('en_US',
                                                    analyzed_sentences2)]:
        doc = Document(index=index, fields=fields, analyzed_sentences=dict())
        p = Preprocessor(spell_checker_lang=spell_checker_lang)
        assert p.stop is False
        p.preprocess(text_field=text_field,
                     preprocessed_data_file=None,
                     docs=[doc] * 2)
        # Preprocessing mutates the document's analysis in place; index and
        # raw fields stay untouched.
        assert p.stop is False
        assert doc.index == index
        assert doc.fields == fields
        assert doc.analyzed_sentences == analyzed_sentences
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') +
                            1:].startswith('Preprocessing: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
            's/doc]\n')
        # With the stop flag raised, preprocessing must abort via SystemExit.
        p.stop = True
        with pytest.raises(SystemExit):
            p.preprocess(text_field=text_field,
                         preprocessed_data_file=None,
                         docs=[doc] * 2)
        del (p)
        if spell_checker_lang is not None:
            rmtree('./hunspell')
    # Second scenario: documents are read from a pickled data file and the
    # analysis is stored back into it (store_data=True).
    docs = [
        Document(index=index, fields=fields, analyzed_sentences=dict())
        for index in range(2)
    ]
    preprocessed_data_file = utils.generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        pickle_manager.check_data(preprocessed_data_file)
        p = Preprocessor(store_data=True)
        assert all([
            doc.analyzed_sentences == dict()
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        p.preprocess(text_field, preprocessed_data_file, None)
        assert all([
            doc.analyzed_sentences == analyzed_sentences1
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        pickle_manager.check_data(preprocessed_data_file)
    finally:
        utils.remove_and_check(preprocessed_data_file)
# Example #22
# 0
def test___init__():
    """FeatureExtractor's constructor must apply sane defaults, load NLTK
    stop words on demand, load or create the requested vectorizer, honor
    remove_adjectives and synonyms_file, and reject invalid arguments."""
    ft1 = FeatureExtractor()
    # Constructing the extractor must have fetched the NLTK stopwords corpus.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        pytest.fail()
    # Defaults: no stop words, TF-IDF vectorizer, no reduction, PUNCT
    # ignored, no synonyms, single job.
    assert ft1.stop_words == set()
    assert ft1.vectorizer_file == 'vectorizer.pkl'
    assert type(ft1.vectorizer) is feature_extraction.text.TfidfVectorizer
    assert ft1.feature_reduction is None
    assert 'initial_code_to_run_on_document' in dir(
        ft1.document_adjustment_code)
    assert ft1.upostags_to_ignore == ['PUNCT']
    assert ft1.synonyms is None
    assert ft1.n_jobs == 1
    ft2 = FeatureExtractor(nltk_stop_words_package='english')
    assert ft2.stop_words == set(nltk.corpus.stopwords.words('english'))
    # Sklearn-based vectorizers: prediction mode requires an existing
    # vectorizer file; a pickled vectorizer must load back correctly.
    for vectorizer_name in [
            'CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer'
    ]:
        with pytest.raises(FileNotFoundError):
            FeatureExtractor(vectorizer_name=vectorizer_name,
                             training_mode=False,
                             vectorizer_file=generate_available_filename())
        try:
            path = create_temporary_file(content=None, text=False)
            pickle_manager.dump(
                FeatureExtractor(vectorizer_name=vectorizer_name).vectorizer,
                path)
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=False,
                                  vectorizer_file=path)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
        finally:
            remove_and_check(path)
        ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                              training_mode=True)
        assert ft.vectorizer.__class__.__name__ == vectorizer_name
    # DocumentPoolEmbeddings never reads or writes a vectorizer file.
    for vectorizer_name in ['DocumentPoolEmbeddings']:
        for training_mode in [True, False]:
            vectorizer_file = generate_available_filename()
            ft = FeatureExtractor(vectorizer_name=vectorizer_name,
                                  training_mode=training_mode,
                                  vectorizer_file=vectorizer_file)
            assert ft.vectorizer.__class__.__name__ == vectorizer_name
            assert not exists(vectorizer_file)
    with pytest.raises(ValueError):
        FeatureExtractor(vectorizer_name='invalid_vectorizer',
                         training_mode=True)
    # remove_adjectives extends the ignored POS tags with 'ADJ'.
    ft3 = FeatureExtractor(remove_adjectives=True)
    assert ft3.upostags_to_ignore == ['PUNCT', 'ADJ']
    # A valid ContoPT synonyms file must yield the parser's mapping.
    synonyms_file = 'contopt_0.1_r2_c0.0.txt'
    filename = generate_available_filename()
    try:
        ft4 = FeatureExtractor(synonyms_file=synonyms_file)
        contoPTParser = ContoPTParser(filename)
        assert ft4.synonyms == contoPTParser.synonyms
    finally:
        remove_and_check(synonyms_file)
        remove_and_check(filename)
    with pytest.raises(ValueError):
        FeatureExtractor(synonyms_file='invalid_file.txt')
    ft5 = FeatureExtractor(n_jobs=2)
    assert ft5.n_jobs == 2
# Example #23
# 0
def test_train_test_split():
    """train_test_split() must write the split metadata into the pickled
    data file, recompute any missing metadata fields, and keep fields the
    user already set when force is False.

    (Also removes a dead trailing `pass` statement from the original.)
    """
    text_field = 'Example column'
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    preprocessor = Preprocessor()
    preprocessor.preprocess(text_field, None, docs)
    ft = FeatureExtractor()
    corpus, classifications, _, _ = ft.prepare(
        text_field=text_field,
        class_field='Classification column',
        preprocessed_data_file=None,
        docs=docs,
        training_mode=False)
    test_size = 0.3
    preprocessed_data_file = generate_available_filename()
    force = False
    idxs_to_remove = [5]
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        assert pickle_manager.get_docs_metadata(preprocessed_data_file) == {
            'total': 10
        }
        # Expected metadata after splitting (document 5 removed beforehand).
        desired = {
            'total': 10,
            'test_size': test_size,
            'training_set_indexes': np.array([6, 1, 0, 2, 8, 3]),
            'test_set_indexes': np.array([7, 9, 4])
        }
        # Both an initial split and a forced re-split must produce `desired`.
        for my_force in [False, True]:
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, my_force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        # Any metadata field reset to None must be recomputed identically.
        for key in ['test_size', 'training_set_indexes', 'test_set_indexes']:
            m = desired.copy()
            m[key] = None
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        # Without force, pre-existing non-None values must be preserved.
        for key, value in [('test_size', 0.2),
                           ('training_set_indexes', np.array([1, 0, 2, 8,
                                                              3]))]:
            m = desired.copy()
            m[key] = value
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file), m)
    finally:
        remove_and_check(preprocessed_data_file)