Example no. 1
def test_dump_documents(capsys):
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    try:
        path = create_temporary_file(content=None, text=False)
        with pytest.raises(Exception):
            pickle_manager.dump_documents(docs1, path)
    finally:
        remove_and_check(path)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
        docs2 = list(pickle_manager.get_documents(filename))
        assert len(metadata) == 1
        assert metadata['total'] == len(docs1)
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
    finally:
        remove_and_check(filename)
    captured = capsys.readouterr()
    assert captured.out == ''
    assert captured.err[captured.err.rfind('\r') +
                        1:].startswith('Storing documents: 100%|')
    assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
        's/doc]\n')
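The tests in these examples rely on a few small helpers (generate_available_filename, create_temporary_file, remove_and_check) that are not shown. A minimal sketch of what they could look like follows; only the names come from the tests, the bodies are assumptions and the project's real implementations may differ.
# Hypothetical implementations of the test helpers used throughout these examples.
import os
import tempfile
from uuid import uuid4

def generate_available_filename():
    # Return a filename that does not exist yet in the working directory.
    while True:
        name = 'tmp_%s.pkl' % uuid4().hex
        if not os.path.exists(name):
            return name

def create_temporary_file(content=None, text=False):
    # Create a temporary file, optionally writing initial content, and return its path.
    fd, path = tempfile.mkstemp(text=text)
    with os.fdopen(fd, 'w' if text else 'wb') as f:
        if content is not None:
            f.write(content)
    return path

def remove_and_check(path):
    # Delete the file and make sure it is really gone.
    os.remove(path)
    assert not os.path.exists(path)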
Example no. 2
def test_check_data(capsys):
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, filename)
        pickle_manager.check_data(filename)
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') +
                            1:].startswith('Checking data: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
            's/doc]\n')
        count = 10
        metadata1 = {'total': count}
        pda1 = pickle_manager.PickleDumpAppend(metadata1, filename)
        for not_Document in range(count):
            pda1.dump_append(not_Document)
        pda1.close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
        metadata2 = {'total': -1}
        pickle_manager.PickleDumpAppend(metadata2, filename).close()
        with pytest.raises(AssertionError):
            pickle_manager.check_data(filename)
    finally:
        remove_and_check(filename)
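Based on the assertions in this example, a rough sketch of what pickle_manager.check_data is assumed to verify follows. It is an assumption, not the project's implementation, and it presumes pickle_manager and Document are imported from the project.
# Sketch (assumption): check_data walks the stored documents and verifies that the
# metadata's 'total' matches the number of items and that every item is a Document.
def check_data_sketch(filename):
    metadata = pickle_manager.get_docs_metadata(filename)
    count = 0
    for doc in pickle_manager.get_documents(filename, description='Checking data'):
        assert isinstance(doc, Document), 'stored item is not a Document'
        count += 1
    assert metadata['total'] == count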
Example no. 3
def test_prepare(capsys):
    text_field = 'text field'
    class_field = 'class field'
    quantity = 2
    fields = {text_field: 'Teste value.', class_field: 'c1'}
    analyzed_sentences = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * quantity
    }
    docs1 = [
        Document(index=0, fields=fields,
                 analyzed_sentences=analyzed_sentences),
        Document(index=1, fields=fields, analyzed_sentences=None),
    ]
    synonyms_files = [None, 'contopt_0.1_r2_c0.0.txt']
    expected_corpus_str = [[' '.join(['teste value'] * quantity), ''],
                           [' '.join(['prova value'] * quantity), '']]
    expected_classifications = [[fields[class_field]] * quantity
                                ] * len(synonyms_files)
    expected_idxs_to_remove = [[1]] * len(synonyms_files)
    expected_corpus = [[['teste', 'value'] * quantity, []],
                       [['prova', 'value'] * quantity, []]]
    try:
        filename = generate_available_filename()
        pickle_manager.dump_documents(docs1, filename)
        for i, synonyms_file in enumerate(synonyms_files):
            ft = FeatureExtractor(synonyms_file=synonyms_file)
            for training_mode in [True, False]:
                corpus_str1, classifications1, idxs_to_remove1, corpus1 = ft.prepare(
                    text_field, class_field, None, docs1, training_mode)
                corpus_str2, classifications2, idxs_to_remove2, corpus2 = ft.prepare(
                    text_field, class_field, filename, None, training_mode)
                assert (corpus_str1, classifications1, idxs_to_remove1,
                        corpus1) == (corpus_str2, classifications2,
                                     idxs_to_remove2, corpus2)
                assert corpus_str1 == expected_corpus_str[i]
                assert classifications1 == expected_classifications[i]
                assert idxs_to_remove1 == expected_idxs_to_remove[i]
                assert corpus1 == expected_corpus[i]
                captured = capsys.readouterr()
                assert captured.out == ''
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Preparing to create classification: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
            if synonyms_file is not None:
                remove_and_check(synonyms_file)
    finally:
        remove_and_check(filename)
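The expected values above suggest how FeatureExtractor.prepare turns analyzed sentences into the corpus: non-punctuation lemmas are kept, optionally mapped through the synonyms file ('teste' becomes 'prova' with the ContoPT file), and joined per document. A hedged sketch of that step, inferred only from the expected outputs:
# Assumption based on the expected corpus above, not the project's code: collect the
# lemma of every non-punctuation token, optionally replacing it via a synonym mapping.
# ' '.join(...) of the result gives the corpus_str entry, and documents without
# analyzed sentences end up empty (and are reported in idxs_to_remove).
def document_to_lemmas_sketch(doc, text_field, synonyms=None):
    lemmas = []
    for sentence in (doc.analyzed_sentences or {}).get(text_field, []):
        for token in sentence:
            if token['upostag'] == 'PUNCT':
                continue
            lemma = token['lemma']
            lemmas.append(synonyms.get(lemma, lemma) if synonyms else lemma)
    return lemmas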
Example no. 4
def test_get_docs_metadata():
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, filename)
        metadata = pickle_manager.get_docs_metadata(filename)
    finally:
        remove_and_check(filename)
    assert type(metadata) is dict
    assert len(metadata) == 1
    assert metadata['total'] == len(docs)
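The metadata appears to be the first object pickled into the file, so reading it back could look roughly like this sketch (an assumption, not the project's code):
import pickle

# Assumption: the file starts with one pickled metadata dict (e.g. {'total': n})
# followed by the pickled documents.
def get_docs_metadata_sketch(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)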
Example no. 5
def main(parameters):
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [functions.get_local_time_str()]
    logger.debug("Starting execution.")
    if basename(parameters.excel_file) == '20newsgroups':
        parameters = load_20newsgroups(parameters)
    if parameters.preprocess_data:
        if not isfile(parameters.excel_file) and not isfile(parameters.preprocessed_data_file):
            logger.error("Please, provide a valid Excel file or a valid preprocessed data file.")
            quit()
        if not isfile(parameters.preprocessed_data_file) and isfile(parameters.excel_file):
            logger.info("Loading Excel file.")
            data_frame = pd.read_excel(parameters.excel_file)
            data_frame = data_frame.fillna("NaN")
            logger.info("Creating documents.")
            docs = functions.data_frame_to_document_list(data_frame)
            logger.info("Storing generated documents.")
            pickle_manager.dump_documents(docs, parameters.preprocessed_data_file)
        logger.info("Preprocessing documents.")
        preprocessor = Preprocessor(mosestokenizer_language_code=parameters.mosestokenizer_language_code, store_data=True, spell_checker_lang=parameters.spell_checker_lang, n_jobs=parameters.number_of_jobs)
        preprocessor.preprocess(text_field=parameters.excel_column_with_text_data, preprocessed_data_file=parameters.preprocessed_data_file)
        logger.info("Checking generated data.")
        pickle_manager.check_data(parameters.preprocessed_data_file)
    else:
        if not isfile(parameters.preprocessed_data_file):
            logger.error("The indicated preprocessed data file does not exist.")
            quit()
    logger.info("Extracting features and splitting dataset into training and test subsets.")
    feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package, vectorizer_name=parameters.vectorizer, training_mode=True, feature_reduction=parameters.feature_reduction, document_adjustment_code=parameters.document_adjustment_code, remove_adjectives=parameters.remove_adjectives, synonyms_file=parameters.synonyms_file, n_jobs=parameters.number_of_jobs)
    corpus, classifications, idxs_to_remove, _docs_lemmas = feature_extractor.prepare(text_field=parameters.excel_column_with_text_data, class_field=parameters.excel_column_with_classification_data, preprocessed_data_file=parameters.preprocessed_data_file)
    if parameters.final_training:
        X_train, y_train = feature_extractor.generate_X_y(corpus, classifications, training_mode=True)
    else:
        corpus_train, corpus_test, classifications_train, classifications_test = train_test_split(corpus, classifications, parameters.test_subset_size, parameters.preprocessed_data_file, parameters.force_subsets_regeneration, idxs_to_remove)
        X_train, y_train = feature_extractor.generate_X_y(corpus_train, classifications_train, training_mode=True)
        X_test, y_test = feature_extractor.generate_X_y(corpus_test, classifications_test, training_mode=False)
    X_train, y_train = resample(parameters.resampling, X_train, y_train)
    logger.info("Running classifiers.")
    p = classifiers.Pipeline(parameters.classifiers)
    logger.info("Accuracies:")
    if parameters.final_training:
        p.start(X_train, y_train, n_jobs=parameters.number_of_jobs, set_n_accepted_probs=parameters.set_num_accepted_probs, class_weight=parameters.class_weights, generate_roc_plots=parameters.generate_roc_plots)
    else:
        predictions_dict = p.start(X_train, y_train, X_test, y_test, parameters.number_of_jobs, parameters.set_num_accepted_probs, parameters.class_weights, parameters.generate_roc_plots)
        dump_json(predictions_dict, 'predictions.json')
    execution_info['End date'] = [functions.get_local_time_str()]
    logger.debug("Execution completed.")
    if not parameters.final_training:
        functions.generate_report(execution_info, parameters.__dict__, predictions_dict)
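main() expects a parameters object exposing the attributes it reads above. A hypothetical invocation follows; every attribute name is taken from the usages inside main(), but the values are illustrative assumptions (the real project builds this object from its own configuration or argument parsing).
from argparse import Namespace

# Placeholder values chosen only for illustration.
parameters = Namespace(
    excel_file='example.xlsx',
    preprocess_data=True,
    preprocessed_data_file='data.pkl',
    excel_column_with_text_data='Example column',
    excel_column_with_classification_data='Classification column',
    mosestokenizer_language_code='en',
    spell_checker_lang=None,
    number_of_jobs=1,
    nltk_stop_words_package='english',
    vectorizer='TfidfVectorizer',
    feature_reduction=None,
    document_adjustment_code=None,
    remove_adjectives=False,
    synonyms_file=None,
    final_training=False,
    test_subset_size=0.3,
    force_subsets_regeneration=False,
    resampling=None,
    set_num_accepted_probs=1,
    class_weights=None,
    generate_roc_plots=False,
    classifiers=[],
)
main(parameters)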
Example no. 6
def test_get_documents(capsys):
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        dump_documents(docs1, filename)
        for d1, d2 in [(None, '100%|'),
                       ('Loading documents', 'Loading documents: 100%|')]:
            docs2 = list(get_documents(filename, description=d1))
            for doc1, doc2 in zip_longest(docs1, docs2):
                assert repr(doc1) == repr(doc2)
            captured = capsys.readouterr()
            assert captured.out == ''
            assert captured.err[captured.err.rfind('\r') + 1:].startswith(d2)
            assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
                's/doc]\n')
    finally:
        remove_and_check(filename)
Example no. 7
def test_set_docs_metadata(capsys):
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        metadata1 = pickle_manager.get_docs_metadata(filename)
        metadata2 = metadata1.copy()
        metadata2['new_field'] = 'test_field_value'
        assert metadata1 != metadata2
        pickle_manager.set_docs_metadata(metadata2, filename)
        assert pickle_manager.get_docs_metadata(filename) == metadata2
        docs2 = list(pickle_manager.get_documents(filename))
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') +
                            1:] == 'Storing subsets: 0MB [00:00, ?MB/s]\n'
    finally:
        remove_and_check(filename)
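One way set_docs_metadata could behave, consistent with this test (the 'Storing subsets' progress output suggests the file is rewritten), is to replace the leading metadata object while preserving the stored documents. This is an assumption, not the project's implementation; PickleDumpAppend and its methods are the ones already used in Example no. 2.
# Sketch (assumption): rewrite the pickle file with the new metadata in front and
# the previously stored documents after it.
def set_docs_metadata_sketch(metadata, filename):
    docs = list(pickle_manager.get_documents(filename))
    pda = pickle_manager.PickleDumpAppend(metadata, filename)
    for doc in docs:
        pda.dump_append(doc)
    pda.close()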
Example no. 8
def test_get_documents():
    df = read_excel(example_excel_file)
    docs1 = data_frame_to_document_list(df)
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        docs2 = list(pickle_manager.get_documents(filename))
        for doc1, doc2 in zip_longest(docs1, docs2):
            assert repr(doc1) == repr(doc2)
        try:
            f = open(filename, 'ab')
            dump(obj=0, file=f, protocol=pickle_manager._pickle_protocol)
            f.close()
            docs2 = list(pickle_manager.get_documents(filename))
            for doc1, doc2 in zip_longest(docs1, docs2):
                assert repr(doc1) == repr(doc2)
            pytest.fail()
        except Exception as e:
            assert len(e.args) == 1
            assert e.args[
                0] == "The file '%s' has more documents than indicated in the metadata." % (
                    filename)
    finally:
        remove_and_check(filename)
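The failure message asserted above pins down how get_documents is expected to react when the file holds extra objects. A minimal sketch of that behavior, assuming the metadata-then-documents layout described earlier (not the project's actual generator):
import pickle

# Assumption: yield exactly metadata['total'] documents and raise if any extra
# pickled object remains in the file.
def get_documents_sketch(filename):
    with open(filename, 'rb') as f:
        metadata = pickle.load(f)
        for _ in range(metadata['total']):
            yield pickle.load(f)
        try:
            pickle.load(f)
        except EOFError:
            return
        raise Exception("The file '%s' has more documents than indicated in the metadata." % filename)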
Example no. 9
def test_train_test_split():
    text_field = 'Example column'
    df = read_excel(example_excel_file)
    docs = data_frame_to_document_list(df)
    preprocessor = Preprocessor()
    preprocessor.preprocess(text_field, None, docs)
    ft = FeatureExtractor()
    corpus, classifications, _, _ = ft.prepare(
        text_field=text_field,
        class_field='Classification column',
        preprocessed_data_file=None,
        docs=docs,
        training_mode=False)
    test_size = 0.3
    preprocessed_data_file = generate_available_filename()
    force = False
    idxs_to_remove = [5]
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        assert pickle_manager.get_docs_metadata(preprocessed_data_file) == {
            'total': 10
        }
        desired = {
            'total': 10,
            'test_size': test_size,
            'training_set_indexes': np.array([6, 1, 0, 2, 8, 3]),
            'test_set_indexes': np.array([7, 9, 4])
        }
        for my_force in [False, True]:
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, my_force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        for key in ['test_size', 'training_set_indexes', 'test_set_indexes']:
            m = desired.copy()
            m[key] = None
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file),
                desired)
        for key, value in [('test_size', 0.2),
                           ('training_set_indexes', np.array([1, 0, 2, 8,
                                                              3]))]:
            m = desired.copy()
            m[key] = value
            pickle_manager.set_docs_metadata(m, preprocessed_data_file)
            train_test_split.train_test_split(corpus, classifications,
                                              test_size,
                                              preprocessed_data_file, force,
                                              idxs_to_remove)
            np.testing.assert_equal(
                pickle_manager.get_docs_metadata(preprocessed_data_file), m)
    finally:
        remove_and_check(preprocessed_data_file)
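The sequence of assertions above implies when the split stored in the metadata is regenerated: always when forced, when any of the three split fields is missing (None), and otherwise left alone. A rough sketch under those assumptions follows (the deterministic expected indexes suggest a fixed random seed in the real code; pickle_manager is assumed imported from the project):
import numpy as np

# Assumption drawn from the test, not the project's code: regenerate the split only
# when forced or when a split field is missing, then store it in the pickle metadata.
def train_test_split_sketch(corpus, classifications, test_size,
                            preprocessed_data_file, force, idxs_to_remove):
    metadata = pickle_manager.get_docs_metadata(preprocessed_data_file)
    keys = ('test_size', 'training_set_indexes', 'test_set_indexes')
    if force or any(metadata.get(k) is None for k in keys):
        usable = [i for i in range(metadata['total']) if i not in idxs_to_remove]
        shuffled = np.random.permutation(usable)
        n_test = round(len(shuffled) * test_size)
        metadata['test_size'] = test_size
        metadata['test_set_indexes'] = shuffled[:n_test]
        metadata['training_set_indexes'] = shuffled[n_test:]
        pickle_manager.set_docs_metadata(metadata, preprocessed_data_file)
    train_idx = metadata['training_set_indexes']
    test_idx = metadata['test_set_indexes']
    # Returning the four subsets mirrors how main() unpacks train_test_split.
    return ([corpus[i] for i in train_idx], [corpus[i] for i in test_idx],
            [classifications[i] for i in train_idx],
            [classifications[i] for i in test_idx])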
Example no. 10
def test_preprocess(capsys):
    text_field = 'Test field'
    index = -1
    fields = {text_field: 'Teste\r\nvalue with\ra\nfew tikens. ' * 2}
    analyzed_sentences1 = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tikens',
            'lemma': 'tikens',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    analyzed_sentences2 = {
        text_field: [[{
            'form': 'Test',
            'lemma': 'test',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tokens',
            'lemma': 'token',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    for spell_checker_lang, analyzed_sentences in [(None, analyzed_sentences1),
                                                   ('en_US',
                                                    analyzed_sentences2)]:
        doc = Document(index=index, fields=fields, analyzed_sentences=dict())
        p = Preprocessor(spell_checker_lang=spell_checker_lang)
        assert p.stop is False
        p.preprocess(text_field=text_field,
                     preprocessed_data_file=None,
                     docs=[doc] * 2)
        assert p.stop is False
        assert doc.index == index
        assert doc.fields == fields
        assert doc.analyzed_sentences == analyzed_sentences
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') +
                            1:].startswith('Preprocessing: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
            's/doc]\n')
        p.stop = True
        with pytest.raises(SystemExit):
            p.preprocess(text_field=text_field,
                         preprocessed_data_file=None,
                         docs=[doc] * 2)
        del p
        if spell_checker_lang is not None:
            rmtree('./hunspell')
    docs = [
        Document(index=index, fields=fields, analyzed_sentences=dict())
        for index in range(2)
    ]
    preprocessed_data_file = utils.generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        pickle_manager.check_data(preprocessed_data_file)
        p = Preprocessor(store_data=True)
        assert all([
            doc.analyzed_sentences == dict()
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        p.preprocess(text_field, preprocessed_data_file, None)
        assert all([
            doc.analyzed_sentences == analyzed_sentences1
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        pickle_manager.check_data(preprocessed_data_file)
    finally:
        utils.remove_and_check(preprocessed_data_file)
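All examples construct Document objects with index, fields, and analyzed_sentences, and compare them via repr. A hypothetical minimal class compatible with that usage is sketched below; the project's real class likely does more, and the exact repr format is an assumption.
# Hypothetical Document, inferred from the keyword arguments and attribute accesses
# in the tests above; not the project's actual definition.
class Document:
    def __init__(self, index, fields, analyzed_sentences):
        self.index = index
        self.fields = fields
        self.analyzed_sentences = analyzed_sentences

    def __repr__(self):
        return 'Document(index=%r, fields=%r, analyzed_sentences=%r)' % (
            self.index, self.fields, self.analyzed_sentences)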