Example #1
def test_prepare(capsys):
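    # prepare() must produce the same corpus, classifications and removal
    # indexes whether it receives the documents directly or reads them from a
    # pickled file, both with and without a synonyms file.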
    text_field = 'text field'
    class_field = 'class field'
    quantity = 2
    fields = {text_field: 'Teste value.', class_field: 'c1'}
    analyzed_sentences = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * quantity
    }
    docs1 = [
        Document(index=0, fields=fields,
                 analyzed_sentences=analyzed_sentences),
        Document(index=1, fields=fields, analyzed_sentences=None),
    ]
    synonyms_files = [None, 'contopt_0.1_r2_c0.0.txt']
    expected_corpus_str = [[' '.join(['teste value'] * quantity), ''],
                           [' '.join(['prova value'] * quantity), '']]
    expected_classifications = [[fields[class_field]] * quantity
                                ] * len(synonyms_files)
    expected_idxs_to_remove = [[1]] * len(synonyms_files)
    expected_corpus = [[['teste', 'value'] * quantity, []],
                       [['prova', 'value'] * quantity, []]]
    filename = generate_available_filename()
    try:
        pickle_manager.dump_documents(docs1, filename)
        for i, synonyms_file in enumerate(synonyms_files):
            ft = FeatureExtractor(synonyms_file=synonyms_file)
            for training_mode in [True, False]:
                corpus_str1, classifications1, idxs_to_remove1, corpus1 = ft.prepare(
                    text_field, class_field, None, docs1, training_mode)
                corpus_str2, classifications2, idxs_to_remove2, corpus2 = ft.prepare(
                    text_field, class_field, filename, None, training_mode)
                assert (corpus_str1, classifications1, idxs_to_remove1,
                        corpus1) == (corpus_str2, classifications2,
                                     idxs_to_remove2, corpus2)
                assert corpus_str1 == expected_corpus_str[i]
                assert classifications1 == expected_classifications[i]
                assert idxs_to_remove1 == expected_idxs_to_remove[i]
                assert corpus1 == expected_corpus[i]
                captured = capsys.readouterr()
                assert captured.out == ''
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Preparing to create classification: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
            if synonyms_file is not None:
                remove_and_check(synonyms_file)
    finally:
        remove_and_check(filename)
Example #2
def test__filter():
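    # _filter returns the lemmas of a document's analyzed sentences, skipping
    # tokens whose upostag is listed in upostags_to_ignore.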
    text_field = 'text field'
    fields = {text_field: 'Teste value.', 'class field': 'c1'}
    doc = Document(index=-1, fields=fields, analyzed_sentences=None)
    upostags_to_ignore = ['PUNCT']
    assert FeatureExtractor._filter(doc, text_field, upostags_to_ignore) == []
    doc.analyzed_sentences = {
        text_field: [[{
            'form': 'Test',
            'lemma': 'test',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    assert FeatureExtractor._filter(
        doc, text_field, upostags_to_ignore) == ['test', 'value'] * 2
    upostags_to_ignore.clear()
    assert FeatureExtractor._filter(
        doc, text_field, upostags_to_ignore) == ['test', 'value', '.'] * 2
Example #3
def test_from_data_frame():
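    # from_data_frame must reject invalid argument types and otherwise build a
    # Document from the indicated DataFrame row.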
    not_data_frame = 'test_str'
    not_int = 'test_str'
    params = [[not_data_frame, -1], [DataFrame(), not_int]]
    for df, index in params:
        with pytest.raises(AssertionError):
            Document.from_data_frame(df, index)
    df = read_excel(example_excel_file)
    index = 0
    doc = Document.from_data_frame(df, index)
    assert doc.index == index
    assert sorted(doc.fields.keys()) == sorted(df.columns)
    assert str(doc.fields) == str(df.to_dict('records')[index])
    assert doc.analyzed_sentences == dict()
Example #4
def predict():
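    # Flask endpoint: validates the JSON payload, preprocesses the text and
    # returns class probabilities plus feature weights for the requested
    # classifier (loaded from '<classifier>.pkl' and cached in _classifiers).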
    global _text_field, _class_field, _preprocessor, _feature_extractor, _classifiers
    if not request.json:
        abort(BAD_REQUEST)
    text = request.json.get('text')
    classifier = request.json.get('classifier')
    if type(text) is not str:
        abort(BAD_REQUEST, 'Invalid text')
    if type(classifier) is not str:
        abort(BAD_REQUEST, 'Invalid classifier')
    if basename(classifier) != classifier:
        abort(BAD_REQUEST, 'Invalid classifier')
    doc = Document(index=-1,
                   fields={_text_field: text, _class_field: None},
                   analyzed_sentences=dict())
    _preprocessor.preprocess(text_field=_text_field, docs=[doc])
    corpus, classifications, _idxs_to_remove, docs_lemmas = _feature_extractor.prepare(
        text_field=_text_field,
        class_field=_class_field,
        docs=[doc],
        training_mode=False)
    X, _y = _feature_extractor.generate_X_y(corpus,
                                            classifications,
                                            training_mode=False)
    try:
        clf = _classifiers.get(classifier)
        if clf is None:
            clf = pickle_manager.load("%s.pkl" % classifier)
            _classifiers[classifier] = clf
        y_predict_proba = clf.predict_proba(X)
        probabilities = classifiers.predict_proba_to_dicts(clf.classes_, y_predict_proba)[0]
        feature_weights = get_feature_weights(clf, docs_lemmas[0])
        probabilities = DataFrame({'probabilities': probabilities}).to_dict('dict')
        return jsonify({**probabilities, **feature_weights})
    except FileNotFoundError:
        abort(BAD_REQUEST, 'Invalid classifier model')
Example #5
def test_copy():
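    # copy() must return a distinct Document holding equal attribute values.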
    test_dict = {'test_field': 'test_value'}
    for analyzed_sentences in [None, test_dict]:
        doc1 = Document(index=-1,
                        fields=test_dict,
                        analyzed_sentences=analyzed_sentences)
        doc2 = doc1.copy()
        assert doc1 is not doc2
        vars1 = doc1.__dict__
        vars2 = doc2.__dict__
        assert vars1 == vars2
        # Immutable values (ints, None) may be shared between the copies;
        # mutable attribute values must be distinct objects.
        assert all([
            type(vars1[var]) in (int, type(None))
            or vars1[var] is not vars2[var] for var in vars1
        ])
Example #6
def test___init__():
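    # The constructor must store the given index, fields and analyzed
    # sentences unchanged.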
    index = -1
    fields = dict()
    analyzed_sentences = dict()
    doc = Document(index, fields, analyzed_sentences)
    assert doc.index == index
    assert doc.fields == fields
    assert doc.analyzed_sentences == analyzed_sentences
Example #7
def test_initial_code_to_run_on_document():
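    # initial_code_to_run_on_document must leave the document unchanged.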
    index = -1
    fields = dict()
    analyzed_sentences = None
    doc = Document(index=index, fields=fields, analyzed_sentences=analyzed_sentences)
    assert doc.index == index
    assert doc.fields == fields
    assert doc.analyzed_sentences == analyzed_sentences
    initial_code_to_run_on_document(doc)
    assert doc.index == index
    assert doc.fields == fields
    assert doc.analyzed_sentences == analyzed_sentences
Example #8
def test___repr__():
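    # repr() must expose the document's attributes in a fixed format.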
    doc = Document(index=-1, fields=dict(), analyzed_sentences=dict())
    assert repr(
        doc
    ) == "Document: {'index': -1, 'fields': {}, 'analyzed_sentences': {}}"
Example #9
def test_preprocess(capsys):
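    # preprocess() must tokenize and lemmatize the text field, optionally
    # spell-checking it (hunspell) when spell_checker_lang is set, raise
    # SystemExit when the stop flag is set, and update pickled documents in
    # place when given a preprocessed data file.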
    text_field = 'Test field'
    index = -1
    fields = {text_field: 'Teste\r\nvalue with\ra\nfew tikens. ' * 2}
    analyzed_sentences1 = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tikens',
            'lemma': 'tikens',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    analyzed_sentences2 = {
        text_field: [[{
            'form': 'Test',
            'lemma': 'test',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tokens',
            'lemma': 'token',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    for spell_checker_lang, analyzed_sentences in [(None, analyzed_sentences1),
                                                   ('en_US',
                                                    analyzed_sentences2)]:
        doc = Document(index=index, fields=fields, analyzed_sentences=dict())
        p = Preprocessor(spell_checker_lang=spell_checker_lang)
        assert p.stop is False
        p.preprocess(text_field=text_field,
                     preprocessed_data_file=None,
                     docs=[doc] * 2)
        assert p.stop is False
        assert doc.index == index
        assert doc.fields == fields
        assert doc.analyzed_sentences == analyzed_sentences
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') +
                            1:].startswith('Preprocessing: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
            's/doc]\n')
        p.stop = True
        with pytest.raises(SystemExit):
            p.preprocess(text_field=text_field,
                         preprocessed_data_file=None,
                         docs=[doc] * 2)
        del p
        if spell_checker_lang is not None:
            rmtree('./hunspell')
    docs = [
        Document(index=index, fields=fields, analyzed_sentences=dict())
        for index in range(2)
    ]
    preprocessed_data_file = utils.generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        pickle_manager.check_data(preprocessed_data_file)
        p = Preprocessor(store_data=True)
        assert all([
            doc.analyzed_sentences == dict()
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        p.preprocess(text_field, preprocessed_data_file, None)
        assert all([
            doc.analyzed_sentences == analyzed_sentences1
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        pickle_manager.check_data(preprocessed_data_file)
    finally:
        utils.remove_and_check(preprocessed_data_file)
Example #10
def data_frame_to_document_list(data_frame):
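    # Convert each row of the DataFrame into a Document whose index is the
    # row's position.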
    documents = []
    for i in range(len(data_frame)):
        d = Document.from_data_frame(data_frame=data_frame, index=i)
        documents.append(d)
    return documents