def test_prepare(capsys):
    """FeatureExtractor.prepare must yield identical results whether documents
    are passed in memory or loaded from a pickled file, with and without a
    synonyms file, in both training and non-training mode."""
    text_field = 'text field'
    class_field = 'class field'
    quantity = 2
    fields = {text_field: 'Teste value.', class_field: 'c1'}
    # Pre-analyzed sentences: two identical sentences of three tokens each;
    # the trailing '.' is tagged PUNCT so prepare() is expected to drop it.
    analyzed_sentences = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * quantity
    }
    # Second document has no analysis, so it should be flagged for removal.
    docs1 = [
        Document(index=0, fields=fields, analyzed_sentences=analyzed_sentences),
        Document(index=1, fields=fields, analyzed_sentences=None),
    ]
    # None -> no synonym replacement; the CONTO.PT file maps 'teste' -> 'prova'.
    synonyms_files = [None, 'contopt_0.1_r2_c0.0.txt']
    expected_corpus_str = [[' '.join(['teste value'] * quantity), ''],
                           [' '.join(['prova value'] * quantity), '']]
    expected_classifications = [[fields[class_field]] * quantity
                                ] * len(synonyms_files)
    # Index 1 (the unanalyzed document) is expected to be removed in all runs.
    expected_idxs_to_remove = [[1]] * len(synonyms_files)
    expected_corpus = [[['teste', 'value'] * quantity, []],
                       [['prova', 'value'] * quantity, []]]
    try:
        filename = generate_available_filename()
        pickle_manager.dump_documents(docs1, filename)
        for i, synonyms_file in enumerate(synonyms_files):
            ft = FeatureExtractor(synonyms_file=synonyms_file)
            for training_mode in [True, False]:
                # Same call twice: once with in-memory docs, once loading the
                # pickled file; both must produce identical outputs.
                corpus_str1, classifications1, idxs_to_remove1, corpus1 = ft.prepare(
                    text_field, class_field, None, docs1, training_mode)
                corpus_str2, classifications2, idxs_to_remove2, corpus2 = ft.prepare(
                    text_field, class_field, filename, None, training_mode)
                assert (corpus_str1, classifications1, idxs_to_remove1,
                        corpus1) == (corpus_str2, classifications2,
                                     idxs_to_remove2, corpus2)
                assert corpus_str1 == expected_corpus_str[i]
                assert classifications1 == expected_classifications[i]
                assert idxs_to_remove1 == expected_idxs_to_remove[i]
                assert corpus1 == expected_corpus[i]
                # prepare() reports progress on stderr only (tqdm-style bar
                # rewritten with '\r'); stdout must stay clean.
                captured = capsys.readouterr()
                assert captured.out == ''
                assert captured.err[captured.err.rfind('\r') + 1:].startswith(
                    'Preparing to create classification: 100%|')
                assert captured.err.endswith(
                    'doc/s]\n') or captured.err.endswith('s/doc]\n')
            # NOTE(review): collapsed source made this placement ambiguous —
            # presumably the downloaded synonyms file is removed once per
            # synonyms_files iteration; confirm against upstream history.
            if synonyms_file is not None:
                remove_and_check(synonyms_file)
    finally:
        remove_and_check(filename)
def test__filter():
    """FeatureExtractor._filter returns the lemmas of a document's analyzed
    sentences, skipping tokens whose upostag is in the ignore list."""
    text_field = 'text field'
    document = Document(index=-1,
                        fields={text_field: 'Teste value.', 'class field': 'c1'},
                        analyzed_sentences=None)
    ignored_tags = ['PUNCT']

    # No analysis available -> nothing to filter.
    assert FeatureExtractor._filter(document, text_field, ignored_tags) == []

    punct_token = {'form': '.', 'lemma': '.', 'upostag': 'PUNCT'}
    sentence = [
        {'form': 'Test', 'lemma': 'test', 'upostag': None},
        {'form': 'value', 'lemma': 'value', 'upostag': None},
        punct_token,
    ]
    document.analyzed_sentences = {text_field: [sentence] * 2}

    # PUNCT tokens are dropped from every sentence.
    expected_without_punct = ['test', 'value'] * 2
    assert FeatureExtractor._filter(document, text_field,
                                    ignored_tags) == expected_without_punct

    # With an empty ignore list every lemma is kept.
    ignored_tags.clear()
    assert FeatureExtractor._filter(document, text_field,
                                    ignored_tags) == ['test', 'value', '.'] * 2
def test_from_data_frame():
    """Document.from_data_frame validates its argument types and copies one
    DataFrame row into a Document."""
    bad_frame = 'test_str'
    bad_index = 'test_str'

    # Either a non-DataFrame or a non-int index must trip the assertions.
    for frame, row in ((bad_frame, -1), (DataFrame(), bad_index)):
        with pytest.raises(AssertionError):
            Document.from_data_frame(frame, row)

    frame = read_excel(example_excel_file)
    row = 0
    document = Document.from_data_frame(frame, row)
    assert document.index == row
    assert sorted(document.fields.keys()) == sorted(frame.columns)
    # The fields must mirror the row's record dict exactly.
    assert str(document.fields) == str(frame.to_dict('records')[row])
    assert document.analyzed_sentences == dict()
def predict():
    """Flask endpoint: classify the posted text with the requested classifier.

    Expects a JSON body with string fields 'text' and 'classifier'. Returns a
    JSON object combining the class probabilities with the per-lemma feature
    weights of the chosen (lazily loaded and cached) classifier.
    """
    global _text_field, _class_field, _preprocessor, _feature_extractor, _classifiers
    payload = request.json
    if not payload:
        abort(BAD_REQUEST)
    text = payload.get('text')
    classifier = payload.get('classifier')
    if type(text) is not str:
        abort(BAD_REQUEST, 'Invalid text')
    # Reject non-strings and any value containing path components, so the
    # classifier name cannot escape the working directory.
    if type(classifier) is not str or basename(classifier) != classifier:
        abort(BAD_REQUEST, 'Invalid classifier')
    document = Document(index=-1,
                        fields={_text_field: text, _class_field: None},
                        analyzed_sentences={})
    _preprocessor.preprocess(text_field=_text_field, docs=[document])
    corpus, classifications, _idxs_to_remove, docs_lemmas = _feature_extractor.prepare(
        text_field=_text_field,
        class_field=_class_field,
        docs=[document],
        training_mode=False)
    X, _y = _feature_extractor.generate_X_y(corpus,
                                            classifications,
                                            training_mode=False)
    try:
        model = _classifiers.get(classifier)
        if model is None:
            # First use of this classifier: load the pickled model and cache it.
            model = pickle_manager.load("%s.pkl" % classifier)
            _classifiers[classifier] = model
        proba_rows = model.predict_proba(X)
        class_probabilities = classifiers.predict_proba_to_dicts(
            model.classes_, proba_rows)[0]
        feature_weights = get_feature_weights(model, docs_lemmas[0])
        probabilities = DataFrame({
            'probabilities': class_probabilities
        }).to_dict('dict')
        return jsonify({**probabilities, **feature_weights})
    except FileNotFoundError:
        abort(BAD_REQUEST, 'Invalid classifier model')
def test_copy():
    """Document.copy returns a distinct object whose attributes compare equal."""
    sample_fields = {'test_field': 'test_value'}
    for sentences in (None, sample_fields):
        original = Document(index=-1,
                            fields=sample_fields,
                            analyzed_sentences=sentences)
        clone = original.copy()
        assert original is not clone
        assert original.__dict__ == clone.__dict__
        attrs_orig = original.__dict__
        attrs_clone = clone.__dict__
        assert attrs_orig == attrs_clone
        # NOTE(review): `var` iterates attribute *names* (always strings), so
        # `type(var) is not int` is always true and this assertion is vacuous.
        # Presumably the attribute *values* were meant; kept as-is to preserve
        # behavior — confirm intent before fixing.
        assert all([
            type(var) is not int
            or original.__dict__[var] is not clone.__dict__[var]
            for var in attrs_orig
        ])
def test___init__():
    """The Document constructor stores its three arguments unmodified."""
    idx = -1
    field_map = dict()
    sentences = dict()
    document = Document(idx, field_map, sentences)
    assert document.index == idx
    assert document.fields == field_map
    assert document.analyzed_sentences == sentences
def test_initial_code_to_run_on_document():
    """initial_code_to_run_on_document must leave the document untouched."""
    idx = -1
    field_map = dict()
    sentences = None
    document = Document(index=idx,
                        fields=field_map,
                        analyzed_sentences=sentences)

    def unchanged():
        # The document must still hold exactly the constructor arguments.
        assert document.index == idx
        assert document.fields == field_map
        assert document.analyzed_sentences == sentences

    unchanged()
    initial_code_to_run_on_document(document)
    unchanged()
def test___repr__():
    """repr() of a Document lists its index, fields and analyzed_sentences."""
    document = Document(index=-1, fields={}, analyzed_sentences={})
    expected = "Document: {'index': -1, 'fields': {}, 'analyzed_sentences': {}}"
    assert repr(document) == expected
def test_preprocess(capsys):
    """Preprocessor must tokenize/lemmatize documents, optionally spell-check
    them, honor the stop flag, and round-trip documents through a pickle file."""
    text_field = 'Test field'
    index = -1
    # Raw text with mixed \r\n / \r / \n line breaks and a misspelled word
    # ('tikens'), repeated twice.
    fields = {text_field: 'Teste\r\nvalue with\ra\nfew tikens. ' * 2}
    # Expected analysis without spell checking: tokens kept as written.
    analyzed_sentences1 = {
        text_field: [[{
            'form': 'Teste',
            'lemma': 'teste',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tikens',
            'lemma': 'tikens',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    # Expected analysis with the en_US spell checker: 'Teste' -> 'Test' and
    # 'tikens' -> 'tokens' are corrected before lemmatization.
    analyzed_sentences2 = {
        text_field: [[{
            'form': 'Test',
            'lemma': 'test',
            'upostag': None
        }, {
            'form': 'value',
            'lemma': 'value',
            'upostag': None
        }, {
            'form': 'with',
            'lemma': 'with',
            'upostag': None
        }, {
            'form': 'a',
            'lemma': 'a',
            'upostag': None
        }, {
            'form': 'few',
            'lemma': 'few',
            'upostag': None
        }, {
            'form': 'tokens',
            'lemma': 'token',
            'upostag': None
        }, {
            'form': '.',
            'lemma': '.',
            'upostag': 'PUNCT'
        }]] * 2
    }
    for spell_checker_lang, analyzed_sentences in [(None, analyzed_sentences1),
                                                   ('en_US',
                                                    analyzed_sentences2)]:
        doc = Document(index=index, fields=fields, analyzed_sentences=dict())
        p = Preprocessor(spell_checker_lang=spell_checker_lang)
        assert p.stop is False
        p.preprocess(text_field=text_field,
                     preprocessed_data_file=None,
                     docs=[doc] * 2)
        # A successful run must not flip the stop flag or alter the raw fields.
        assert p.stop is False
        assert doc.index == index
        assert doc.fields == fields
        assert doc.analyzed_sentences == analyzed_sentences
        # Progress is reported on stderr only (tqdm-style bar rewritten with
        # '\r'); stdout must stay clean.
        captured = capsys.readouterr()
        assert captured.out == ''
        assert captured.err[captured.err.rfind('\r') +
                            1:].startswith('Preprocessing: 100%|')
        assert captured.err.endswith('doc/s]\n') or captured.err.endswith(
            's/doc]\n')
        # Setting the stop flag must make the next preprocess() call exit.
        p.stop = True
        with pytest.raises(SystemExit):
            p.preprocess(text_field=text_field,
                         preprocessed_data_file=None,
                         docs=[doc] * 2)
        del (p)
        if spell_checker_lang is not None:
            # Remove the hunspell dictionaries downloaded for the spell checker.
            rmtree('./hunspell')
    # Second scenario: documents are read from / written back to a pickle file.
    docs = [
        Document(index=index, fields=fields, analyzed_sentences=dict())
        for index in range(2)
    ]
    preprocessed_data_file = utils.generate_available_filename()
    try:
        pickle_manager.dump_documents(docs, preprocessed_data_file)
        pickle_manager.check_data(preprocessed_data_file)
        p = Preprocessor(store_data=True)
        # Before preprocessing, every stored document has an empty analysis.
        assert all([
            doc.analyzed_sentences == dict()
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        p.preprocess(text_field, preprocessed_data_file, None)
        # Afterwards, every stored document carries the expected analysis.
        assert all([
            doc.analyzed_sentences == analyzed_sentences1
            for doc in pickle_manager.get_documents(preprocessed_data_file)
        ])
        pickle_manager.check_data(preprocessed_data_file)
    finally:
        utils.remove_and_check(preprocessed_data_file)
def data_frame_to_document_list(data_frame):
    """Convert every row of *data_frame* into a Document.

    Parameters:
        data_frame: a pandas DataFrame whose rows become Documents, each
            keeping its positional row number as the Document index.

    Returns:
        list of Document, one per row, in row order.
    """
    # Comprehension replaces the manual append loop (same order, same result).
    return [
        Document.from_data_frame(data_frame=data_frame, index=i)
        for i in range(len(data_frame))
    ]