Example no. 1
def predict():
    global _text_field, _class_field, _preprocessor, _feature_extractor, _classifiers
    # Validate the JSON payload: 'text' and 'classifier' must both be strings,
    # and the classifier name must not contain any path components.
    if not request.json:
        abort(BAD_REQUEST)
    text = request.json.get('text')
    classifier = request.json.get('classifier')
    if type(text) is not str:
        abort(BAD_REQUEST, 'Invalid text')
    if type(classifier) is not str:
        abort(BAD_REQUEST, 'Invalid classifier')
    if basename(classifier) != classifier:
        abort(BAD_REQUEST, 'Invalid classifier')
    # Wrap the input text in a Document and apply the same preprocessing and
    # feature extraction used during training (training_mode=False).
    doc = Document(index=-1, fields={_text_field: text, _class_field: None}, analyzed_sentences=dict())
    _preprocessor.preprocess(text_field=_text_field, docs=[doc])
    corpus, classifications, _idxs_to_remove, docs_lemmas = _feature_extractor.prepare(text_field=_text_field, class_field=_class_field, docs=[doc], training_mode=False)
    X, _y = _feature_extractor.generate_X_y(corpus, classifications, training_mode=False)
    try:
        # Load the requested classifier lazily and cache it for subsequent requests.
        clf = _classifiers.get(classifier)
        if clf is None:
            clf = pickle_manager.load("%s.pkl" % classifier)
            _classifiers[classifier] = clf
        # Return the class probabilities together with the feature weights for the document's lemmas.
        y_predict_proba = clf.predict_proba(X)
        probabilities = classifiers.predict_proba_to_dicts(clf.classes_, y_predict_proba)[0]
        feature_weights = get_feature_weights(clf, docs_lemmas[0])
        probabilities = DataFrame({'probabilities': probabilities}).to_dict('dict')
        return jsonify({**probabilities, **feature_weights})
    except FileNotFoundError:
        abort(BAD_REQUEST, 'Invalid classifier model')
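
A minimal client-side sketch of how this endpoint might be exercised, assuming the view is mapped to a /predict route on a locally running Flask server (the route, host, port, and payload values are assumptions, not shown in the snippet):

import requests

# Hypothetical URL; adjust to wherever the Flask app is actually served.
response = requests.post(
    'http://localhost:5000/predict',
    json={'text': 'A document to classify', 'classifier': 'RandomForestClassifier'},
)
print(response.json())  # expected to contain 'probabilities' plus the feature weights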
Example no. 2
# sklearn's class is imported under an alias so that it is not shadowed by this
# wrapper function, which shares its name (calling it unaliased would recurse).
from sklearn.decomposition import LatentDirichletAllocation as SklearnLDA


def LatentDirichletAllocation(X,
                              y,
                              filename='LatentDirichletAllocation.pkl',
                              n_jobs=1):
    logger.info("Running %s." % (LatentDirichletAllocation.__name__))
    if exists(filename):
        # Reuse a previously fitted model when the pickle file already exists.
        lda = pickle_manager.load(filename)
        X = lda.transform(X)
    else:
        lda = SklearnLDA(n_components=10,
                         doc_topic_prior=None,
                         topic_word_prior=None,
                         learning_method='batch',
                         learning_decay=0.7,
                         learning_offset=10.0,
                         max_iter=10,
                         batch_size=128,
                         evaluate_every=-1,
                         total_samples=1000000.0,
                         perp_tol=0.1,
                         mean_change_tol=0.001,
                         max_doc_update_iter=100,
                         n_jobs=n_jobs,
                         verbose=0,
                         random_state=random_state)
        logger.debug("%s configuration: %s" %
                     (lda.__class__.__name__, lda.__dict__))
        X = lda.fit_transform(X, y)
        pickle_manager.dump(lda, filename)
    return X, y
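
To illustrate what the wrapper does on its first (training) run, here is a standalone sketch that applies scikit-learn's LDA directly to a small toy corpus; the corpus and topic count are illustrative only:

from sklearn.decomposition import LatentDirichletAllocation as SklearnLDA
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat",
        "dogs and cats are common pets",
        "stock prices fell sharply today"]
counts = CountVectorizer().fit_transform(docs)      # document-term count matrix
lda = SklearnLDA(n_components=2, random_state=42)   # two latent topics
topic_distributions = lda.fit_transform(counts)     # shape: (n_docs, n_components)
print(topic_distributions.shape)                    # (3, 2)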
Example no. 3
def test_dump_and_load():
    # Pickle a random float to a temporary file, reload it, and check the round trip.
    obj1 = random()
    try:
        path = create_temporary_file(content=None, text=False)
        pickle_manager.dump(obj1, path)
        obj2 = pickle_manager.load(path)
    finally:
        remove_and_check(path)
    assert obj1 == obj2
Example no. 4
def test_PickleDumpAppend___init__():
    metadata = {'total': 0}
    filename = generate_available_filename()
    try:
        # Invalid argument types must trip the constructor's assertions.
        not_dict = 'test_str'
        not_str = -1
        params = [[not_dict, filename], [metadata, not_str]]
        for m, f in params:
            with pytest.raises(AssertionError):
                pda = pickle_manager.PickleDumpAppend(m, f)
        # A valid call writes the metadata to a temporary file that replaces
        # `filename` once close() is invoked.
        pda = pickle_manager.PickleDumpAppend(metadata, filename)
        assert pda.filename_upon_completion == filename
        assert exists(pda.file.name)
        pda.close()
        assert pickle_manager.load(filename) == metadata
        assert not exists(pda.file.name)
        assert exists(filename)
    finally:
        remove_and_check(filename)
Example no. 5
def test_Pipeline_start():
    # Expected predict_proba values for the first two test samples; the
    # reference numbers differ slightly between Linux and Windows builds.
    predict_probas_linux = {
        'RandomForestClassifier': [
            [1.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
        ],
        'BernoulliNB': [
            [1.0, 5.9253907982022474e-18, 9.24592247679012e-21],
            [5.086117678607322e-14, 0.9999999417850541, 5.821489476394197e-08],
        ],
        'MultinomialNB': [
            [1.0, 3.987155612430403e-87, 1.9843977254102716e-103],
            [1.1638109881136655e-141, 1.0, 4.902906597402722e-42],
        ],
        'ComplementNB': [
            [1.0, 1.244018908413837e-57, 2.372151728763692e-55],
            [1.2983800585685595e-35, 1.0, 3.836692075297123e-24],
        ],
        'KNeighborsClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'MLPClassifier': [
            [0.9999992330465266, 2.108350674827178e-08, 7.458699665987544e-07],
            [6.949799904570786e-10, 0.9999171940556058, 8.280524941418183e-05],
        ],
        'LinearSVC': [
            [0.8995782143576087, 0.02511044323694783, 0.07531134240544347],
            [0.03561932795252063, 0.9407083426933305, 0.023672329354149018],
        ],
        'DecisionTreeClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'ExtraTreeClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'DummyClassifier': [[0, 0, 1], [1, 0, 0]],
        'SGDClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        'BaggingClassifier': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
    }
    predict_probas_windows = {
        'ComplementNB': [
            [1.0, 1.2440189084141198e-57, 2.3721517287642315e-55],
            [1.2983800585685595e-35, 1.0, 3.836692075297123e-24],
        ],
        'MLPClassifier': [
            [0.9999992330465266, 2.108350674827178e-08, 7.458699665987557e-07],
            [6.949799904570761e-10, 0.9999171940556058, 8.280524941418183e-05],
        ],
        'LinearSVC': [
            [0.8995692949536029, 0.025113499736912265, 0.07531720530948487],
            [0.0356197780956943, 0.9407082394988142, 0.02367198240549154],
        ],
    }
    p = classifiers.Pipeline(clfs)
    clfs_names = [f.__name__ for f in p.classifiers]
    clfs_files = ['%s.pkl' % (clf_name) for clf_name in clfs_names]
    roc_files = ['ROC_%s.png' % (clf_name) for clf_name in clfs_names]
    X, y = load_digits(n_class=3, return_X_y=True)
    y = y.tolist()
    assert all([not exists(clf_file) for clf_file in clfs_files])
    try:
        # First run: n_jobs=-1 uses all cores; the first two predicted-probability
        # dicts of each classifier are checked against the expected values below.
        predictions = p.start(X, y, X, y, -1, {1, 2, 2, 4})
        for clf_name, clf_file in zip_longest(clfs_names, clfs_files):
            predict_proba = [list(d.values()) for d in predictions[clf_name][0:2]]
            assert np.array_equal(predict_probas_linux[clf_name], predict_proba) \
                    or np.array_equal(predict_probas_windows[clf_name], predict_proba)
            assert exists(clf_file)
            clf = pickle_manager.load(clf_file)
            if 'n_jobs' in dir(clf):
                assert clf.n_jobs == -1
            if 'class_weights' in dir(clf):
                assert clf.class_weights is None
        assert all([not exists(roc_file) for roc_file in roc_files])
        # Second run: class_weight='balanced' should propagate to the fitted classifiers.
        p.start(X, y, X, y, -1, {1, 2, 2, 4}, 'balanced')
        for clf_file in clfs_files:
            clf = pickle_manager.load(clf_file)
            if 'class_weights' in dir(clf):
                assert clf.class_weights == 'balanced'
        # Third run: the final flag enables generation of the ROC plot files.
        p.start(X, y, X, y, -1, {1, 2, 2, 4}, None, True)
        assert all([exists(roc_file) for roc_file in roc_files])
        # A classifier that raises during training must not abort the pipeline.
        classifiers.Pipeline([FailClassifier]).start(X, y, X, y)
        # Without test data only an empty ground-truth list is returned, and a
        # mismatched test set triggers an assertion error.
        predictions = p.start(X, y)
        assert predictions == {'y_true': []}
        with pytest.raises(AssertionError):
            p.start(X, y, X, [])
    finally:
        for clf_file in clfs_files:
            remove_and_check(clf_file)
        for roc_file in roc_files:
            remove_and_check(roc_file)
Example no. 6
def _get_vectorizer(vectorizer,
                    training_mode,
                    vectorizer_file="vectorizer.pkl"):
    token_pattern = r'\S+'
    if not training_mode and vectorizer not in [
            DocumentPoolEmbeddings.__name__
    ]:
        # In prediction mode, reuse the vectorizer fitted during training.
        v = pickle_manager.load(vectorizer_file)
        assert vectorizer == v.__class__.__name__
    elif vectorizer == TfidfVectorizer.__name__:
        v = TfidfVectorizer(input='content',
                            encoding='utf-8',
                            decode_error='strict',
                            strip_accents=None,
                            lowercase=True,
                            preprocessor=None,
                            tokenizer=None,
                            analyzer='word',
                            stop_words=[],
                            token_pattern=token_pattern,
                            ngram_range=(1, 1),
                            max_df=1.0,
                            min_df=1,
                            max_features=None,
                            vocabulary=None,
                            binary=False,
                            dtype=np.float64,
                            norm='l2',
                            use_idf=True,
                            smooth_idf=True,
                            sublinear_tf=False)
    elif vectorizer == CountVectorizer.__name__:
        v = CountVectorizer(input='content',
                            encoding='utf-8',
                            decode_error='strict',
                            strip_accents=None,
                            lowercase=True,
                            preprocessor=None,
                            tokenizer=None,
                            stop_words=[],
                            token_pattern=token_pattern,
                            ngram_range=(1, 1),
                            analyzer='word',
                            max_df=1.0,
                            min_df=1,
                            max_features=None,
                            vocabulary=None,
                            binary=False,
                            dtype=np.int64)
    elif vectorizer == HashingVectorizer.__name__:
        v = HashingVectorizer(input='content',
                              encoding='utf-8',
                              decode_error='strict',
                              strip_accents=None,
                              lowercase=True,
                              preprocessor=None,
                              tokenizer=None,
                              stop_words=[],
                              token_pattern=token_pattern,
                              ngram_range=(1, 1),
                              analyzer='word',
                              n_features=1048576,
                              binary=False,
                              norm='l2',
                              alternate_sign=True,
                              dtype=np.float64)
    elif vectorizer == DocumentPoolEmbeddings.__name__:
        v = DocumentPoolEmbeddings(
            [BertEmbeddings('bert-base-multilingual-uncased')])
    else:
        raise ValueError("Invalid vectorizer: %s" % (vectorizer))
    return v
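
A brief usage sketch for the function above, in training mode with the TF-IDF option; the toy corpus is illustrative, and the scikit-learn imports are assumed to be present in the surrounding module as the snippet implies:

# Build a fresh TF-IDF vectorizer (training mode) and fit it on a toy corpus.
v = _get_vectorizer('TfidfVectorizer', training_mode=True)
X = v.fit_transform(["first toy document", "second toy document about cats"])
print(X.shape)  # (2, number_of_distinct_tokens)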