# Demo 1: hash dict-style count features into a sparse matrix and fit a
# multinomial naive Bayes classifier on them.
train_datas = [
    {'monkey': 1, 'dog': 1, 'cat': 2, 'elephant': 4},
    {'dog': 2, 'run': 5},
]
# NOTE(review): `non_negative` was deprecated in scikit-learn 0.19 and removed
# in 0.21 (`alternate_sign=False` is the documented replacement). It is left
# unchanged here so the script keeps working on the release it was written
# for -- confirm the target scikit-learn version before switching.
feature_hasher = FeatureHasher(n_features=2 ** 20, non_negative=True)
train_datas = feature_hasher.transform(train_datas)

# Dense sample matrix kept for reference only (it was an unused string
# literal in the original script):
# X = np.array([[1, 2, 4, 1, 1, 1], [3, 2, 4, 2, 2, 3], [2, 2, 3, 4, 4, 1],
#               [2, 0, 3, 2, 3, 1], [2, 0, 0, 3, 3, 3], [2, 3, 1, 0, 3, 4]])
class_label = np.array([1, 2])

# Tune the smoothing factor.
clf = MultinomialNB(alpha=0.01)
train = clf.fit(train_datas, class_label)

# The test sample must go through the same hasher as the training data.
test_datas = [{'monkey': 3, 'mouse': 1}]
test_datas = feature_hasher.transform(test_datas)
test = clf.predict(test_datas)

# Single-argument print(...) calls produce the same output on Python 2 and
# also run on Python 3 (the original used Python 2-only print statements).
print(train_datas)
print(test_datas)
print(train)
print(test)
# _joint_log_likelihood is a private estimator method; used for inspection.
print(clf._joint_log_likelihood(test_datas))
print(clf.__dict__)

# Demo 2: compare the averaging modes of sklearn's precision_score.
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
print("macro: %s" % (precision_score(y_true, y_pred, average='macro'),))
print("micro: %s" % (precision_score(y_true, y_pred, average='micro'),))
print("none: %s" % (precision_score(y_true, y_pred, average=None),))
'run': 5 }] feature_hasher = FeatureHasher(n_features=2**20, non_negative=True) train_datas = feature_hasher.transform(train_datas) """X = np.array([[1, 2, 4, 1, 1, 1], [3, 2, 4, 2, 2, 3], [2, 2, 3, 4, 4, 1], [2, 0, 3, 2, 3, 1], [2, 0, 0, 3, 3, 3], [2, 3, 1, 0, 3, 4]])""" class_label = np.array([1, 2]) # 调整平滑因子 clf = MultinomialNB(alpha=0.01) train = clf.fit(train_datas, class_label) test_datas = [{'monkey': 3, 'mouse': 1}] test_datas = feature_hasher.transform(test_datas) test = clf.predict(test_datas) print train_datas print test_datas print train print test print clf._joint_log_likelihood(test_datas) print clf.__dict__ # test metrics in sklearn y_true = [0, 1, 2, 0, 1, 2] y_pred = [0, 2, 1, 0, 0, 1] print "macro:", precision_score(y_true, y_pred, average='macro') print "micro:", precision_score(y_true, y_pred, average='micro') print "none:", precision_score(y_true, y_pred, average=None)
def _naive_bayes_predict(table, model, suffix, display_log_prob=False,
                         prediction_col='prediction',
                         prob_prefix='probability',
                         log_prob_prefix='log_probability',
                         display_joint_log_likelihood=False,
                         joint_log_likelihood_prefix='joint_log_likelihood'):
    """Predict with a naive Bayes model and append the results to a copy of ``table``.

    Args:
        table: Input DataFrame. It is not modified; the output is a new frame.
        model: Dict describing the trained model. Feature columns are read
            from ``model['features']`` (or ``model['feature_cols']``). The
            estimator is ``model['nb_model']`` when present; otherwise it is
            rebuilt from ``model['table_1']`` (columns ``model_type``, ``pi``
            and ``theta``). An optional ``model['label_encoder']`` is used to
            map predictions back to the original labels.
        suffix: ``'label'`` names the per-class columns after
            ``label_encoder.classes_``; any other value uses positional
            indices. Only consulted when a label encoder is present.
        display_log_prob: When truthy, also append log-probability columns.
        prediction_col: Name of the prediction column.
        prob_prefix: Prefix of the per-class probability columns.
        log_prob_prefix: Prefix of the per-class log-probability columns.
        display_joint_log_likelihood: When truthy, also append the joint
            log-likelihood columns.
        joint_log_likelihood_prefix: Prefix of the joint log-likelihood
            columns.

    Returns:
        ``{'out_table': DataFrame}`` holding the input columns followed by the
        prediction column, the probability columns and, when requested, the
        log-probability and joint log-likelihood columns (in that order).
    """
    feature_cols = model['features'] if 'features' in model else model['feature_cols']
    _, features = check_col_type(table, feature_cols)

    if 'nb_model' in model:
        nb_model = model['nb_model']
    else:
        # Legacy model format: only the learned parameters were stored, so
        # rebuild an estimator around them.
        model_table = model['table_1']
        if model_table.model_type[0] == 'multinomial':
            nb_model = MultinomialNB()
        else:
            nb_model = BernoulliNB()
        # Dummy fit so the estimator counts as fitted; every learned
        # attribute it sets is overwritten below.
        nb_model.fit([[1]], [1])
        nb_model.class_log_prior_ = model_table.pi.values
        nb_model.feature_log_prob_ = np.array(list(model_table.theta))
        # One class id per stored prior (yields [0, 1] for a binary model).
        nb_model.classes_ = np.arange(len(nb_model.class_log_prior_))
        # The dummy fit recorded a single input feature; newer scikit-learn
        # releases validate the input width against n_features_in_, so align
        # it with the restored parameters.
        if nb_model.feature_log_prob_.ndim == 2:
            nb_model.n_features_in_ = nb_model.feature_log_prob_.shape[1]

    prediction = nb_model.predict(features)
    prob = nb_model.predict_proba(features)

    if 'label_encoder' in model:
        label_encoder = model['label_encoder']
        prediction = label_encoder.inverse_transform(prediction)
        if suffix == 'label':
            suffixes = label_encoder.classes_
        else:
            suffixes = range(len(label_encoder.classes_))
    else:
        # One positional suffix per probability column ([0, 1] when binary).
        suffixes = range(prob.shape[1])

    def _class_frame(values, prefix):
        # Reuse the table's index so the axis=1 concat below joins rows
        # one-to-one even when `table` has a non-default index.
        columns = ['{prefix}_{suffix}'.format(prefix=prefix, suffix=class_suffix)
                   for class_suffix in suffixes]
        return pd.DataFrame(data=values, columns=columns, index=table.index)

    # Work on a copy so the caller's table does not gain a prediction column.
    result = table.copy()
    result[prediction_col] = prediction

    frames = [result, _class_frame(prob, prob_prefix)]
    if display_log_prob:
        frames.append(
            _class_frame(nb_model.predict_log_proba(features), log_prob_prefix))
    if display_joint_log_likelihood:
        # Private estimator method; evaluated only when actually requested.
        frames.append(
            _class_frame(nb_model._joint_log_likelihood(features),
                         joint_log_likelihood_prefix))

    return {'out_table': pd.concat(frames, axis=1)}