def infer_classic(model_save_path, test_data_path, thresholds=0.5,
                  pred_save_path=None, vectorizer_path=None, col_sep=',',
                  num_classes=2, feature_type='tf'):
    # load model
    model = load_pkl(model_save_path)
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=vectorizer_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    if num_classes == 2:
        # binary classification: threshold the positive-class probability
        label_pred_probas = model.predict_proba(data_feature)[:, 1]
        label_pred = label_pred_probas > thresholds
    else:
        label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
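# Hedged aside (illustrative, not part of this repo): a self-contained sketch of
# the same thresholding pattern used above -- take the positive-class column of
# predict_proba and compare it against a cutoff. The dataset and model here are
# stand-ins from scikit-learn.
def _demo_binary_threshold(threshold=0.5):
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    model = LogisticRegression().fit(X, y)
    # positive-class probability, thresholded to a hard 0/1 label
    pos_proba = model.predict_proba(X)[:, 1]
    return pos_proba > threshold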
def train_xgboost_lr(data_path, vectorizer_path=None, xgblr_xgb_model_path=None,
                     xgblr_lr_model_path=None, feature_encoder_path=None,
                     feature_type='tfidf_char', col_sep='\t'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    model = XGBLR(xgblr_xgb_model_path, xgblr_lr_model_path, feature_encoder_path)
    # fit
    model.train_model(X_train, y_train)
    # evaluate
    label_pred = model.predict(X_val)
    simple_evaluate(y_val, label_pred)
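# Hedged sketch (an assumption about the idea behind XGBLR, whose implementation
# lives elsewhere in the repo): the classic XGBoost-leaves-into-LR stacking.
# Each sample is encoded by the leaf index it falls into in every tree, the leaf
# indices are one-hot encoded, and a logistic regression is fit on top.
def _demo_xgb_lr_sketch(X_train, y_train, X_val):
    from xgboost import XGBClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import OneHotEncoder
    xgb = XGBClassifier(n_estimators=30, max_depth=3).fit(X_train, y_train)
    encoder = OneHotEncoder(handle_unknown='ignore')
    # apply() returns the leaf index of each sample in each tree
    train_leaves = encoder.fit_transform(xgb.apply(X_train))
    lr = LogisticRegression(max_iter=1000).fit(train_leaves, y_train)
    return lr.predict(encoder.transform(xgb.apply(X_val)))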
def infer_classic(model_type='xgboost_lr', model_save_path='', label_vocab_path='',
                  test_data_path='', pred_save_path='', feature_vec_path='',
                  col_sep='\t', feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data=data_set, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)
    # predict
    pred_label_probs = model.predict_proba(data_feature)
    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save_predict_result(pred_output, true_labels=None,
                        pred_save_path=pred_save_path, data_set=data_set)
    # evaluate
    if true_labels:
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            # fall back to label ids when the labels cannot be encoded for printing
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
        except Exception:
            print("evaluate error: no valid true labels")
    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight_dict = load_dict(config.lr_feature_weight_path)
        pred_labels = cal_multiclass_lr_predict(data_set, feature_weight_dict, id_label)
        print(pred_labels[:5])
def infer_deep_model(model_type='cnn', data_path='', model_save_path='',
                     label_vocab_path='', max_len=300, batch_size=128,
                     col_sep='\t', pred_save_path=None):
    from keras.models import load_model
    # load data content
    data_set, true_labels = data_reader(data_path, col_sep)
    # init feature
    # the han model needs a [doc, sentence] feature (rank 3); others use a [sentence] feature (rank 2)
    if model_type == 'han':
        feature_type = 'doc_vectorize'
    else:
        feature_type = 'vectorize'
    feature = Feature(data_set, feature_type=feature_type, is_infer=True, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    model = load_model(model_save_path)
    # predict; in keras, predict_proba is the same as predict
    pred_label_probs = model.predict(data_feature, batch_size=batch_size)
    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [prob.argmax() for prob in pred_label_probs]
    pred_labels = [id_label[i] for i in pred_labels]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save_predict_result(pred_output, true_labels=None,
                        pred_save_path=pred_save_path, data_set=data_set)
    if true_labels:
        # evaluate
        assert len(pred_labels) == len(true_labels)
        for label, prob in zip(true_labels, pred_label_probs):
            logger.debug('label_true:%s\tprob_label:%s\tprob:%s'
                         % (label, id_label[prob.argmax()], prob.max()))
        print('total eval:')
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
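# Hedged aside (illustrative, not repo code): the per-row argmax/max list
# comprehensions above can be vectorized over the whole probability matrix.
def _demo_vectorized_argmax():
    import numpy as np
    probs = np.array([[0.1, 0.7, 0.2],
                      [0.5, 0.3, 0.2]])  # shape (n_samples, n_classes)
    pred_ids = probs.argmax(axis=1)      # predicted class id per row
    pred_confidence = probs.max(axis=1)  # its probability
    return pred_ids, pred_confidence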
def _get_feature(self, word_vocab):
    # extract features
    print("feature_type : %s" % self.feature_type)
    print("seg_contents:")
    print(self.seg_contents[:2])
    feature = Feature(data=self.seg_contents, feature_type=self.feature_type,
                      feature_vec_path=self.feature_vec_path, word_vocab=word_vocab)
    # get data feature
    return feature.get_feature()
def infer_xgboost_lr(test_data_path, vectorizer_path=None, xgblr_xgb_model_path=None,
                     xgblr_lr_model_path=None, feature_encoder_path=None, col_sep='\t',
                     pred_save_path=None, feature_type='tfidf_char'):
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=vectorizer_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    model = XGBLR(xgblr_xgb_model_path, xgblr_lr_model_path, feature_encoder_path)
    # predict
    label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
def train_classic(model_type, data_path=None, pr_figure_path=None, model_save_path=None,
                  vectorizer_path=None, col_sep=',', thresholds=0.5, num_classes=2,
                  feature_type='tfidf_char'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, thresholds=thresholds, num_classes=num_classes,
         model_type=model_type, pr_figure_path=pr_figure_path)
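# Hedged sketch (an assumption about what the PR-figure part of eval() does; the
# repo's own eval() is defined elsewhere): sklearn's precision_recall_curve over
# the validation probabilities, saved to pr_figure_path.
def _demo_pr_curve(model, X_val, y_val, pr_figure_path='pr_curve.png'):
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_curve
    pos_proba = model.predict_proba(X_val)[:, 1]
    precision, recall, _ = precision_recall_curve(y_val, pos_proba)
    plt.plot(recall, precision)
    plt.xlabel('recall')
    plt.ylabel('precision')
    plt.savefig(pr_figure_path)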
def train_classic(model_type='logistic_regression', data_path='', model_save_path='',
                  feature_vec_path='', col_sep='\t', feature_type='tfidf_word',
                  min_count=1, word_vocab_path='', label_vocab_path='', pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature_type %s is for deep models; falling back to tfidf_word.' % feature_type)
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model (XGBLR saves itself through its own model paths)
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show the top features of each category
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(), weight),
                                    key=lambda k: k[1], reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)
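# Hedged, self-contained illustration (not repo code) of the coefficient
# inspection above: pair each tf-idf feature name with its logistic-regression
# weight and sort to get the most indicative terms per class.
def _demo_lr_top_features():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    docs = ["good movie great plot", "bad movie awful plot"]
    labels = [1, 0]
    vec = TfidfVectorizer()
    X = vec.fit_transform(docs)
    lr = LogisticRegression().fit(X, labels)
    names = vec.get_feature_names_out()  # get_feature_names() on older sklearn
    # highest-weight terms for the positive class first
    return sorted(zip(names, lr.coef_[0]), key=lambda k: k[1], reverse=True)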
def train_deep_model(model_type='cnn', data_path='', model_save_path='',
                     word_vocab_path='', label_vocab_path='', min_count=1,
                     max_len=300, batch_size=128, nb_epoch=10, embedding_dim=128,
                     hidden_dim=128, col_sep='\t', num_filters=512,
                     filter_sizes='3,4,5', dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('shape of label tensor: %s' % str(data_label.shape))
    # init feature
    # the han model needs a [doc, sentence] feature (rank 3); others use a [sentence] feature (rank 2)
    if model_type == 'han':
        logger.info('Hierarchical Attention Network model feature_type must be: doc_vectorize')
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    feature = Feature(data=data_content, feature_type=feature_type,
                      word_vocab=word_vocab, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len, vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim, num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len, vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim, num_filters=num_filters,
                          filter_sizes=filter_sizes, num_classes=num_classes,
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len, vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len, vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                          num_classes=num_classes)
    # keep only the best model by validation accuracy
    cp = ModelCheckpoint(model_save_path, monitor='val_acc', verbose=1, save_best_only=True)
    # fit and save model
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch,
                        validation_data=(X_val, y_val), callbacks=[cp])
    logger.info('save model:%s' % model_save_path)
    plt_history(history, model_name=model_type)
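# Hedged sketch (an assumption about the shape of fasttext_model, which is
# defined elsewhere in the repo): a minimal fastText-style Keras classifier,
# i.e. word embeddings averaged over the sequence with a softmax on top.
def _demo_fasttext_model(max_len=300, vocabulary_size=20000,
                         embedding_dim=128, num_classes=2):
    from keras.models import Sequential
    from keras.layers import Embedding, GlobalAveragePooling1D, Dense
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=max_len))
    model.add(GlobalAveragePooling1D())  # average the word embeddings
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model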
    plt.legend(loc="best")
    plt.savefig(figure_path)
    return plt


if __name__ == "__main__":
    sys.path.append("../")
    from models.feature import Feature
    from models.reader import data_reader
    data_content, data_lbl = data_reader('../data/train_words.txt', '\t')
    # init feature
    feature = Feature(feature_type='tfidf_word', feature_vec_path='../output/temp')
    # get data feature
    data_feature = feature.get_feature(data_content)
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label, test_size=0.2)
    search_cv(X_train, y_train, X_val, y_val, model=SVC())
    # test plot_learning_curve
    title = "Learning Curves (SVC)"
    estimator = SVC()
    plot_learning_curve(estimator, title, X_train, y_train, cv=5,
                        n_jobs=4, figure_path='../output/curve.png')
    plt.show()
def infer_classic(model_type='xgboost_lr', model_save_path='', label_vocab_path='',
                  test_data_path='', pred_save_path='', feature_vec_path='',
                  col_sep='\t', feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)
    # predict
    pred_label_probs = model.predict_proba(data_feature)
    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save(pred_output, true_labels=None, pred_save_path=pred_save_path, data_set=data_set)
    if 'logistic_regression' in model_save_path and config.is_debug:
        # score the first few samples against the saved per-category feature weights
        count = 0
        features = load_pkl('output/lr_features.pkl')
        for line in data_set:
            if count > 5:
                break
            count += 1
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
            logger.debug('=' * 43)
    if true_labels:
        # evaluate
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
def train_deep_model(model_type='cnn', data_path='', model_save_path='',
                     word_vocab_path='', label_vocab_path='', min_count=1,
                     max_len=300, batch_size=128, nb_epoch=10, embedding_dim=128,
                     hidden_dim=128, col_sep='\t', num_filters=2,
                     filter_sizes='3,4,5', dropout=0.5):
    from keras.models import load_model
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split(" "))
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('shape of label tensor: %s' % str(data_label.shape))
    # init feature
    # the han model needs a [doc, sentence] feature (rank 3); others use a [sentence] feature (rank 2)
    if model_type == 'han':
        logger.info('Hierarchical Attention Network model feature_type must be: doc_vectorize')
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    # drop words that are not in the vocab
    word_dic = {}
    count = 1
    for word in word_vocab:
        word_dic[word] = count
        count += 1
    data_filter = []
    for line in data_content:
        line_filter = " ".join(list(filter(lambda x: x in word_dic, line.split(" "))))
        data_filter.append(line_filter)
    feature = Feature(data=data_filter, feature_type=feature_type,
                      word_vocab=word_vocab, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len, vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim, num_classes=num_classes)
    elif model_type == 'cnn':
        # reuse the previously trained cnn model
        model = load_model(model_save_path)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len, vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len, vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                          num_classes=num_classes)
    # predict on the validation split and dump true/predicted probabilities of category 2
    pre_label = model.predict(X_val, batch_size=32, verbose=0, steps=None)
    print(y_val)
    print(type(y_val))
    with open("./output/result", "w") as f:
        for i in range(len(y_val)):
            f.write("%s\t%f\n" % (y_val[i][2], pre_label[i][2]))
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.savefig(figure_path)
    return plt


if __name__ == "__main__":
    from models.feature import Feature
    from models.reader import data_reader
    data_content, data_lbl = data_reader('../data/training_new_seg_10k.txt', '\t')
    # init feature
    feature = Feature(data_content, feature_type='tfidf_word', feature_vec_path='../output/temp')
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label, test_size=0.2)
    search_cv(X_train, y_train, X_val, y_val, model=SVC())
    # test plot_learning_curve
    title = "Learning Curves (SVC)"
    estimator = SVC()
    plot_learning_curve(estimator, title, X_train, y_train, cv=5,
                        n_jobs=4, figure_path='../output/curve.png')
    plt.show()
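# Hedged sketch (an assumption about what plot_learning_curve wraps; the repo's
# version is defined above): sklearn's learning_curve produces the train/CV
# scores that the fragment above plots against train_sizes.
def _demo_learning_curve_scores(estimator, X, y):
    import numpy as np
    from sklearn.model_selection import learning_curve
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=5)
    # mean over the cv folds, one point per training-set size
    return train_sizes, np.mean(train_scores, axis=1), np.mean(test_scores, axis=1)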
def train_classic(model_type='logistic_regression', data_path='', model_save_path='',
                  feature_vec_path='', col_sep='\t', feature_type='tfidf_word',
                  min_count=1, word_vocab_path='', label_vocab_path='', pr_figure_path=''):
    logger.info("train classic model, model_type:{}, feature_type:{}".format(model_type, feature_type))
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    word_id = load_vocab(word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    print(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    logger.info('data size:%d' % len(data_content))
    logger.info('label size:%d' % len(data_lbl))
    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.error('feature_type %s is for deep models; falling back to tfidf_word.' % feature_type)
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab, is_infer=False)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        save_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)
    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        # save a word -> per-class-weight dict for later inspection
        feature_weight = {}
        word_dict_rev = sorted(word_id.items(), key=lambda x: x[1])
        for word, index in word_dict_rev:
            feature_weight[word] = list(map(float, model.coef_[:, index]))
        save_dict(feature_weight, config.lr_feature_weight_path)
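# Hedged usage sketch (illustrative only; it mirrors what cal_multiclass_lr_predict
# in the infer code presumably does with the dict saved above, but that helper is
# defined elsewhere): score a segmented sentence per class by summing its words'
# logistic-regression weights.
def _demo_score_with_feature_weights(feature_weight, sentence):
    # feature_weight: {word: [weight_for_class_0, weight_for_class_1, ...]}
    num_classes = len(next(iter(feature_weight.values())))
    scores = [0.0] * num_classes
    for w in sentence.split():
        for c, weight in enumerate(feature_weight.get(w, [0.0] * num_classes)):
            scores[c] += weight
    return scores.index(max(scores))  # predicted class id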