def validate_model(validate_data, columns, mentioned_clf, clfs):
    """Score per-column classifiers on the validation set.

    Pipeline: `mentioned_clf` first decides, per row, whether the aspect is
    mentioned at all; rows it flags (predict == 1) are mapped to the
    "not mentioned" label -2, and only the remaining rows (predict == 0) are
    classified by the column-specific model in `clfs`.

    :param validate_data: DataFrame; column 1 holds the raw review content and
        each name in `columns` holds that aspect's gold labels.
    :param columns: label-column names to evaluate.
    :param mentioned_clf: binary mentioned/not-mentioned classifier.
    :param clfs: dict mapping column name -> trained sentiment classifier.
    """
    logger.info("Begin validate")
    content_validate = validate_data.iloc[:, 1]
    validate_data_segs = seg_words(content_validate)
    logger.debug("seg validate data done")
    scores = dict()
    predict = mentioned_clf.predict(validate_data_segs)
    # 1 ("not mentioned") -> -2, the dataset's not-mentioned label; 0 stays 0
    # and marks rows the per-column classifier must still decide.
    predict = predict * -2
    # Rows still undecided after the mentioned-filter; computed once, reused
    # for every column.
    zero_indices = np.flatnonzero(predict == 0)
    zero_segs = [validate_data_segs[i] for i in zero_indices]
    for column in columns:
        logger.debug("predict:%s", column)
        tmp_predict = predict.copy()
        # Batch-predict all undecided rows in one call instead of one
        # predict() per sample (the original per-row loop also assigned a
        # length-1 array into a scalar slot, which NumPy deprecates).
        if zero_indices.size:
            tmp_predict[zero_indices] = clfs[column].predict(zero_segs)
        report(validate_data[column], tmp_predict)
        score = f1_score(validate_data[column], tmp_predict, average='macro')
        scores[column] = score
    score = np.mean(list(scores.values()))
    str_score = "\n" + "".join(
        "%s:%s\n" % (column, scores[column]) for column in columns)
    logger.info("f1_scores: %s\n" % str_score)
    logger.info("f1_score: %s" % score)
    logger.info("complete validate model")
def train_mentioned():
    """Train one mentioned/not-mentioned model per entry in `models`.

    Loads train and validation CSVs, segments both content columns, restores
    the shared TF-IDF vectorizer from disk, then delegates each model to
    `train_mentioned_model`.
    """
    logger.info("########################################")
    logger.info("start train mentioned models")
    logger.info("########################################")
    # Raw dataframes for training and validation.
    train_df = load_data_from_csv(config.train_data_path)
    validate_df = load_data_from_csv(config.validate_data_path)
    logger.debug("start seg train data")
    # Only the first `train_data_size` rows of column 1 (content) are used.
    train_segs = seg_words(train_df.iloc[0:config.train_data_size, 1])
    logger.debug("start seg validate data")
    validate_segs = seg_words(validate_df.iloc[0:, 1])
    logger.debug("load vectorizer")
    # Reuse the TF-IDF vectorizer fitted earlier and persisted to disk.
    tfidf = joblib.load(config.model_save_path + vec_name)
    for model in models:
        train_mentioned_model(train_df, train_segs, validate_df,
                              validate_segs, tfidf, model)
def vectorizer():
    """Fit a TF-IDF vectorizer on segmented training content and persist it.

    Reads the training CSV (content is column 1), fits a word 1-5-gram
    TF-IDF model with the module-level `stopwords`, and dumps it to
    `config.model_save_path + vec_name` via joblib.
    """
    logger.info("start to vectorizer content")
    train_data = load_data_from_csv(config.train_data_path)
    content_segs = seg_words(train_data.iloc[0:, 1])
    tf_idf = TfidfVectorizer(ngram_range=(1, 5),
                             min_df=2,
                             norm="l2",
                             max_df=0.4,
                             stop_words=stopwords)
    tf_idf.fit(content_segs)
    # exist_ok avoids the check-then-create race of an explicit
    # os.path.exists() test (another process could create the dir in between).
    os.makedirs(config.model_save_path, exist_ok=True)
    joblib.dump(tf_idf, config.model_save_path + vec_name, compress=True)
    logger.info("success to save vectorizer")
def train_specific_model(train_data):
    """Train, evaluate, and persist one classifier per label column.

    For every column except the last, builds a TextClassifier over the
    shared TF-IDF vectorizer with a grid of candidate class weights, fits it
    on the first `train_data_size` rows, reports macro-F1 on the validation
    set, and dumps the model to `<model_save_path><column>.pkl`.
    """
    columns = train_data.columns.values.tolist()
    logger.debug("begin to seg train content")
    content_segments = seg_words(
        train_data.content.iloc[0:config.train_data_size])
    logger.debug("seg train content done")
    vectorizer = joblib.load(config.model_save_path + vec_name)
    logger.debug("load vectorizer")
    validate_data_df = load_data_from_csv(config.validate_data_path)
    validation_segs = seg_words(validate_data_df.content)
    logger.debug("seg validate content")
    scores = dict()
    # The last column is skipped; every other column gets its own model.
    for model_name in columns[:-1]:
        logger.info("begin to train %s model", model_name)
        # Candidate class-weight dicts over labels {-2, -1, 0, 1};
        # presumably TextClassifier searches this grid internally.
        cw = [{-2: w_notmen, -1: w_neg, 0: w_neu, 1: w_pos}
              for w_notmen in range(1, 3)
              for w_neg in range(5, 8)
              for w_neu in range(8, 12)
              for w_pos in range(5, 8)]
        # cw = {0: 7, 1: 6, -1: 6, -2: 1}
        positive_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
        y_label = train_data[model_name].iloc[0:config.train_data_size]
        positive_clf.fit(content_segments, y_label)
        y_pre = positive_clf.predict(validation_segs)
        y_true = validate_data_df[model_name].iloc[0:]
        report(y_true, y_pre)
        score = f1_score(y_true, y_pre, average="macro")
        logger.info("score for model:%s is %s ", model_name, str(score))
        scores[model_name] = score
        joblib.dump(positive_clf,
                    config.model_save_path + model_name + ".pkl",
                    compress=True)
    score = np.mean(list(scores.values()))
    logger.info("f1_scores: %s" % score)
args = parser.parse_args()
# Fall back to the default bundle name when --model_name was not given.
model_name = args.model_name or "model_dict.pkl"

# load data
logger.info("start load data...")
test_data_df = load_data_from_csv(config.test_data_path)

# load model
logger.info("start load model...")
classifier_dict = joblib.load(config.model_save_path + model_name)

columns = test_data_df.columns.tolist()

# seg words
logger.info("start seg test data...")
segmented_test = seg_words(test_data_df.iloc[:, 1])
logger.info("complete seg test data.")

# model predict: columns[2:] are the label columns, one classifier each.
logger.info("start predict test data...")
for column in columns[2:]:
    test_data_df[column] = classifier_dict[column].predict(segmented_test)
    logger.info("compete %s predict." % column)

# utf_8_sig keeps Excel happy with the BOM.
test_data_df.to_csv(config.test_data_predict_out_path,
                    encoding="utf_8_sig",
                    index=False)
logger.info("compete predict test data.")
# NOTE(review): truncated chunk — the opening `parser.add_argument(...` of the
# dangling `help='the name of model')` lies before this fragment, and the final
# `for column in columns[2:]` loop body continues past it; left byte-identical.
# Flow: parse args (default model name "model_dict.pkl"), load train/validate
# CSVs, segment train content, fit a word 1-5-gram TF-IDF vectorizer, then
# start training one classifier per label column (columns[2:]).
help='the name of model') args = parser.parse_args() model_name = args.model_name if not model_name: model_name = "model_dict.pkl" # load train data logger.info("start load data") train_data_df = load_data_from_csv(config.train_data_path) validate_data_df = load_data_from_csv(config.validate_data_path) content_train = train_data_df.iloc[:, 1] logger.info("start seg train data") content_train = seg_words(content_train) logger.info("complete seg train data") columns = train_data_df.columns.values.tolist() logger.info("start train feature extraction") vectorizer_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=5, norm='l2') vectorizer_tfidf.fit(content_train) logger.info("complete train feature extraction models") logger.info("vocab shape: %s" % np.shape(vectorizer_tfidf.vocabulary_.keys())) # model train logger.info("start train model") classifier_dict = dict() for column in columns[2:]: label_train = train_data_df[column]
# NOTE(review): truncated chunk — the trailing `str(` is cut off mid-statement
# (presumably `str(f1_score_dict[column]) + "\n"` as in validate_model above);
# left byte-identical. Flow: train a TextClassifier per label column
# (columns[2:]) on the shared TF-IDF features, then segment the validation
# content and collect per-column f1 via get_f1_score, averaging with np.mean.
classifier_dict = dict() for column in columns[2:]: label_train = train_data_df[column] text_classifier = TextClassifier(vectorizer=vectorizer_tfidf) logger.info("start train %s model" % column) text_classifier.fit(content_train, label_train) logger.info("complete train %s model" % column) classifier_dict[column] = text_classifier logger.info("complete train model") # validate model content_validate = validate_data_df.iloc[:, 1] logger.info("start seg validate data") content_validate = seg_words(content_validate) logger.info("complete seg validate data") logger.info("start validate model") f1_score_dict = dict() for column in columns[2:]: label_validate = validate_data_df[column] text_classifier = classifier_dict[column] f1_score = text_classifier.get_f1_score(content_validate, label_validate) f1_score_dict[column] = f1_score f1_score = np.mean(list(f1_score_dict.values())) str_score = "\n" for column in columns[2:]: str_score = str_score + column + ":" + str(
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Fit a TF-IDF vectorizer on the segmented training content and pickle it."""
from data_process import load_data_from_csv, seg_words
from sklearn.feature_extraction.text import TfidfVectorizer
import config
import numpy as np
import pickle

if __name__ == '__main__':
    # load train data
    train_data_df = load_data_from_csv(config.train_data_path)
    validate_data_df = load_data_from_csv(config.validate_data_path)
    content_train = train_data_df.iloc[:, 1]
    print("start seg train data")
    content_train = seg_words(content_train)
    print("complete seg train data")
    print("start train feature extraction")
    vectorizer_tfidf = TfidfVectorizer(analyzer='word',
                                       ngram_range=(1, 5),
                                       min_df=5,
                                       norm='l2')
    vectorizer_tfidf.fit(content_train)
    print("complete train feature extraction models")
    print("vocab shape: ")
    print(vectorizer_tfidf.vocabulary_.keys())
    # Pickle is a binary format: the original text-mode 'w' raises TypeError
    # on Python 3, and the handle was never closed. Use 'wb' plus a context
    # manager so the file is flushed and closed on every path.
    with open("../model/vectorizer_tfidf.pkl", 'wb') as model_file:
        pickle.dump(vectorizer_tfidf, model_file)