def train_model():
    logger.info("########################################")
    logger.info("start training models")
    logger.info("########################################")
    train_data_df = load_data_from_csv(config.train_data_path)
    # train one classifier per model group
    for model in models:
        data_to_train = filter_data(train_data_df, model)
        train_specific_model(data_to_train)
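# `models` and `filter_data` are defined elsewhere in the repo. Since
# train_specific_model (below) treats every column except the last as a
# label and reads the review text via `.content`, one plausible reading is
# that each entry of `models` names a group of aspect columns and
# filter_data keeps those plus the text. A hypothetical sketch, not the
# confirmed implementation:
def filter_data(df, model_columns):
    # aspect label columns first, raw review text last
    return df[list(model_columns) + ["content"]]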
def validate():
    validate_data_df = load_data_from_csv(config.validate_data_path)
    # classifier that decides whether the aspect is mentioned at all
    mentioned_clf = joblib.load(config.model_save_path + "traffic_mentioned.pkl")
    clfs = {}
    clfs['location_traffic_convenience'] = joblib.load(
        config.model_save_path + "location_traffic_convenience.pkl")
    validate_model(validate_data_df, ['location_traffic_convenience'],
                   mentioned_clf, clfs)
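# `validate_model` is not shown in this section. The two model files above
# suggest a two-stage cascade: the "mentioned" classifier first decides
# whether the aspect appears in a review at all, then the per-aspect
# classifier scores sentiment where it does. A hypothetical sketch of that
# idea (assumes the mentioned classifier outputs 1/0 and that -2 encodes
# "not mentioned", matching the class-weight keys used further down):
def validate_model(df, column_names, mentioned_clf, clfs):
    segs = seg_words(df.content)
    mentioned = mentioned_clf.predict(segs)
    for name in column_names:
        y_pre = clfs[name].predict(segs)
        y_pre[mentioned == 0] = -2  # overwrite sentiment where not mentioned
        logger.info("macro f1 for %s: %s", name,
                    f1_score(df[name], y_pre, average="macro"))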
def train_mentioned():
    logger.info("########################################")
    logger.info("start training mentioned models")
    logger.info("########################################")
    # load train and validation data
    train_data_df = load_data_from_csv(config.train_data_path)
    validate_data_df = load_data_from_csv(config.validate_data_path)

    content_train = train_data_df.iloc[0:config.train_data_size, 1]
    logger.debug("start segmenting train data")
    train_content_segs = seg_words(content_train)

    logger.debug("start segmenting validate data")
    content_validate = validate_data_df.iloc[0:, 1]
    validate_segs = seg_words(content_validate)

    logger.debug("load vectorizer")
    vectorizer_tfidf = joblib.load(config.model_save_path + vec_name)

    for model in models:
        train_mentioned_model(train_data_df, train_content_segs,
                              validate_data_df, validate_segs,
                              vectorizer_tfidf, model)
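# `seg_words` is used throughout but defined elsewhere. For Chinese review
# text a jieba-based tokenizer is the likely shape, since the segmented
# output is fed straight into sklearn's TfidfVectorizer below. A minimal
# sketch (hypothetical; the real helper may also strip stopwords):
import jieba

def seg_words(contents):
    # join tokens with spaces so sklearn's default whitespace
    # tokenization can pick them up
    return [" ".join(jieba.cut(str(text))) for text in contents]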
def vectorizer():
    logger.info("start to vectorize content")
    train_data = load_data_from_csv(config.train_data_path)
    content_segs = seg_words(train_data.iloc[0:, 1])
    # n-grams up to length 5; drop terms seen in fewer than 2 documents
    # or in more than 40% of them
    tf_idf = TfidfVectorizer(ngram_range=(1, 5),
                             min_df=2,
                             norm="l2",
                             max_df=0.4,
                             stop_words=stopwords)
    tf_idf.fit(content_segs)
    if not os.path.exists(config.model_save_path):
        os.makedirs(config.model_save_path)
    joblib.dump(tf_idf, config.model_save_path + vec_name, compress=True)
    logger.info("saved vectorizer successfully")
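# A quick sanity check for the persisted vectorizer: reload it and confirm
# it turns already-segmented text into the expected feature matrix (the
# sample strings here are made up; vec_name comes from this module):
tf_idf_loaded = joblib.load(config.model_save_path + vec_name)
features = tf_idf_loaded.transform(["交通 很 方便", "味道 不错"])
print(features.shape)  # (2, vocabulary size)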
def train_specific_model(train_data):
    columns = train_data.columns.values.tolist()
    logger.debug("begin to seg train content")
    content_segments = seg_words(
        train_data.content.iloc[0:config.train_data_size])
    logger.debug("seg train content done")
    vectorizer = joblib.load(config.model_save_path + vec_name)
    logger.debug("load vectorizer")
    validate_data_df = load_data_from_csv(config.validate_data_path)
    validate_segs = seg_words(validate_data_df.content)
    logger.debug("seg validate content")
    scores = dict()
    # every column except the last is a label column
    for model_name in columns[:-1]:
        logger.info("begin to train %s model", model_name)
        # grid of candidate class weights over the four label values
        cw = [{-2: a, -1: b, 0: w, 1: x}
              for a in range(1, 3)
              for b in range(5, 8)
              for w in range(8, 12)
              for x in range(5, 8)]
        # cw = {0: 7, 1: 6, -1: 6, -2: 1}
        positive_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
        y_label = train_data[model_name].iloc[0:config.train_data_size]
        positive_clf.fit(content_segments, y_label)
        y_pre = positive_clf.predict(validate_segs)
        y_true = validate_data_df[model_name].iloc[0:]
        report(y_true, y_pre)
        score = f1_score(y_true, y_pre, average="macro")
        logger.info("score for model %s is %s", model_name, str(score))
        scores[model_name] = score
        joblib.dump(positive_clf,
                    config.model_save_path + model_name + ".pkl",
                    compress=True)
    mean_score = np.mean(list(scores.values()))
    logger.info("mean f1_score: %s", mean_score)
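# `TextClassifier` is defined elsewhere. Because the class_weight passed in
# above is a *list* of weight dicts, it presumably searches over the
# candidates; a hypothetical sketch of that idea built on sklearn (not the
# repo's actual class, hence the distinct name):
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC


class TextClassifierSketch(object):
    def __init__(self, vectorizer, class_weight):
        self.vectorizer = vectorizer
        if isinstance(class_weight, list):
            # grid-search the candidate class weights by macro F1
            self.clf = GridSearchCV(LinearSVC(),
                                    {"class_weight": class_weight},
                                    scoring="f1_macro", cv=3)
        else:
            self.clf = LinearSVC(class_weight=class_weight)

    def fit(self, texts, y):
        self.clf.fit(self.vectorizer.transform(texts), y)
        return self

    def predict(self, texts):
        return self.clf.predict(self.vectorizer.transform(texts))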
parser = argparse.ArgumentParser()
parser.add_argument('-mn', '--model_name', type=str, nargs='?',
                    help='the name of the model file')
args = parser.parse_args()
model_name = args.model_name
if not model_name:
    model_name = "model_dict.pkl"

# load data
logger.info("start load data...")
test_data_df = load_data_from_csv(config.test_data_path)

# load model
logger.info("start load model...")
classifier_dict = joblib.load(config.model_save_path + model_name)

columns = test_data_df.columns.tolist()

# seg words
logger.info("start seg test data...")
content_test = test_data_df.iloc[:, 1]
content_test = seg_words(content_test)
logger.info("complete seg test data.")

# model predict
logger.info("start predict test data...")
for column in columns[2:]:
# treat a missing flag or any non-zero value as "on"
is_test = args.test is None or args.test != 0
load_cache = args.load_cache is None or args.load_cache != 0

# load data (cap at 100 records in test mode)
test_num = 100 if is_test else None
logger.info("start load data, try to read {0} records, test mode {1}".format(
    test_num, is_test))
test_data_df = load_data_from_csv(config.test_data_path, nrow=test_num)

# load embedding matrix
embedding_matrix = load_data("emb.npy")

# load vocab
vocab = load_data("vocab.npy").tolist()

# load all test columns
columns = test_data_df.columns.tolist()

# seg content words into index sequences
logger.info("start seg test data, let's look at some data")
logger.info(test_data_df.iloc[1, :])
content_test = test_data_df.iloc[:, 1]
if not load_cache:
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    # load train data
    logger.info("start load data")
    train_data_df = load_data_from_csv(config.train_data_path)
    validate_data_df = load_data_from_csv(config.validate_data_path)

    content_train = train_data_df.iloc[:, 1]
    content_validate = validate_data_df.iloc[:, 1]

    # sequence length: the longest training document, in tokens
    max_document_length = max([len(x.split(" ")) for x in content_train])
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    content_train_numeric = np.array(
        list(vocab_processor.fit_transform(content_train)))
    # transform (not fit_transform) so the validation set is mapped with
    # the vocabulary learned from the training set
    content_validate_numeric = np.array(
        list(vocab_processor.transform(content_validate)))

    logger.info("start seg train data")
    logger.info("complete seg train data")

    columns = train_data_df.columns.values.tolist()
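    # What VocabularyProcessor does, in miniature: ids are assigned from 1
    # in order of first appearance, and shorter documents are padded with 0,
    # e.g. (illustration only, not part of the pipeline):
    #   vp = learn.preprocessing.VocabularyProcessor(max_document_length=4)
    #   list(vp.fit_transform(["a b c", "b a"]))
    #   -> [array([1, 2, 3, 0]), array([2, 1, 0, 0])]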
# load train data and validate data
logger.info("start load data")
train_num = 10000 if is_test else None
validate_num = 5000 if is_test else None
train_data_df = load_data_from_csv(config.train_data_path, nrow=train_num)
validate_data_df = load_data_from_csv(config.validate_data_path,
                                      nrow=validate_num)

# get all train sentences
content_train = train_data_df.iloc[:, 1]
logger.info(content_train[0])
logger.info(content_train[1])

logger.info("start seg train sentences to vector")
if not load_cache:
    max_len, word, vocab, sequences = sentences_to_indices(content_train)
    save_data(vocab, "all_vocab.npy")
    save_data(word, "word.npy")
    # save_data(sequences, "seq.npy")
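# `sentences_to_indices`, `save_data`, and `load_data` are defined elsewhere
# in the repo. Judging from the .npy filenames and the .tolist() call in the
# predict script above, the save/load helpers are probably thin numpy
# wrappers along these lines (a hypothetical sketch, not the repo's code):
import numpy as np


def save_data_sketch(obj, name):
    # np.save can persist dicts/lists too, via a pickled object array
    np.save(name, np.array(obj))


def load_data_sketch(name):
    return np.load(name, allow_pickle=True)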