from sklearn.model_selection import KFold

def main(num_epochs=100, n_splits=5):
    # DataUtil, Model, train_and_validate and perform_validation are
    # project-local helpers assumed to be in scope.
    data_util = DataUtil('data', 'spectrogram_data')
    X, y = data_util.get_data()

    # k-fold cross-validation: train on k-1 folds, test on the held-out fold.
    kf = KFold(n_splits=n_splits, shuffle=True)
    test_accuracy_sum = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = Model(data_util.height, data_util.width)
        param_values, threshold = train_and_validate(model, X_train, y_train, num_epochs)
        model.set_param_values(param_values)
        test_accuracy_sum += perform_validation(model, X_test, y_test, threshold)

    # Report the accuracy averaged over all folds.
    print("Cross-validation results:")
    print("  accuracy:\t\t{:.2f} %".format(test_accuracy_sum / n_splits * 100))
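# train_and_validate() and perform_validation() are project-local; a minimal
# sketch of what a fold-scoring helper like perform_validation() might look
# like, assuming the model exposes a predict() returning scores in [0, 1]
# (both that method and the scoring scheme are assumptions, not the
# project's actual API):
import numpy as np

def perform_validation_sketch(model, X_test, y_test, threshold):
    """Fraction of test samples whose thresholded score matches the label."""
    scores = model.predict(X_test)            # assumed shape: (n_samples,)
    predictions = (scores >= threshold).astype(int)
    return float(np.mean(predictions == y_test))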
    sys.exit(1)

if len(sys.argv) > 2:
    layer_arg = int(sys.argv[2])
else:
    layer_arg = 2
if len(sys.argv) > 3:
    ep_arg = int(sys.argv[3])
else:
    ep_arg = 20

# Read the data
print ">> Initializing data..."
reader = DataUtil(WORDVEC_FILEPATH, TAGGED_NEWS_FILEPATH)
X, Y = reader.get_data()
print X.shape
print Y.shape

# Train the model
print ">> Training model... epochs = {0}, layers = {1}".format(ep_arg, layer_arg)
nermodel = NERModel(reader)
nermodel.train(epochs=ep_arg, layers=layer_arg)

# Evaluate the model
print ">> Evaluating model..."
nermodel.evaluate()

# Save the model
print ">> Saving model..."
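# The positional sys.argv handling above silently depends on argument order;
# an equivalent sketch with argparse (the flag names are invented for
# illustration, not part of the original script):
import argparse

def parse_ner_args():
    parser = argparse.ArgumentParser(description='Train and evaluate the NER model')
    parser.add_argument('--layers', type=int, default=2,
                        help='number of layers (default: 2)')
    parser.add_argument('--epochs', type=int, default=20,
                        help='number of training epochs (default: 20)')
    return parser.parse_args()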
    def train(self):
        # Simple lock so that only one training run executes at a time.
        if self.system_setting.islock_model_training():
            logging.warning("model training is locked!")
            return
        try:
            self.system_setting.lock_model_training()
            logging.info("model training locked")

            # Train the model
            logging.info("********************************")
            logging.info("training classifier...........")
            dataUtil = DataUtil("articles_testN")
            origin_data = dataUtil.get_data()
            if origin_data is None:
                logging.warning("training data is NULL!")
                return

            wordVec = Word2Vector()
            embeddings = np.array(wordVec.embeddings)
            adagrad = Adagrad(lr=0.01, epsilon=1e-06)

            bestF = 0
            bestAcc = 0
            bestPre = 0
            bestRecall = 0
            for fold in range(self.flods):
                # Keep the best model across the folds.
                model = self.model(embeddings)
                model.compile(loss='binary_crossentropy',
                              metrics=[ut.f_score], optimizer=adagrad)
                data = dataUtil.flod_cross_data(origin_data)
                test_data = data["test_data"]
                dev_data = data["dev_data"]
                train_data = data["train_data"]

                # Pad content and title sequences to their fixed lengths.
                for split in (train_data, dev_data, test_data):
                    split["processed_content"] = sequence.pad_sequences(
                        split["processed_content"], maxlen=self.content_max_len,
                        padding='post', truncating='post')
                    split["processed_title"] = sequence.pad_sequences(
                        split["processed_title"], maxlen=self.title_max_len,
                        padding='post', truncating='post')

                model = self.do_train(model, train_data, dev_data, test_data)
                result = model.predict_classes(
                    [test_data["processed_content"], test_data["processed_title"]],
                    batch_size=self.batch_size, verbose=1)
                f_measure, pre, recall, acc = dr_evaluate(test_data["label"], result)
                logging.info("***********")
                logging.info("[fold " + str(fold) + "] test F-measure:" +
                             str(f_measure) + " test acc:" + str(acc))
                logging.info("***********")

                if bestF < f_measure:
                    bestF = f_measure
                    bestAcc = acc
                    bestPre = pre
                    bestRecall = recall
                    model.save_weights('cnn_model.h5')

            # Reload the best fold's weights into a fresh model and save it
            # under the name the predictor expects.
            model = self.model(embeddings)
            model.compile(loss='binary_crossentropy',
                          metrics=[ut.f_score], optimizer=adagrad)
            model.load_weights('cnn_model.h5')
            model.save_weights('news_classifier_model.h5')

            logging.info("###")
            logging.info('[** best result **] best F-measure:' + str(bestF) +
                         " best acc:" + str(bestAcc) +
                         " best precision:" + str(bestPre) +
                         " best recall:" + str(bestRecall))
            logging.info("###")
            logging.info("********************************")
        except BaseException as e:
            logging.error(e)
        finally:
            self.system_setting.unlock_model_training()
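# dr_evaluate() is project-local; a minimal sketch of the standard binary
# precision/recall/F-measure/accuracy quadruple it appears to return,
# assuming flat 0/1 label sequences (the real implementation may differ):
def dr_evaluate_sketch(y_true, y_pred):
    tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    fp = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    f_measure = (2 * precision * recall / (precision + recall)
                 if precision + recall else 0.0)
    accuracy = sum(1 for t, p in zip(y_true, y_pred) if t == p) / float(len(y_true))
    return f_measure, precision, recall, accuracy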
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='cmod')
    parser.add_argument('-c', '--config', help='Config file path', required=True)
    cfg_parser = configparser.ConfigParser()
    args = parser.parse_args()
    cfg_parser.read(args.config)
    cfg = config.Config(cfg_parser)

    D = DataUtil(cfg)
    train_dataset = D.get_data('train')
    test_dataset = D.get_data('test')
    dev_dataset = D.get_data('dev')

    device = torch.device("cuda:0" if cfg.use_cuda() else "cpu")

    if cfg.sparse() and cfg.weight_decay() != 0:
        cfg.logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()

    torch.manual_seed(cfg.random_seed())
    random.seed(cfg.random_seed())
    if cfg.use_cuda():
        torch.cuda.manual_seed(cfg.random_seed())
        torch.backends.cudnn.benchmark = True
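# The seeding calls above can be wrapped in one helper; a sketch (the
# `deterministic` switch is an addition for reproducible runs, not part of
# the original script, which favors speed via cudnn.benchmark = True):
import random
import torch

def set_seed(seed, use_cuda, deterministic=False):
    torch.manual_seed(seed)
    random.seed(seed)
    if use_cuda:
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.benchmark = not deterministic
        torch.backends.cudnn.deterministic = deterministic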
            self.train()
        wordVec = Word2Vector()
        embeddings = np.array(wordVec.embeddings)
        model = self.model(embeddings)
        adagrad = Adagrad(lr=0.01, epsilon=1e-06)
        model.compile(loss='binary_crossentropy',
                      metrics=[ut.f_score], optimizer=adagrad)
        model.load_weights('news_classifier_model.h5')

        dataUtil = DataUtil("articles_testN")
        pre_data = dataUtil.filter_data(data, 1)
        pre_data = dataUtil.transfer_form(pre_data)

        # Pad content and title sequences to the lengths used at training time.
        pre_data["processed_content"] = sequence.pad_sequences(
            pre_data["processed_content"], maxlen=self.content_max_len,
            padding='post', truncating='post')
        pre_data["processed_title"] = sequence.pad_sequences(
            pre_data["processed_title"], maxlen=self.title_max_len,
            padding='post', truncating='post')

        result = model.predict_classes(
            [pre_data["processed_content"], pre_data["processed_title"]],
            batch_size=self.batch_size, verbose=1)

        # Attach each predicted label to its article record.
        for i in range(len(data)):
            data[i]["artitle_label"] = result[i][0]
        return data

classifier = Classifier()
dataUtil = DataUtil("articles_testN")
origin_data = dataUtil.get_data()
classifier.predict(origin_data['neg'])
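# Sequential.predict_classes() exists only in older Keras releases; newer
# versions expect the caller to threshold predict() output instead. A
# compatibility sketch for the binary sigmoid setup used above:
import numpy as np

def predict_classes_compat(model, inputs, batch_size):
    probs = model.predict(inputs, batch_size=batch_size)
    return (probs > 0.5).astype("int32")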