def main():
    """Build the vocabulary from the segmented training corpus and persist it.

    Loads the corpus, builds the word<->index dictionaries, pickles them to
    the paths configured in ``Config``, then reloads and prints both as a
    round-trip sanity check.
    """
    cfg = Config()
    print("loading data...")
    sample_ids, texts, tags = bd.load_data("./corpus/seg_train.txt")
    word_count, word2index, index2word = bd.build_vocabulary(
        texts, min_count=cfg.min_count)
    print("save word2index and index2word")
    bde.save_dict(word2index, cfg.word2index_path)
    bde.save_dict(index2word, cfg.index2word_path)
    # Read the pickles back to verify they were written correctly.
    print("load word2index and index2word")
    print(bde.load_pickle(cfg.word2index_path))
    print(bde.load_pickle(cfg.index2word_path))
def main(task2_model_id, task2_model_path):
    """Run task-2 (multi-label) prediction over the test set and write JSON.

    Args:
        task2_model_id: integer model selector forwarded to
            ``load_multi_model``; 4 selects the HAN input pipeline, any
            other id uses flat padded sequences.
        task2_model_path: path to the saved task-2 model weights.
    """
    multi_config = MultiConfig()
    # Inference mode: disable training-only behavior and dropout.
    multi_config.is_training = False
    multi_config.dropout_rate = 0.0
    print("loading data...")
    dict_word2index = bpe.load_pickle(multi_config.word2index_path)
    tests_id, test_data = bd.load_test_data(multi_config.test_path)
    if task2_model_id != 4:
        test_X = bd.build_test_data(test_data, dict_word2index,
                                    multi_config.max_text_len)
    else:
        # HAN consumes documents shaped (num_sentences, sequence_length).
        test_X = bd.build_test_data_HAN(test_data, dict_word2index,
                                        multi_config.num_sentences,
                                        multi_config.sequence_length)
    testset = MingLueTestData(test_X)
    test_loader = DataLoader(dataset=testset,
                             batch_size=multi_config.batch_size,
                             shuffle=False,
                             num_workers=multi_config.num_workers)
    # The embedding layer size must match the persisted vocabulary.
    multi_config.vocab_size = len(dict_word2index)
    print("loading model...")
    model2 = load_multi_model(task2_model_path, task2_model_id, multi_config)
    print("model loaded")
    print("predicting...")
    # Fix: removed the dead `predicted_multi_labels = [[]]` placeholder that
    # was unconditionally overwritten by the real prediction on the next line.
    predicted_multi_labels = predict_multi_label(test_loader, model2,
                                                 multi_config)
    generate_result_json(tests_id, predicted_multi_labels,
                         multi_config.result_path)
def main(task1_model_id, task1_model_path):
    """Run task-1 (single-label) prediction over the test set and write JSON.

    Args:
        task1_model_id: integer model selector forwarded to ``load_model``;
            4 selects the HAN input pipeline, any other id uses flat padded
            sequences.
        task1_model_path: path to the saved task-1 model weights.
    """
    config = Config()
    # Inference mode: disable training-only behavior and dropout.
    config.is_training = False
    config.dropout_rate = 0.0
    # model_id = int(input("Please select a model(input model id):\n0: fastText\n1: TextCNN\n2: TextRCNN\nInput: "))
    print("loading data...")
    dict_word2index = bpe.load_pickle(config.word2index_path)
    # Fix: the test set was loaded with the identical call in both branches;
    # hoisted above the branch.
    tests_id, test_data = bd.load_test_data(config.test_path)
    if task1_model_id != 4:
        test_X = bd.build_test_data(test_data, dict_word2index,
                                    config.max_text_len)
    else:
        # HAN consumes documents shaped (num_sentences, sequence_length).
        test_X = bd.build_test_data_HAN(test_data, dict_word2index,
                                        config.num_sentences,
                                        config.sequence_length)
    testset = MingLueTestData(test_X)
    test_loader = DataLoader(dataset=testset,
                             batch_size=config.batch_size,
                             shuffle=False,
                             num_workers=config.num_workers)
    config.vocab_size = len(dict_word2index)
    # Fix: removed the dead MultiConfig instance that was created and
    # configured (is_training/dropout_rate/vocab_size) but never passed to
    # any model, loader, or function — a leftover from the task-2 script.
    print("loading model...")
    model1 = load_model(task1_model_path, task1_model_id, config)
    print("model loaded")
    print("predicting...")
    predicted_labels = predict(test_loader, model1, config.has_cuda)
    # Task-2 slot stays an empty placeholder: this entry point only runs
    # task 1, but the result writer expects both label sets.
    predicted_multi_labels = [[]]
    generate_result_json(tests_id, predicted_labels, predicted_multi_labels,
                         config.result_path)
def main(rcnn_model_path, han_model_path):
    """Ensemble inference: feed the test set to both an RCNN and a HAN model
    and write the combined predictions as a result JSON.

    Args:
        rcnn_model_path: path to the saved RCNN weights.
        han_model_path: path to the saved HAN weights.
    """
    config = Config()
    config.is_training = False
    config.dropout_rate = 0.0

    print("loading data...")
    dict_word2index = bpe.load_pickle(config.word2index_path)
    tests_id, test_data = bd.load_test_data(config.test_path)

    # Flat padded-sequence view of the test documents (RCNN input).
    flat_X = bd.build_test_data(test_data, dict_word2index,
                                config.max_text_len)
    test_loader = DataLoader(dataset=MingLueTestData(flat_X),
                             batch_size=config.batch_size,
                             shuffle=False,
                             num_workers=config.num_workers)

    # Sentence-split view of the same documents (HAN input).
    han_X = bd.build_test_data_HAN(test_data, dict_word2index,
                                   config.num_sentences,
                                   config.sequence_length)
    test_loader_HAN = DataLoader(dataset=MingLueTestData(han_X),
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_workers)

    config.vocab_size = len(dict_word2index)

    print("loading model...")
    rcnn_model = load_model(rcnn_model_path, 2, config)  # model id 2: TextRCNN
    han_model = load_model(han_model_path, 4, config)    # model id 4: HAN
    print("model loaded")

    print("predicting...")
    predicted_labels = predict(test_loader, test_loader_HAN,
                               rcnn_model, han_model, config)
    generate_result_json(tests_id, predicted_labels, config.result_path)
def main(model_id, use_element, is_save):
    """Train a task-1 (single-label) classifier.

    NOTE(review): recovered from a whitespace-mangled file; indentation was
    reconstructed from Python semantics and is flagged where ambiguous.

    Args (as used below):
        model_id: integer model selector; 4 selects the HAN data pipeline
            and its dedicated batch size.
        use_element: forwarded to ``model_selector``.
        is_save: 'y' trains on the full set and saves at the end; anything
            else holds out a 10% validation split and evaluates periodically.
    """
    config = Config()
    print("epoch num: ", config.epoch_num)
    print("loading data...")
    ids, data, labels = bd.load_data(config.data_path)
    total_vocab_size = sd.count_vocab_size(data)
    print("total vocab size", total_vocab_size)
    force = config.force_word2index
    # Reuse the cached vocabulary unless a rebuild is forced. A rebuild only
    # writes the dictionaries and RETURNS — training needs a second run.
    if not force and os.path.exists(config.index2word_path) and os.path.exists(
            config.word2index_path):
        print("load word2index")
        dict_word2index = bpe.load_pickle(config.word2index_path)
        print(dict_word2index['<UNK>'], dict_word2index['<PAD>'])
    else:
        print("save word2index and index2word")
        count, dict_word2index, dict_index2word = bd.build_vocabulary(
            data, min_count=config.min_count)
        bpe.save_dict(dict_index2word, config.index2word_path)
        bpe.save_dict(dict_word2index, config.word2index_path)
        return
    if is_save == 'y':
        # NOTE(review): only the HAN path (model_id == 4) assigns `dataset`
        # here; for any other model_id with is_save == 'y' the DataLoader
        # below raises NameError. Looks like a truncated copy of the fuller
        # variant of this script that calls build_dataset_over_sample here —
        # confirm against the original.
        if model_id == 4:
            print("save HAN...")
            train_data, train_labels = bd.build_data_set_HAN(
                data, labels, dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length)
            print(np.shape(train_data), np.shape(train_labels))
            print(len(ids))
            dataset = MingLueData(ids, train_data, train_labels)
    else:
        if model_id == 4:
            train_data, train_labels = bd.build_data_set_HAN(
                data, labels, dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length)
        # NOTE(review): for model_id != 4, train_data/train_labels were never
        # assigned before these splits (NameError) — presumably truncated.
        train_ids, valid_ids = bd.split_data(ids, radio=0.9)
        train_X, valid_X = bd.split_data(train_data, radio=0.9)
        train_y, valid_y = bd.split_data(train_labels, radio=0.9)
        print("trainset size:", len(train_ids))
        print("validset size:", len(valid_ids))
        dataset = MingLueData(train_ids, train_X, train_y)
    del data  # release the raw corpus before training
    batch_size = config.batch_size
    if model_id == 4:
        batch_size = config.han_batch_size
    train_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,  # changed so different models can use different batch sizes
        shuffle=True,
        num_workers=config.num_workers)
    if is_save != 'y':
        dataset = MingLueData(valid_ids, valid_X, valid_y)
        valid_loader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,  # changed so different models can use different batch sizes
            shuffle=False,
            num_workers=config.num_workers)
    print("data loaded")
    config.vocab_size = len(dict_word2index)
    print('config vocab size:', config.vocab_size)
    model = model_selector(config, model_id, use_element)
    if config.has_cuda:
        model = model.cuda()
    # Recenter the class weights so their mean is exactly 1.
    loss_weight = torch.FloatTensor(config.loss_weight_value)
    loss_weight = loss_weight + 1 - loss_weight.mean()
    print("loss weight:", loss_weight)
    loss_fun = nn.CrossEntropyLoss(loss_weight.cuda())
    optimizer = model.get_optimizer(config.learning_rate,
                                    config.learning_rate2,
                                    config.weight_decay)
    print("training...")
    weight_count = 0
    max_score = 0
    total_loss_weight = torch.FloatTensor(torch.zeros(8))
    for epoch in range(config.epoch_num):
        print("lr:", config.learning_rate, "lr2:", config.learning_rate2)
        running_loss = 0.0
        running_acc = 0.0
        for i, data in enumerate(train_loader, 0):
            ids, texts, labels = data
            if config.has_cuda:
                inputs, labels = Variable(texts.cuda()), Variable(
                    labels.cuda())
            else:
                inputs, labels = Variable(texts), Variable(labels)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fun(outputs, labels)
            loss.backward()
            optimizer.step()
            # Pre-0.4 PyTorch idiom (today: loss.item()).
            running_loss += loss.data[0]
            if i % config.step == config.step - 1:
                # Only compute/print accuracy on reporting epochs.
                if epoch % config.epoch_step == config.epoch_step - 1:
                    _, predicted = torch.max(outputs.data, 1)
                    predicted = predicted.cpu().numpy().tolist()
                    running_acc = accuracy(predicted,
                                           labels.data.cpu().numpy())
                    print('[%d, %5d] loss: %.3f, acc: %.3f' %
                          (epoch + 1, i + 1, running_loss / config.step,
                           running_acc))
                running_loss = 0.0
        if is_save != 'y' and epoch % config.epoch_step == config.epoch_step - 1:
            print("predicting...")
            loss_weight, score = do_eval(valid_loader, model, model_id,
                                         config.has_cuda)
            # Checkpoint only above an absolute floor AND on improvement.
            if score >= 0.478 and score > max_score:
                max_score = score
                save_path = config.model_path + "." + str(
                    score) + "." + config.model_names[model_id]
                torch.save(model.state_dict(), save_path)
            if epoch >= 3:
                weight_count += 1
                total_loss_weight += loss_weight
                print("avg_loss_weight:", total_loss_weight / weight_count)
        # LR schedule: switch on lr2 at begin_epoch, then decay both,
        # flooring lr2 at 1e-5; the optimizer is rebuilt with the new rates.
        if epoch >= config.begin_epoch - 1:
            if epoch >= config.begin_epoch and config.learning_rate2 == 0:
                config.learning_rate2 = 2e-4
            elif config.learning_rate2 > 0:
                config.learning_rate2 *= config.lr_decay
                if config.learning_rate2 <= 1e-5:
                    config.learning_rate2 = 1e-5
            config.learning_rate = config.learning_rate * config.lr_decay
            optimizer = model.get_optimizer(config.learning_rate,
                                            config.learning_rate2,
                                            config.weight_decay)
    time_stamp = str(int(time.time()))
    if is_save == "y":
        if use_element:
            save_path = config.model_path + "." + time_stamp + ".use_element." + config.model_names[
                model_id]
        else:
            save_path = config.model_path + "." + time_stamp + "." + config.model_names[
                model_id]
        torch.save(model.state_dict(), save_path)
    else:
        print("not save")
def main(model_id, use_element, is_save):
    """Train a task-2 (multi-label) classifier.

    NOTE(review): recovered from a whitespace-mangled file; indentation was
    reconstructed from Python semantics and is flagged where ambiguous.

    Args (as used below):
        model_id: integer model selector; 4 = HAN pipeline, 5/6 = CNN/RCNN
            augmented with doc2vec features.
        use_element: when truthy, feeds per-sample element vectors to the
            model alongside the text.
        is_save: 'y' trains on the full set and saves at the end; anything
            else holds out a 10% validation split and evaluates periodically.
    """
    config = MultiConfig()
    print("epoch num", config.epoch_num)
    config.use_element = use_element
    print("loading data...")
    ids, data, labels = bmd.load_data(config.data_path)
    # sd.show_text_len_distribution(data)
    # sd.show_label_text_len_distribution(labels, data)
    total_vocab_size = sd.count_vocab_size(data)
    print("total vocab size", total_vocab_size)
    force = config.force_word2index
    # Reuse the cached vocabulary unless a rebuild is forced. A rebuild only
    # writes the dictionaries and RETURNS — training needs a second run.
    if not force and os.path.exists(config.index2word_path) and os.path.exists(
            config.word2index_path):
        print("load word2index")
        dict_word2index = bpe.load_pickle(config.word2index_path)
    else:
        print("save word2index and index2word")
        count, dict_word2index, dict_index2word = bmd.build_vocabulary(
            data, min_count=config.min_count)
        bpe.save_dict(dict_index2word, config.index2word_path)
        bpe.save_dict(dict_word2index, config.word2index_path)
        return
    # train_ids, train_X, train_y = bd.over_sample(train_ids, train_X, train_y)
    # print(train_y.shape[0], Counter(train_y))
    if is_save == 'y':
        # Full-set training: no validation split.
        if model_id != 4:
            all_train_ids, all_train_X, all_train_y = bmd.build_dataset(
                ids, data, labels, dict_word2index, config.max_text_len,
                config.num_class)
            dataset = MingLueMultiData(all_train_ids, all_train_X,
                                       all_train_y)
            # dataset = MingLueMultiData(valid_ids, valid_X, valid_y)
        else:
            train_data, train_labels = bmd.build_data_set_HAN(
                data, labels, dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length,
                num_class=config.num_class)
            print("save HAN...")
            dataset = MingLueMultiData(ids, train_data, train_labels)
            print(np.shape(train_data), np.shape(train_labels))
            print(len(ids))
    else:
        # 90/10 train/validation split.
        if model_id == 4:
            train_data, train_labels = bmd.build_data_set_HAN(
                data, labels, dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length,
                num_class=config.num_class)
            train_ids, valid_ids = bmd.split_data(ids, radio=0.9)
            train_X, valid_X = bmd.split_data(train_data, radio=0.9)
            train_y, valid_y = bmd.split_data(train_labels,
                                              radio=0.9)
        else:
            train_ids, valid_ids = bmd.split_data(ids, radio=0.9)
            train_data, valid_data = bmd.split_data(data, radio=0.9)
            train_labels, valid_labels = bmd.split_data(labels, radio=0.9)
            train_ids, train_X, train_y = bmd.build_dataset(
                train_ids, train_data, train_labels, dict_word2index,
                config.max_text_len, config.num_class)
            valid_ids, valid_X, valid_y = bmd.build_dataset(
                valid_ids, valid_data, valid_labels, dict_word2index,
                config.max_text_len, config.num_class)
        print("trainset size:", len(train_ids))
        print("validset size:", len(valid_ids))
        dataset = MingLueMultiData(train_ids, train_X, train_y)
    batch_size = config.batch_size
    if model_id == 4:
        batch_size = config.han_batch_size
    del data  # release the raw corpus before training
    train_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,  # changed so different models can use different batch sizes
        shuffle=True,
        num_workers=config.num_workers)
    if is_save != 'y':
        dataset = MingLueMultiData(valid_ids, valid_X, valid_y)
        valid_loader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,  # changed so different models can use different batch sizes
            shuffle=False,
            num_workers=config.num_workers)
    if model_id == 5 or model_id == 6:
        # cnn and rcnn with doc2vec
        dmpv_model, dbow_model = gdv.load_doc2vec_model(
            config.dmpv_model_path, config.dbow_model_path)
    print("data loaded")
    config.vocab_size = len(dict_word2index)
    print('config vocab size:', config.vocab_size)
    model = model_selector(config, model_id, use_element)
    if config.has_cuda:
        model = model.cuda()
    if use_element:
        all_element_vector = bpe.load_pickle(config.element_vector_path)
    # NOTE(review): loss_weight is computed but the weighted loss is
    # commented out below, so the unweighted loss is used.
    loss_weight = torch.FloatTensor(config.loss_weight)
    print(loss_weight.mean())
    loss_weight = 1 + 2 * (loss_weight.mean() - loss_weight)
    #loss_fun = nn.MultiLabelSoftMarginLoss(loss_weight.cuda())
    loss_fun = nn.MultiLabelSoftMarginLoss()
    # optimizer = optim.Adam(model.parameters(),lr=config.learning_rate, weight_decay=config.weight_decay)
    optimizer = model.get_optimizer(config.learning_rate,
                                    config.learning_rate2,
                                    config.weight_decay)
    print("training...")
    weight_count = 0
    max_score = 0
    for epoch in range(config.epoch_num):
        print("lr:", config.learning_rate, "lr2:", config.learning_rate2)
        running_loss = 0.0
        running_jaccard = 0.0
        for i, data in enumerate(train_loader, 0):
            ids, texts, labels = data
            # TODO
            if model_id == 4:
                pass
            if config.has_cuda:
                inputs, labels = Variable(texts.cuda()), Variable(
                    labels.cuda())
            else:
                inputs, labels = Variable(texts), Variable(labels)
            optimizer.zero_grad()
            if model_id == 5 or model_id == 6:
                # cnn and rcnn with doc2vec
                doc2vec = gdv.build_doc2vec(ids, dmpv_model, dbow_model)
                if config.has_cuda:
                    doc2vec = Variable(torch.FloatTensor(doc2vec).cuda())
                else:
                    doc2vec = Variable(torch.FloatTensor(doc2vec))
                # [batch_size, (doc2vec_size*2)]
                # print(doc2vec.size())
                outputs = model(inputs, doc2vec)
            elif use_element:
                element_vec = build_element_vec(ids, all_element_vector)
                if config.has_cuda:
                    element_vec = Variable(
                        torch.LongTensor(element_vec).cuda())
                else:
                    element_vec = Variable(torch.LongTensor(element_vec))
                outputs = model(inputs, element_vec)
            else:
                outputs = model(inputs)
            loss = loss_fun(outputs, labels.float())
            # or weight *labels.float()
            loss.backward()
            optimizer.step()
            # Pre-0.4 PyTorch idiom (today: loss.item()).
            running_loss += loss.data[0]
            if i % config.step == config.step - 1:
                if epoch % config.epoch_step == config.epoch_step - 1:
                    predicted_labels = get_multi_label_from_output(
                        outputs, config)
                    # Convert the multi-hot target matrix back to per-row
                    # label-index lists for the Jaccard metric.
                    true_label = labels.data.cpu().numpy()
                    rows, true_label = np.where(true_label == 1)
                    true_label = where_result_reshape(outputs.size()[0],
                                                      rows, true_label)
                    running_jaccard = cs.jaccard(predicted_labels,
                                                 true_label)
                    print('[%d, %5d] loss: %.3f, jaccard: %.3f' %
                          (epoch + 1, i + 1, running_loss / config.step,
                           running_jaccard))
                running_loss = 0.0
        if is_save != 'y' and epoch % config.epoch_step == config.epoch_step - 1:
            print("predicting...")
            if model_id == 5 or model_id == 6:
                score = do_eval(valid_loader, model, model_id, config,
                                dmpv_model, dbow_model)
            else:
                score = do_eval(valid_loader, model, model_id, config)
                # NOTE(review): from epoch 5 on, also evaluate at two other
                # decision thresholds. score_2/score_3 are never read — the
                # results only appear in do_eval's own printout. Placement of
                # this block (inside the non-doc2vec branch) is reconstructed
                # — confirm against the original.
                if epoch >= 5:
                    config.max_prob = 0.55
                    print("max prob:", config.max_prob)
                    score_2 = do_eval(valid_loader, model, model_id, config)
                    config.max_prob = 0.45
                    print("max prob:", config.max_prob)
                    score_3 = do_eval(valid_loader, model, model_id, config)
            # Checkpoint only above an absolute floor AND on improvement.
            if score >= 0.788 and score > max_score:
                max_score = score
                save_path = config.model_path + "." + str(
                    score) + ".multi." + config.model_names[model_id]
                torch.save(model.state_dict(), save_path)
            if epoch >= 3:
                weight_count += 1
                # total_loss_weight += loss_weight
                # print("avg_loss_weight:",total_loss_weight/weight_count)
        # LR schedule: switch on lr2 at begin_epoch, then decay both,
        # flooring lr2 at 1e-5; the optimizer is rebuilt with the new rates.
        if epoch >= config.begin_epoch - 1:
            if epoch >= config.begin_epoch and config.learning_rate2 == 0:
                config.learning_rate2 = 2e-4
            elif config.learning_rate2 > 0:
                config.learning_rate2 *= config.lr_decay
                if config.learning_rate2 <= 1e-5:
                    config.learning_rate2 = 1e-5
            config.learning_rate = config.learning_rate * config.lr_decay
            optimizer = model.get_optimizer(config.learning_rate,
                                            config.learning_rate2,
                                            config.weight_decay)
    time_stamp = str(int(time.time()))
    if is_save == "y":
        if use_element:
            save_path = config.model_path + "." + time_stamp + ".multi.use_element." + config.model_names[
                model_id]
        else:
            save_path = config.model_path + "." + time_stamp + ".multi." + config.model_names[
                model_id]
        torch.save(model.state_dict(), save_path)
    else:
        print("not save")
def build_vocabulary(data, min_count=1):
    """Build word<->index dictionaries from an iterable of tokens.

    NOTE(review): the original ``def`` header was missing from this chunk;
    the signature is reconstructed from the call sites (``build_vocabulary(
    data)`` in the demo below, ``bd.build_vocabulary(data, min_count=...)``
    elsewhere in the project). The default ``min_count=1`` keeps every word,
    which makes the demo below meaningful — confirm against the original.

    Args:
        data: iterable of word tokens (the whole corpus, flattened).
        min_count: minimum occurrence count for a word to enter the vocab.

    Returns:
        (word_count, dict_word2index, dict_index2word) where word_count is a
        list of (word, count) pairs led by the synthetic entries, and the two
        dicts are exact inverses of each other.
    """
    # <UNK>/<PAD> are reserved at indices 0/1; count -1 marks them synthetic.
    # (<PAD> is added explicitly for the embedding layer.)
    word_count = [('<UNK>', -1), ('<PAD>', -1)]
    # Fix: replaced the manual append-loop copy of `data` with Counter(data);
    # also dropped the unused `stopwords_data_path` local and its stale
    # commented-out pd.read_csv line.
    counter = Counter(data)
    for word, count in counter.most_common():
        if count >= min_count:
            word_count.append((word, count))
    dict_word2index = dict()
    for word, _ in word_count:
        dict_word2index[word] = len(dict_word2index)
    dict_index2word = dict(
        zip(dict_word2index.values(), dict_word2index.keys()))
    print("vocab size:", len(word_count))
    return word_count, dict_word2index, dict_index2word


if __name__ == "__main__":
    # Tiny smoke test: build a vocab from a few tokens, persist it, reload it.
    config = Config()
    data = ['公诉', '机关', '莆田市', '荔城区', '荔城区', '荔城区']
    word_count, dict_word2index, dict_index2word = build_vocabulary(data)
    bde.save_dict(dict_word2index, config.word2index_path)
    bde.save_dict(dict_index2word, config.index2word_path)
    print(bde.load_pickle(config.word2index_path))
    print(bde.load_pickle(config.index2word_path))
def main(model_id, use_element, is_save):
    """Train a task-1 classifier on a fixed 70/30 train/validation split.

    NOTE(review): recovered from a whitespace-mangled file; indentation was
    reconstructed. Unlike the sibling trainers, this variant always splits
    70/30, always builds a validation loader, and has no final model-save
    section — the function ends after the last epoch's LR update.

    Args (as used below):
        model_id: integer model selector forwarded to ``model_selector``.
        use_element: forwarded to ``model_selector``.
        is_save: anything other than 'y' enables per-epoch_step evaluation.
    """
    config = Config()
    print("epoch num: ", config.epoch_num)
    config.use_element = use_element
    print("loading data...")
    # Raw data split into 3 parallel lists: id, data, label.
    ids, data, labels = bd.load_data(config.data_path)
    train_ids, valid_ids = bd.split_data(ids, radio=0.7)
    train_data, valid_data = bd.split_data(data, radio=0.7)
    train_labels, valid_labels = bd.split_data(labels, radio=0.7)
    # Count the total vocabulary size of the data.
    total_vocab_size = sd.count_vocab_size(data)
    print("total vocab size", total_vocab_size)
    print("load word2index")
    dict_word2index = bpe.load_pickle(config.word2index_path)
    # print(len(dict_word2index))
    train_ids, train_X, train_y = bd.build_dataset(
        train_ids, train_data, train_labels, dict_word2index,
        max_text_len=config.max_text_len)
    # Peek at the first few converted samples.
    print(train_ids[0:4])
    print(train_X[0:4])
    print(train_y[0:4])
    valid_ids, valid_X, valid_y = bd.build_dataset(
        valid_ids, valid_data, valid_labels, dict_word2index,
        max_text_len=config.max_text_len)
    print("trainset size:", len(train_ids))
    print("validset size:", len(valid_ids))
    dataset_train = MingLueData(train_ids, train_X, train_y)
    dataset_valid = MingLueData(valid_ids, valid_X, valid_y)
    batch_size = config.batch_size
    train_loader = DataLoader(dataset=dataset_train,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=config.num_workers)
    valid_loader = DataLoader(dataset=dataset_valid,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=config.num_workers)
    config.vocab_size = len(dict_word2index)
    print('config vocab size:', config.vocab_size)
    model = model_selector(config, model_id, use_element)
    if config.has_cuda:
        model = model.cuda()
    # Recenter the class weights so their mean is exactly 1.
    loss_weight = torch.FloatTensor(config.loss_weight_value)
    loss_weight = loss_weight + 1 - loss_weight.mean()
    print("loss weight:", loss_weight)
    loss_fun = nn.CrossEntropyLoss(loss_weight.cuda())
    optimizer = model.get_optimizer(config.learning_rate,
                                    config.learning_rate2,
                                    config.weight_decay)
    print("training...")
    weight_count = 0
    max_score = 0
    total_loss_weight = torch.FloatTensor(torch.zeros(8))
    for epoch in range(config.epoch_num):
        print("lr:", config.learning_rate, "lr2:", config.learning_rate2)
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            ids, texts, labels = data
            if config.has_cuda:
                inputs, labels = Variable(texts.cuda()), Variable(
                    labels.cuda())
            else:
                inputs, labels = Variable(texts), Variable(labels)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fun(outputs, labels)
            loss.backward()
            optimizer.step()
            # Pre-0.4 PyTorch idiom (today: loss.item()).
            running_loss += loss.data[0]
            if i % config.step == config.step - 1:
                # Only compute/print accuracy on reporting epochs.
                if epoch % config.epoch_step == config.epoch_step - 1:
                    _, predicted = torch.max(outputs.data, 1)
                    predicted = predicted.cpu().numpy().tolist()
                    running_acc = accuracy(predicted,
                                           labels.data.cpu().numpy())
                    print('[%d, %5d] loss: %.3f, acc: %.3f' %
                          (epoch + 1, i + 1, running_loss / config.step,
                           running_acc))
                running_loss = 0.0
        if is_save != 'y' and epoch % config.epoch_step == config.epoch_step - 1:
            print("predicting...")
            loss_weight, score = do_eval(valid_loader, model, model_id,
                                         config.has_cuda)
            # Checkpoint only above an absolute floor AND on improvement.
            if score >= 0.478 and score > max_score:
                max_score = score
                save_path = config.model_path + "." + str(
                    score) + "." + config.model_names[model_id]
                torch.save(model.state_dict(), save_path)
            if epoch >= 3:
                weight_count += 1
                total_loss_weight += loss_weight
                print("avg_loss_weight:", total_loss_weight / weight_count)
        # LR schedule: switch on lr2 at begin_epoch, then decay both,
        # flooring lr2 at 1e-5; the optimizer is rebuilt with the new rates.
        if epoch >= config.begin_epoch - 1:
            if epoch >= config.begin_epoch and config.learning_rate2 == 0:
                config.learning_rate2 = 2e-4
            elif config.learning_rate2 > 0:
                config.learning_rate2 *= config.lr_decay
                if config.learning_rate2 <= 1e-5:
                    config.learning_rate2 = 1e-5
            config.learning_rate = config.learning_rate * config.lr_decay
            optimizer = model.get_optimizer(config.learning_rate,
                                            config.learning_rate2,
                                            config.weight_decay)
def main(model_id, use_element, is_save):
    """Train a task-1 classifier with train-set over-sampling.

    NOTE(review): recovered from a whitespace-mangled file; indentation was
    reconstructed from Python semantics.

    Args (as used below):
        model_id: integer model selector; 4 = HAN pipeline, 5/6 = CNN/RCNN
            augmented with doc2vec features.
        use_element: when truthy, feeds per-sample element vectors to the
            model alongside the text.
        is_save: 'y' trains on the full (over-sampled) set and saves at the
            end; anything else holds out a 10% validation split.
    """
    config = Config()
    print("epoch num: ", config.epoch_num)
    config.use_element = use_element
    # model_id = int(input("Please select a model(input model id):\n0: fastText\n1: TextCNN\n2: TextRCNN\n4: HAN\nInput: "))
    # is_save = input("Save Model?(y/n): ")
    print("loading data...")
    ids, data, labels = bd.load_data(config.data_path)
    # sd.show_text_len_distribution(data)
    # sd.show_label_text_len_distribution(labels, data)
    total_vocab_size = sd.count_vocab_size(data)
    print("total vocab size", total_vocab_size)
    force = config.force_word2index
    # Reuse the cached vocabulary unless a rebuild is forced. A rebuild only
    # writes the dictionaries and RETURNS — training needs a second run.
    if not force and os.path.exists(config.index2word_path) and os.path.exists(
            config.word2index_path):
        print("load word2index")
        dict_word2index = bpe.load_pickle(config.word2index_path)
        print(dict_word2index['<UNK>'], dict_word2index['<PAD>'])
    else:
        print("save word2index and index2word")
        count, dict_word2index, dict_index2word = bd.build_vocabulary(
            data, min_count=config.min_count)
        bpe.save_dict(dict_index2word, config.index2word_path)
        bpe.save_dict(dict_word2index, config.word2index_path)
        return
    # train_ids, train_X, train_y = bd.over_sample(train_ids, train_X, train_y)
    # print(train_y.shape[0], Counter(train_y))
    if is_save == 'y':
        # Full-set training: no validation split; non-HAN models train on
        # the over-sampled corpus.
        if model_id != 4:
            all_train_ids, all_train_X, all_train_y = bd.build_dataset_over_sample(
                ids, data, labels, dict_word2index, config.max_text_len)
            dataset = MingLueData(all_train_ids, all_train_X, all_train_y)
        else:
            print("save HAN...")
            train_data, train_labels = bd.build_data_set_HAN(
                data, labels, dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length)
            print(np.shape(train_data), np.shape(train_labels))
            print(len(ids))
            dataset = MingLueData(ids, train_data, train_labels)
    else:
        # 90/10 train/validation split.
        if model_id == 4:
            train_data, train_labels = bd.build_data_set_HAN(
                data, labels, dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length)
            train_ids, valid_ids = bd.split_data(ids, radio=0.9)
            train_X, valid_X = bd.split_data(train_data, radio=0.9)
            train_y, valid_y = bd.split_data(train_labels, radio=0.9)
        else:
            train_ids, valid_ids = bd.split_data(ids, radio=0.9)
            train_data, valid_data = bd.split_data(data, radio=0.9)
            train_labels, valid_labels = bd.split_data(labels, radio=0.9)
            # over sample for train data (validation stays untouched)
            train_ids, train_X, train_y = bd.build_dataset_over_sample(
                train_ids, train_data, train_labels, dict_word2index,
                config.max_text_len)
            valid_ids, valid_X, valid_y = bd.build_dataset(
                valid_ids, valid_data, valid_labels, dict_word2index,
                config.max_text_len)
        print("trainset size:", len(train_ids))
        print("validset size:", len(valid_ids))
        dataset = MingLueData(train_ids, train_X, train_y)
    del data  # release the raw corpus before training
    batch_size = config.batch_size
    if model_id == 4:
        batch_size = config.han_batch_size
    train_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,  # changed so different models can use different batch sizes
        shuffle=True,
        num_workers=config.num_workers)
    if is_save != 'y':
        dataset = MingLueData(valid_ids, valid_X, valid_y)
        valid_loader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,  # changed so different models can use different batch sizes
            shuffle=False,
            num_workers=config.num_workers)
    if model_id == 5 or model_id == 6:
        # cnn and rcnn with doc2vec
        dmpv_model, dbow_model = gdv.load_doc2vec_model(
            config.dmpv_model_path, config.dbow_model_path)
    print("data loaded")
    config.vocab_size = len(dict_word2index)
    print('config vocab size:', config.vocab_size)
    model = model_selector(config, model_id, use_element)
    if config.has_cuda:
        model = model.cuda()
    if use_element:
        all_element_vector = bpe.load_pickle(config.element_vector_path)
    # Recenter the class weights so their mean is exactly 1.
    loss_weight = torch.FloatTensor(config.loss_weight_value)
    loss_weight = loss_weight + 1 - loss_weight.mean()
    print("loss weight:", loss_weight)
    loss_fun = nn.CrossEntropyLoss(loss_weight.cuda())
    # loss_fun = nn.CrossEntropyLoss()
    # optimizer = optim.Adam(model.parameters(),lr=config.learning_rate, weight_decay=config.weight_decay)
    optimizer = model.get_optimizer(config.learning_rate,
                                    config.learning_rate2,
                                    config.weight_decay)
    print("training...")
    weight_count = 0
    max_score = 0
    total_loss_weight = torch.FloatTensor(torch.zeros(8))
    for epoch in range(config.epoch_num):
        print("lr:", config.learning_rate, "lr2:", config.learning_rate2)
        running_loss = 0.0
        running_acc = 0.0
        for i, data in enumerate(train_loader, 0):
            ids, texts, labels = data
            # TODO
            if model_id == 4:
                pass
            if config.has_cuda:
                inputs, labels = Variable(texts.cuda()), Variable(
                    labels.cuda())
            else:
                inputs, labels = Variable(texts), Variable(labels)
            optimizer.zero_grad()
            if model_id == 5 or model_id == 6:
                # cnn and rcnn with doc2vec
                doc2vec = gdv.build_doc2vec(ids, dmpv_model, dbow_model)
                if config.has_cuda:
                    doc2vec = Variable(torch.FloatTensor(doc2vec).cuda())
                else:
                    doc2vec = Variable(torch.FloatTensor(doc2vec))
                # [batch_size, (doc2vec_size*2)]
                # print(doc2vec.size())
                outputs = model(inputs, doc2vec)
            elif use_element:
                element_vec = build_element_vec(ids, all_element_vector)
                if config.has_cuda:
                    element_vec = Variable(
                        torch.LongTensor(element_vec).cuda())
                else:
                    element_vec = Variable(torch.LongTensor(element_vec))
                outputs = model(inputs, element_vec)
            else:
                outputs = model(inputs)
            loss = loss_fun(outputs, labels)
            loss.backward()
            optimizer.step()
            # Pre-0.4 PyTorch idiom (today: loss.item()).
            running_loss += loss.data[0]
            if i % config.step == config.step - 1:
                # Only compute/print accuracy on reporting epochs.
                if epoch % config.epoch_step == config.epoch_step - 1:
                    _, predicted = torch.max(outputs.data, 1)
                    predicted = predicted.cpu().numpy().tolist()
                    # predicted = [i[0] for i in predicted]
                    running_acc = accuracy(predicted,
                                           labels.data.cpu().numpy())
                    print('[%d, %5d] loss: %.3f, acc: %.3f' %
                          (epoch + 1, i + 1, running_loss / config.step,
                           running_acc))
                running_loss = 0.0
        if is_save != 'y' and epoch % config.epoch_step == config.epoch_step - 1:
            print("predicting...")
            if model_id == 5 or model_id == 6:
                loss_weight, score = do_eval(valid_loader, model, model_id,
                                             config.has_cuda, dmpv_model,
                                             dbow_model)
            else:
                loss_weight, score = do_eval(valid_loader, model, model_id,
                                             config.has_cuda)
            # Checkpoint only above an absolute floor AND on improvement.
            if score >= 0.478 and score > max_score:
                max_score = score
                save_path = config.model_path + "." + str(
                    score) + "." + config.model_names[model_id]
                torch.save(model.state_dict(), save_path)
            if epoch >= 3:
                weight_count += 1
                total_loss_weight += loss_weight
                print("avg_loss_weight:", total_loss_weight / weight_count)
        # LR schedule: switch on lr2 at begin_epoch, then decay both,
        # flooring lr2 at 1e-5; the optimizer is rebuilt with the new rates.
        if epoch >= config.begin_epoch - 1:
            if epoch >= config.begin_epoch and config.learning_rate2 == 0:
                config.learning_rate2 = 2e-4
            elif config.learning_rate2 > 0:
                config.learning_rate2 *= config.lr_decay
                if config.learning_rate2 <= 1e-5:
                    config.learning_rate2 = 1e-5
            config.learning_rate = config.learning_rate * config.lr_decay
            optimizer = model.get_optimizer(config.learning_rate,
                                            config.learning_rate2,
                                            config.weight_decay)
    time_stamp = str(int(time.time()))
    if is_save == "y":
        if use_element:
            save_path = config.model_path + "." + time_stamp + ".use_element." + config.model_names[
                model_id]
        else:
            save_path = config.model_path + "." + time_stamp + "." + config.model_names[
                model_id]
        torch.save(model.state_dict(), save_path)
    else:
        print("not save")