# Shared dependencies. The project-local helpers (loadpickle, shuffle,
# AverageMeter, CNN, TextCNN, test, multilabelTop1, multilabelHits) are
# assumed to be importable from modules elsewhere in this repo.
import copy
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from torch.autograd import Variable
from torch.optim import lr_scheduler


def train(train_set, val_set, data_info, params, embedding_size=300):
    # print("loading word2vec...")
    pretrained_w2v = loadpickle('/home/zwei/Dev/TextClassifications/z_implementations/pre_extract_w2v/params/selftrained_extracted_w2v_wordnet_synsets_py3.pl')
    # pretrained_w2v = loadpickle('/home/zwei/Dev/TextClassifications/z_implementations/pre_extract_w2v/params/googlenews_extracted_w2v_wordnet_synsets_py3.pl')

    # Build the embedding matrix: pretrained vectors where available,
    # small random vectors for out-of-vocabulary words.
    wv_matrix = []
    words_not_found = []
    for i in range(len(data_info["vocab"])):
        word = data_info["idx2tag"][i]
        if word in pretrained_w2v:
            wv_matrix.append(pretrained_w2v[word])
        else:
            words_not_found.append(word)
            # print("{} not found in dictionary, will use random".format(word))
            wv_matrix.append(np.random.uniform(-0.01, 0.01, embedding_size).astype("float32"))
    print("{} words were not found".format(len(words_not_found)))

    # one for UNK and one for zero padding
    wv_matrix.append(np.random.uniform(-0.01, 0.01, embedding_size).astype("float32"))
    wv_matrix.append(np.zeros(embedding_size).astype("float32"))
    wv_matrix = np.array(wv_matrix)
    params["WV_MATRIX"] = wv_matrix

    model = CNN(**params).cuda(params["GPU"])

    # Materialize the filter into a list: a bare filter() is single-use and
    # would be exhausted before clip_grad_norm_ sees it in later iterations.
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    criterion = nn.CrossEntropyLoss()
    scheduler = lr_scheduler.MultiStepLR(
        optimizer, params['Learning_SCHEDULE'], gamma=0.1)

    max_dev_acc = 0
    max_test_acc = 0  # reported in the summary print below; never updated here
    best_model = None
    model.train()
    for e in range(params["EPOCH"]):
        train_set = shuffle(train_set)
        train_losses = AverageMeter()
        train_accuracies = AverageMeter()
        current_lr = optimizer.param_groups[0]['lr']
        model.train()
        for i in tqdm.tqdm(range(0, len(train_set), params["BATCH_SIZE"])):
            batch_range = min(params["BATCH_SIZE"], len(train_set) - i)

            # Word dropout: randomly drop ~20% of the words in each sentence,
            # then truncate or pad to MAX_SENT_LEN.
            batch_x = []
            for sent in train_set[i:i + batch_range]:
                x_sent = sent[0]
                drop_thre = 0.2
                x_collected_words = []
                for x_word in x_sent:
                    p = random.uniform(0, 1)
                    if p >= drop_thre:
                        x_collected_words.append(data_info["tag2idx"][x_word])
                if len(x_collected_words) >= params["MAX_SENT_LEN"]:
                    batch_x.append(x_collected_words[:params["MAX_SENT_LEN"]])
                else:
                    batch_x.append(x_collected_words +
                                   [params["VOCAB_SIZE"] + 1] * (params["MAX_SENT_LEN"] - len(x_collected_words)))

            batch_y_numpy = [c[1] for c in train_set[i:i + batch_range]]
            batch_x = Variable(torch.LongTensor(batch_x)).cuda(params["GPU"])
            batch_y = Variable(torch.FloatTensor(batch_y_numpy)).cuda(params["GPU"])

            optimizer.zero_grad()
            pred = model(batch_x)
            # loss = criterion(pred, batch_y)
            # Soft-label cross-entropy: batch_y holds (multi-)label distributions.
            log_softmax_output = F.log_softmax(pred, dim=1)
            loss = -torch.sum(log_softmax_output * batch_y) / pred.shape[0]
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=params["NORM_LIMIT"])
            optimizer.step()

            pred = np.argmax(pred.cpu().data.numpy(), axis=1)
            # A prediction counts as correct if it hits any positive label.
            acc = sum([1 if y[p] > 0 else 0 for p, y in zip(pred, batch_y_numpy)]) / len(pred)
            train_losses.update(loss.item(), batch_range)
            train_accuracies.update(acc, batch_range)

        # Step the LR schedule once per epoch, after the optimizer updates
        # (the PyTorch >= 1.1 calling convention).
        scheduler.step()

        dev_acc, dev_loss = test(val_set, data_info, model, params, criterion)
        # if params["EARLY_STOPPING"] and dev_acc <= pre_dev_acc:
        #     print("early stopping by dev_acc!")
        #     break
        # else:
        #     pre_dev_acc = dev_acc
        if dev_acc > max_dev_acc:
            max_dev_acc = dev_acc
            best_model = copy.deepcopy(model)
        print("epoch:", e + 1, ' lr:', '{:.6f}'.format(current_lr),
              " dev_acc:", dev_acc, ' dev_loss:', dev_loss,
              " train_acc:", train_accuracies.avg, ' train_losses:', train_losses.avg,
              ' max_dev_acc ', max_dev_acc)

    print("max dev acc:", max_dev_acc, "test acc:", max_test_acc)
    return best_model
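
# Both train() variants in this file rely on an AverageMeter running-average
# helper that is not defined here. A minimal sketch of what it is assumed to
# look like, following the standard PyTorch-examples pattern:
class AverageMeter(object):
    """Tracks a running sum and count and exposes the weighted average."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # val is a per-batch average; weight it by the batch size n so that
        # .avg is the average over samples, not over batches.
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
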
def train(train_set, val_set, args_data, args_model, args_hyper):
    wordvec_type = 'selftrained'  # 'selftrained' or 'googlenews'
    wordvec_file = '/home/zwei/Dev/TextClassifications/z_implementations/pre_extract_w2v/params/' \
                   '{}_extracted_w2v_wordnet_synsets_py3.pl'.format(wordvec_type)
    print("Loading from {}".format(wordvec_type))
    pretrained_w2v = loadpickle(wordvec_file)

    # This only creates embeddings for vocabulary words that exist in the
    # pretrained word vectors; missing words get small random vectors.
    wv_matrix = []
    words_not_found = []
    for i in range(len(args_data.vocab)):
        word = args_data.idx2tag[i]
        if word in pretrained_w2v:
            wv_matrix.append(pretrained_w2v[word])
        else:
            words_not_found.append(word)
            # print("{} not found in dictionary, will use random".format(word))
            wv_matrix.append(np.random.uniform(-0.01, 0.01, args_hyper.word_dim).astype("float32"))
    print("{} words were not found".format(len(words_not_found)))

    # one for UNK and one for zero padding
    wv_matrix.append(np.random.uniform(-0.01, 0.01, args_hyper.word_dim).astype("float32"))
    wv_matrix.append(np.zeros(args_hyper.word_dim).astype("float32"))
    wv_matrix = np.array(wv_matrix)

    model = TextCNN(args_model, init_wv=wv_matrix).cuda()
    # Materialize the filter into a list: a bare filter() is single-use and
    # would be exhausted before clip_grad_norm_ sees it in later iterations.
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.Adadelta(parameters, args_hyper.learning_rate)
    scheduler = lr_scheduler.MultiStepLR(
        optimizer, [int(x) for x in args_hyper.lr_schedule.split(',')], gamma=0.1)

    max_dev_top1 = 0
    max_dev_hits = 0
    best_model = None
    model.train()
    for e in range(args_hyper.epoch):
        train_set = shuffle(train_set)
        train_losses = AverageMeter()
        train_top1 = AverageMeter()
        train_hits = AverageMeter()
        current_lr = optimizer.param_groups[0]['lr']
        model.train()
        for i in tqdm.tqdm(range(0, len(train_set), args_hyper.batch_size)):
            batch_range = min(args_hyper.batch_size, len(train_set) - i)

            # Word dropout: randomly drop ~20% of the words in each sentence,
            # then truncate or pad to max_len.
            batch_x = []
            for sent in train_set[i:i + batch_range]:
                x_sent = sent[0]
                drop_thre = 0.2
                x_collected_words = []
                for x_word in x_sent:
                    p = random.uniform(0, 1)
                    if p >= drop_thre:
                        x_collected_words.append(args_data.tag2idx[x_word])
                if len(x_collected_words) >= args_model.max_len:
                    batch_x.append(x_collected_words[:args_model.max_len])
                else:
                    batch_x.append(x_collected_words +
                                   [args_model.vocab_size + 1] * (args_model.max_len - len(x_collected_words)))

            batch_y_numpy = [c[1] for c in train_set[i:i + batch_range]]
            batch_x = Variable(torch.LongTensor(batch_x)).cuda()
            batch_y = Variable(torch.FloatTensor(batch_y_numpy)).cuda()

            optimizer.zero_grad()
            model_output = model(batch_x)
            pred = model_output[0]
            raw_feature = model_output[-1]
            trs_feature = model_output[-2]

            # Soft-label cross-entropy plus an L2 penalty tying the
            # transformed features to the raw features.
            log_softmax_output = F.log_softmax(pred, dim=1)
            loss_cls = -torch.sum(log_softmax_output * batch_y) / pred.shape[0]
            loss_l2 = ((raw_feature - trs_feature) ** 2).mean()
            loss = loss_cls + loss_l2
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=args_hyper.max_norm)
            optimizer.step()

            pred_idx = np.argmax(pred.cpu().data.numpy(), axis=1)
            train_losses.update(loss.item(), batch_range)
            # Top-1 and top-hits training metrics.
            top1_batch, _, _ = multilabelTop1(pred_idx, batch_y_numpy)
            train_top1.update(top1_batch, len(pred_idx))
            hits_batch, _, _ = multilabelHits(pred_idx, batch_y_numpy)
            train_hits.update(hits_batch, len(pred_idx))

        # Step the LR schedule once per epoch, after the optimizer updates
        # (the PyTorch >= 1.1 calling convention).
        scheduler.step()

        dev_top1, dev_hits, dev_loss = test(val_set, model, args_data, args_model)
        if dev_top1 > max_dev_top1:
            max_dev_top1 = dev_top1
            best_model = copy.deepcopy(model)
        if dev_hits > max_dev_hits:
            max_dev_hits = dev_hits
        print('epoch: {} lr: {:.6f}, dev_top1: {:.2f}, dev_hits: {:.2f}, dev_loss: {:.2f}, '
              'train_top1: {:.2f}, train_hits: {:.2f}, train_loss: {:.2f}, max_dev_top1: {:.2f}, '
              'max_dev_hits: {:.2f}'.format(e + 1, current_lr, dev_top1 * 100, dev_hits * 100, dev_loss,
                                            train_top1.avg * 100, train_hits.avg * 100, train_losses.avg,
                                            max_dev_top1 * 100, max_dev_hits * 100))

    # Report as percentages, matching the per-epoch log above.
    print("max dev top1: {:.2f},\tmax dev hits: {:.2f}".format(max_dev_top1 * 100, max_dev_hits * 100))
    return best_model
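
# The multilabel metrics above are also project-local. One plausible reading,
# given how they are called (argmax indices against multi-hot label rows, with
# the first return value treated as a batch average): top-1 counts a sample as
# correct when its single predicted class is among the positive labels, which
# matches the inline acc computation in the first train() variant. This is a
# hypothetical sketch, not the repo's definition; multilabelHits is assumed to
# follow the same (average, n_correct, n_total) contract, but its exact
# semantics are not recoverable from this file, so it is not sketched here.
def multilabelTop1(pred_idx, labels):
    """Hypothetical sketch: fraction of samples whose argmax prediction
    hits a positive label. Returns (average, n_correct, n_total)."""
    correct = sum(1 for p, y in zip(pred_idx, labels) if y[p] > 0)
    return correct / len(pred_idx), correct, len(pred_idx)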