def languageModelEval(tokens_tags, trigramProb, bigramProb, threshold_chosen,
                      vocab_size, ngram=3, smoothing=True, k=1,
                      lengthAverage=True, metric='f1', criterion='gridSearch',
                      interval=0.05, show_result=True, return_likelihood=False,
                      note=''):
    tokens_test = list(
        itertools.chain([' '.join(x[0]).split() for x in tokens_tags]))
    tags_test = list(itertools.chain([int(x[1]) for x in tokens_tags]))
    tokens_trigram, _, _, _ = ngram_model(tokens_test,
                                          n=ngram,
                                          pad_left=True,
                                          pad_right=True,
                                          left_pad_symbol="<SOS>",
                                          right_pad_symbol="<EOS>")
    loglikelihoods = []
    for token_trigrams in tokens_trigram:
        token_score = 0
        for length, trigram in enumerate(token_trigrams):
            # token_score += np.log((trigramProb.get(trigram, 0) + smoothing) / (
            #     bigramProb.get(' '.join(trigram.split()[:-1]), 0) + bigramProb['vol']))
            token_score += np.log(
                (trigramProb.get(trigram, 0) + k * smoothing) /
                (bigramProb.get(' '.join(trigram.split()[:-1]), 0) +
                 k * vocab_size))
        if lengthAverage:
            loglikelihoods.append(token_score / (length + 1))
        else:
            loglikelihoods.append(token_score)
    predicts_lm_test = [int(x <= threshold_chosen) for x in loglikelihoods]
    acc, precision, recall, F1 = metrics_cal(predicts_lm_test, tags_test)
    # pdb.set_trace()
    if show_result:
        logger.info(Counter(predicts_lm_test))
        logger.info(
            "Trigram Language Model {}: acc: {} precision: {} recall: {} F1: {}"
            .format(note, round(acc, 2), round(precision, 2), round(recall, 2),
                    round(F1, 2)))
    if return_likelihood:
        return acc, precision, recall, F1, loglikelihoods
    else:
        return acc, precision, recall, F1

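# A minimal sketch, not part of the original module: the add-k smoothed trigram
# log-likelihood that languageModelEval above (and languageModelDev /
# languageModelTrain below) compute inline. The helper name
# `trigram_loglikelihood` is hypothetical; it assumes `trigramProb` and
# `bigramProb` are count dictionaries keyed by space-joined n-grams, exactly as
# those functions use them.
def trigram_loglikelihood(token_trigrams, trigramProb, bigramProb, vocab_size,
                          k=1, lengthAverage=True):
    # score = sum_i log((C(w1 w2 w3) + k) / (C(w1 w2) + k * |V|))
    score, n = 0.0, 0
    for n, trigram in enumerate(token_trigrams, start=1):
        bigram = ' '.join(trigram.split()[:-1])
        score += np.log((trigramProb.get(trigram, 0) + k) /
                        (bigramProb.get(bigram, 0) + k * vocab_size))
    return score / n if (lengthAverage and n) else score
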
def evaluate_wrapper(test_words, labels, vocab2index, model, Type, args,
                     activation='lstm', title="Evaluation", print_result=True,
                     return_prob=False, show_dist=False, sklearn_mode='macro'):
    # NOTE: relies on the module-level `decisions2Tag`, `best_acc` and `best_f1`.
    pred_probs, predicts = evaluation(test_words, vocab2index, activation,
                                      model, Type, args)
    predicts_tags = [decisions2Tag[x] for x in predicts]
    if show_dist:
        print("labels: ", Counter(labels))
        print("predicts: ", Counter(predicts_tags))
        logger.info("labels: " + str(Counter(labels)))
        logger.info("predicts: " + str(Counter(predicts_tags)))
    acc, precision, recall, F1 = metrics_cal(predicts_tags, labels,
                                             sklearn_mode=sklearn_mode)
    if print_result:
        result_log = (
            title +
            ": acc: {} (best {}) precision: {} recall: {} F1: {} (best {})".format(
                round(acc, 2), round(best_acc, 2), round(precision, 2),
                round(recall, 2), round(F1, 2), round(best_f1, 2)))
        print(result_log)
        logger.info(result_log)
    if return_prob:
        return acc, precision, recall, F1, pred_probs
    else:
        return acc, precision, recall, F1

def tuning_model(model, optimizer, word2index, new_word_set, label,
                 minion_group, summary, tune_epoch, Type, criterion,
                 number=20, batch=False, early_stop=False, batch_size=10,
                 protect_epoch=0, weights=[0.5, 0.5], eval_every_iter=50,
                 hard_sample_stop=False, dev_set=(), model_save=False,
                 lr_decay=1, acc_gap=0.01, early_break=True,
                 stop_strategy='early_stop', log=False, return_model=False,
                 show_model_status=False):
    # Incrementally fine-tune the encoder on a new word (or a batch of new
    # words) mixed with hard samples from minion_group.
    # TODO rotate the member in the group
    max_length = 16
    print("Tuning model...")
    if log:
        logger.info("Tuning model...")
    model.train()
    new_word, new_word_correct = new_word_set
    minion_group_size = sum([len(x) for x in minion_group])
    if type(new_word) == list:
        data_size = minion_group_size + len(new_word)
    elif type(new_word) == str:
        data_size = minion_group_size + 1  # for new_word
    else:
        data_size = minion_group_size
    best_acc = 0
    best_dev_acc = 0
    early_stop_tolerance = 1
    losses = []
    accs = []
    position_accs = []
    test_accs = []
    test_accs_pos = []
    dev_f1s = []
    iter = 0
    keep_training = True
    do_dev = False
    if stop_strategy == 'oneEpoch':
        tune_epoch = 1
    lambda_mspl, lambda_pos = [x / sum(weights) for x in weights]
    if not batch:
        input_numerical = word_to_index(new_word, word2index, max_length,
                                        pad=True)
        if new_word_correct is not None and new_word != new_word_correct:
            correct_numerical = word_to_index(new_word_correct, word2index,
                                              max_length, pad=True)
        else:
            correct_numerical = input_numerical
        # mix the wrong example with the hard samples
        batch_nove = random.sample(minion_group, min(number, len(minion_group)))
        if label == 0:
            batch_nove.append(
                [input_numerical, label, correct_numerical, 0, 'tunePair'])
        else:
            for w_index, (x, y) in enumerate(zip(new_word, new_word_correct)):
                if x != y:
                    break
            batch_nove.append([
                input_numerical, label, correct_numerical, w_index, 'tunePair'
            ])
        # target_tensor_tune = Variable(torch.tensor([x[2] for x in batch_nove]).type(Type))
        my_loss = 0
        for epoch in range(tune_epoch):
            optimizer.zero_grad()
            if show_model_status:
                # my_loss is an int before the first backward pass and a 0-dim
                # tensor afterwards, so cast to float before rounding
                print("epoch {} model training {} loss: {}".format(
                    epoch, model.training, round(float(my_loss), 4)))
                sample_eval('bicause', vocab2index, model)
            random.shuffle(batch_nove)
            train_tensor_tune = Variable(
                torch.tensor([x[0] for x in batch_nove]).type(Type))
            input_length_nove = [len(x[0]) for x in batch_nove]
            tags_train = [x[1] for x in batch_nove]
            pos_train = [x[3] for x in batch_nove]
            encoder_outputs, encoder_hidden = model(train_tensor_tune,
                                                    input_length_nove,
                                                    padded=False)
            encoder_last_outputs = encoder_outputs[:, -1, :]
            scores = model.projection(encoder_last_outputs)
            position_scores = model.position_projection(encoder_last_outputs)
            predicts = scores.argmax(dim=1).cpu().numpy()
            position_predicts = position_scores.argmax(dim=1).cpu().numpy()
            my_loss = lambda_mspl * criterion(scores, torch.tensor(tags_train).type(Type)) + \
                lambda_pos * criterion(position_scores, torch.tensor(pos_train).type(Type))
            my_loss.backward()
            optimizer.step()
            iter += 1
        print("Finish tuning of {}".format(new_word))
        print("avg_loss: {} acc: {} volume: {}".format(
            round(my_loss.item() / len(batch_nove), 6),
            round(sum(predicts == tags_train) / len(batch_nove), 4),
            len(batch_nove)))
        if log:
            logger.info("Finish tuning of {}".format(new_word))
            logger.info("avg_loss: {} acc: {} volume: {}".format(
                round(my_loss.item() / len(batch_nove), 6),
                round(sum(predicts == tags_train) / len(batch_nove), 4),
                len(batch_nove)))
    else:
        input_numerical = [
            word_to_index(x, word2index, max_length, pad=True)
            for x in new_word
        ]
        batch_nove = list(itertools.chain.from_iterable(minion_group))
        if new_word_correct is not None and new_word != new_word_correct:
            correct_numerical = [
                word_to_index(x, word2index, max_length, pad=True)
                for x in new_word_correct
            ]
            # TODO: add calculation of position
            batch_nove += [[
                input_, label, correct_,
                position_cal(input_, correct_, False, 'list'), 'tunePair'
            ] for input_, correct_ in zip(input_numerical, correct_numerical)]
        else:
            correct_numerical = input_numerical
            batch_nove += [[x, label, x, 0, 'tunePair']
                           for x in input_numerical]
        # batch_size = 10
        num_of_batches = int(len(batch_nove) / batch_size)
        hard_samples = []
        inconfident_number = []
        if len(dev_set) > 0:
            do_dev = True
            test_words, test_tags, test_pos_tags = dev_set
            test_tensor = Variable(
                torch.tensor([
                    word_to_index(x, word2index, max_length, pad=True)
                    for x in test_words
                ]).type(Type))
        epoch_loss = 0
        my_loss = torch.tensor(0)
        for epoch in range(tune_epoch):
            if show_model_status:
                print("epoch {} model training {} loss: {}".format(
                    epoch, model.training, round(my_loss.item(), 4)))
                # sample_eval('because', vocab2index, model)
            random.shuffle(batch_nove)
            acc = 0
            position_acc = 0
            # epoch_loss = 0
            if not keep_training:
                break
            for i in range(num_of_batches):
                if not keep_training:
                    break
                batch = batch_nove[i * batch_size:(i + 1) * batch_size]
                input_lengths = [len(x[0]) for x in batch]
                max_input_length = max(input_lengths) + 2
                max_target_length = max([len(x[2]) for x in batch]) + 2
                train_tensor = Variable(
                    torch.tensor([x[0] for x in batch]).type(Type))
                target_tensor = Variable(
                    torch.tensor([x[2] for x in batch]).type(Type))
                optimizer.zero_grad()
                tags_train = [x[1] for x in batch]
                pos_train = [x[3] for x in batch]
                # encoder_outputs of [batch, max_seq_len, hidden_size]
                # encoder_hidden of [2*layer, max_seq_len, hidden_size]
                encoder_outputs, encoder_hidden = model(train_tensor,
                                                        input_lengths,
                                                        padded=False)
                encoder_last_outputs = encoder_outputs[:, -1, :]
                scores = model.projection(encoder_last_outputs)
                predicts = scores.argmax(dim=1).cpu().numpy()
                scores_prob = softmax(scores.detach().numpy(), axis=1)
                position_scores = model.position_projection(
                    encoder_last_outputs)
                position_predicts = position_scores.argmax(dim=1).cpu().numpy()
                # samples whose positive-class probability falls in (0.45, 0.55)
                # are treated as "hard" (inconfident) samples
                inconfident_indexes_high = np.where(
                    scores_prob[:, 1] > 0.45)[0].tolist()
                inconfident_indexes_low = np.where(
                    scores_prob[:, 1] < 0.55)[0].tolist()
                inconfident_indexes = [
                    x for x in inconfident_indexes_low
                    if x in inconfident_indexes_high
                ]
                inconfident_number.append(len(inconfident_indexes))
                if lambda_pos != 0:
                    my_loss = lambda_mspl * criterion(scores, torch.tensor(tags_train).type(Type)) + \
                        lambda_pos * criterion(position_scores, torch.tensor(pos_train).type(Type))
                else:
                    my_loss = criterion(scores,
                                        torch.tensor(tags_train).type(Type))
                my_loss.backward()
                optimizer.step()
                acc += sum(predicts == tags_train)
                position_acc += sum(
                    [x == y for x, y in zip(position_predicts, pos_train)])
                epoch_loss += my_loss.item()
                iter += 1
                if do_dev and iter % eval_every_iter == 0:
                    div = eval_every_iter
                    model.eval()
                    encoder_outputs_eval, encoder_hidden_eval = model(
                        test_tensor, len(test_words), padded=False)
                    encoder_last_outputs_eval = encoder_outputs_eval[:, -1, :]
                    # TODO: auxiliary task of autoencoding?
                    # decoder_input = Variable(torch.LongTensor([v2i['<UNK>']] * batch_size))
                    # decoder_hidden = encoder_hidden[:decoder.n_layers]  # Use last (forward) hidden state from encoder
                    scores_eval = model.projection(encoder_last_outputs_eval)
                    predicts_eval = scores_eval.argmax(dim=1).cpu().numpy()
                    test_acc = sum(
                        [x == y for x, y in zip(predicts_eval, test_tags)])
                    position_scores_eval = model.position_projection(
                        encoder_last_outputs_eval)
                    position_predicts_eval = position_scores_eval.argmax(
                        dim=1).cpu().numpy()
                    test_position_acc = sum([
                        x == y
                        for x, y in zip(position_predicts_eval, test_pos_tags)
                    ])
                    model.train()
                    test_accuracy, _, _, f1 = metrics_cal(
                        predicts=predicts_eval, tags=test_tags, detail=False)
                    losses.append(epoch_loss / (div * batch_size))
                    accs.append(acc / (div * batch_size))
                    # test_accs.append(test_acc / len(test_words))
                    test_accs.append(test_accuracy)
                    test_accs_pos.append(test_position_acc / len(test_words))
                    dev_f1s.append(f1)
                    print("#hard samples: ", inconfident_number[-1])
                    epoch_log = "[Epoch {}][Iter {}] avg_loss: {} acc: {} dev acc: {} f1: {} pos: train {} dev {} volume: {}".format(
                        epoch, iter, round(epoch_loss / (div * batch_size), 4),
                        round(acc / data_size, 4), round(test_accuracy, 4),
                        round(f1, 4), round(position_acc / data_size, 4),
                        round(test_position_acc / len(test_words), 4),
                        len(batch_nove))
                    print(epoch_log)
                    if log:
                        logger.info(epoch_log)
                    if hard_sample_stop and len(inconfident_indexes) == 0:
                        keep_training = False
                        print("[iter {}] Empty hard sample ....".format(iter))
                        break
                    if test_acc / len(test_words) >= best_dev_acc:
                        best_dev_acc = test_acc / len(test_words)
                    elif test_acc / len(test_words) < best_dev_acc - acc_gap and \
                            early_stop and epoch > protect_epoch:
                        keep_training = False
                        print(test_acc / len(test_words), best_dev_acc,
                              test_acc / len(test_words) < best_dev_acc - acc_gap)
                        if log:
                            logger.info("{} {} {}".format(
                                test_acc / len(test_words), best_dev_acc,
                                test_acc / len(test_words) < best_dev_acc - acc_gap))
                        early_stop += 1
                        if early_stop > early_stop_tolerance:
                            print("[iter {}][lr={}] Early stopping ...".format(
                                iter, optimizer.param_groups[0]['lr']))
                            if log:
                                logger.info(
                                    "[iter {}][lr={}] Early stopping ...".format(
                                        iter, optimizer.param_groups[0]['lr']))
                            # keep_training = False
                            early_stop = 1
                            best_dev_acc = test_acc / len(test_words)
                            for param_group in optimizer.param_groups:
                                curr_lr = param_group['lr']
                                param_group['lr'] = curr_lr * lr_decay
                            if model_save:
                                name = './model/incrementalTraining/{}_vol{}_batch{}_epoch{}_iter{}_devAcc{}_devF1{}_lr{}.pth.tar'.format(
                                    summary['langcode'], len(new_word),
                                    batch_size, epoch, iter,
                                    round(test_accuracy, 3), round(f1, 3),
                                    curr_lr)
                                save_model(model, name)
                            if early_break:
                                break
                    epoch_loss = 0
        print("Finish tuning of {} tokens like {}".format(
            len(new_word), random.choice(new_word)))
        if log:
            logger.info("Finish tuning of {} tokens like {}".format(
                len(new_word), random.choice(new_word)))
    model.eval()
    summary['loss'].append(losses)
    summary['accuracy'].append(accs)
    if do_dev:
        summary['dev_acc'].append(test_accs)
        summary['dev_f1'].append(dev_f1s)
        summary['dev_acc_pos'].append(test_accs_pos)
    summary['trigger'].append(new_word)
    summary['protect_epoch'] = protect_epoch
    summary['epoch_stop'] = epoch
    if return_model:
        return minion_group, summary, model
    else:
        return minion_group, summary

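# A minimal, hypothetical call sketch for the batch mode of tuning_model; it is
# not part of the original code and only illustrates the argument shapes
# inferred from the function body. `minion_group` is a list of sample groups,
# each sample being [token_ids, label, correct_token_ids, error_position,
# source_tag]; `summary` must already hold the list-valued keys that
# tuning_model appends to; `Type` is assumed to be the index/target tensor type
# (e.g. torch.LongTensor, or the CUDA variant on GPU); Adam and
# CrossEntropyLoss are assumed choices, not necessarily the ones used upstream.
import torch
import torch.nn as nn


def tune_on_new_words(model, word2index, new_words, corrections, minion_group,
                      langcode='en', Type=torch.LongTensor):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # assumed optimizer
    criterion = nn.CrossEntropyLoss()                          # assumed loss
    summary = {'langcode': langcode, 'loss': [], 'accuracy': [],
               'dev_acc': [], 'dev_f1': [], 'dev_acc_pos': [], 'trigger': []}
    # batch mode, no dev split: the periodic dev evaluation inside
    # tuning_model is simply skipped
    return tuning_model(model, optimizer, word2index,
                        (new_words, corrections), label=1,
                        minion_group=minion_group, summary=summary,
                        tune_epoch=5, Type=Type, criterion=criterion,
                        batch=True, batch_size=10, dev_set=())
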
def languageModelDev(tokens_tags, trigramProb, bigramProb, threshold_chosen,
                     vocab_size, ngram=3, smoothing=True, k=1,
                     lengthAverage=True, metric='f1', criterion='gridSearch',
                     interval=0.05, show_result=True, return_likelihood=False,
                     note=''):
    tokens_test = list(
        itertools.chain([' '.join(x[0]).split() for x in tokens_tags]))
    tags_test = list(itertools.chain([int(x[1]) for x in tokens_tags]))
    tokens_trigram, _, _, _ = ngram_model(tokens_test,
                                          n=ngram,
                                          pad_left=True,
                                          pad_right=True,
                                          left_pad_symbol="<SOS>",
                                          right_pad_symbol="<EOS>")
    thresholds_acc = []
    thresholds_f1 = []
    best_threshold_acc = [0, 0]
    best_threshold_f1 = [0, 0]
    loglikelihoods = []
    for token_trigrams in tokens_trigram:
        token_score = 0
        for length, trigram in enumerate(token_trigrams):
            # token_score += np.log((trigramProb.get(trigram, 0) + smoothing) / (
            #     bigramProb.get(' '.join(trigram.split()[:-1]), 0) + bigramProb['vol']))
            token_score += np.log(
                (trigramProb.get(trigram, 0) + k * smoothing) /
                (bigramProb.get(' '.join(trigram.split()[:-1]), 0) +
                 k * vocab_size))
        if lengthAverage:
            loglikelihoods.append(token_score / (length + 1))
        else:
            loglikelihoods.append(token_score)

    # first evaluate with the threshold chosen on the training set
    predicts_lm_test = [int(x <= threshold_chosen) for x in loglikelihoods]
    acc, precision, recall, F1 = metrics_cal(predicts_lm_test, tags_test)
    if show_result:
        logger.info("Training Set Threshold")
        logger.info("Dev: {}".format(str(Counter(predicts_lm_test))))
        logger.info(
            "Trigram Language Model {}: acc: {} precision: {} recall: {} F1: {}"
            .format(note, round(acc, 2), round(precision, 2), round(recall, 2),
                    round(F1, 2)))

    # then grid-search the threshold that gives the best accuracy / F1 on dev
    for threshold_sample in np.arange(np.min(loglikelihoods),
                                      np.max(loglikelihoods), interval):
        predicts_lm = [int(x <= threshold_sample) for x in loglikelihoods]
        # print(Counter(predicts_lm))
        acc, precision, recall, F1 = metrics_cal(predicts_lm, tags_test)
        # print("acc: ", acc, " F1: ", F1)
        if acc > best_threshold_acc[0]:
            best_threshold_acc[0] = acc
            best_threshold_acc[1] = threshold_sample
        if F1 > best_threshold_f1[0]:
            best_threshold_f1[0] = F1
            best_threshold_f1[1] = threshold_sample
        thresholds_acc.append(acc)
        thresholds_f1.append(F1)
    if metric == 'f1':
        dev_threshold = best_threshold_f1[1]
    elif metric == 'acc':
        dev_threshold = best_threshold_acc[1]
    predicts_lm_dev = [int(x <= dev_threshold) for x in loglikelihoods]
    acc, precision, recall, F1 = metrics_cal(predicts_lm_dev, tags_test)
    if show_result:
        print("Dev Set Threshold")
        logger.info("Dev Set Threshold")
        logger.info(Counter(predicts_lm_dev))
        logger.info(
            "Trigram Language Model {}: acc: {} precision: {} recall: {} F1: {}"
            .format(note, round(acc, 2), round(precision, 2), round(recall, 2),
                    round(F1, 2)))
    if return_likelihood:
        return acc, precision, recall, F1, dev_threshold, loglikelihoods
    else:
        return acc, precision, recall, F1, dev_threshold

def languageModelTrain(tokens_tags,
                       ngram=3,
                       smoothing=True,
                       k=1,
                       lengthAverage=True,
                       metric='f1',
                       criterion='gridSearch',
                       interval=0.05,
                       show_result=True,
                       return_likelihood=False,
                       setting=None):
    # Use a None sentinel instead of a mutable default dict so that repeated
    # calls do not share (and keep appending to) the same 'loglikelihoods' list.
    if setting is None:
        setting = {
            'setting': 'train',
            'ngramInfo': {
                2: {},
                3: {},
                'vocab': []
            },
            'loglikelihoods': [],
            'tokens': [],
            'tags': []
        }
    tokens = tokens_tags
    try:
        tokens_train = list(
            itertools.chain([' '.join(x[0]).split() for x in tokens]))
        tags_train = list(itertools.chain([int(x[1]) for x in tokens]))
    except:
        pdb.set_trace()
    # The language model has to recompute every log-likelihood from scratch
    # (unless we trade exactness for a cached approximation).
    loglikelihoods = setting['loglikelihoods']
    print('length of loglikelihoods: ', len(loglikelihoods))
    # print("train setting: {}".format(setting['setting']))
    if setting['setting'] == 'train':
        tokens_trigram, trigramProb, bigramProb, vocab = ngram_model(
            tokens_train,
            n=ngram,
            pad_left=True,
            pad_right=True,
            left_pad_symbol="<SOS>",
            right_pad_symbol="<EOS>")
        V = len(vocab)
    elif setting['setting'] == 'update':
        tokens_trigram, trigramProb, bigramProb, vocab = ngram_model_update(
            tokens_train,
            n=ngram,
            pad_left=True,
            pad_right=True,
            left_pad_symbol="<SOS>",
            right_pad_symbol="<EOS>",
            ngramProbs=setting['ngramInfo'])
        # combine with the n-grams of the original corpus
        corpus_ngrams = text2ngram([[x for x in y] for y in setting['tokens']],
                                   n=ngram,
                                   pad_left=True,
                                   pad_right=True,
                                   left_pad_symbol="<SOS>",
                                   right_pad_symbol="<EOS>")
        tokens_trigram += corpus_ngrams
        tags_train += setting['tags']
        vocab = setting['ngramInfo']['vocab']
        V = len(vocab)
    for token_trigrams in tokens_trigram:
        token_score = 0
        for length, trigram in enumerate(token_trigrams):
            # add-k smoothed trigram score: log((C(w1 w2 w3) + k) / (C(w1 w2) + k * |V|))
            # token_score += np.log(
            #     (trigramProb.get(trigram, 0) + smoothing) / (
            #         bigramProb.get(' '.join(trigram.split()[:-1]), 0) + bigramProb['vol']))
            token_score += np.log(
                (trigramProb.get(trigram, 0) + k * smoothing) /
                (bigramProb.get(' '.join(trigram.split()[:-1]), 0) + k * V))
        if lengthAverage:
            loglikelihoods.append(token_score / (length + 1))
        else:
            loglikelihoods.append(token_score)
    thresholds_acc = []
    thresholds_f1 = []
    best_threshold_acc = [0, 0]
    best_threshold_f1 = [0, 0]
    # choose the threshold that gives the best accuracy / F1 on the training set
    for threshold_sample in np.arange(np.min(loglikelihoods),
                                      np.max(loglikelihoods), interval):
        predicts_lm = [int(x <= threshold_sample) for x in loglikelihoods]
        acc, precision, recall, F1 = metrics_cal(predicts_lm, tags_train)
        if acc > best_threshold_acc[0]:
            best_threshold_acc[0] = acc
            best_threshold_acc[1] = threshold_sample
        if F1 > best_threshold_f1[0]:
            best_threshold_f1[0] = F1
            best_threshold_f1[1] = threshold_sample
        thresholds_acc.append(acc)
        thresholds_f1.append(F1)
    if metric == 'f1':
        threshold_chosen = best_threshold_f1[1]
    elif metric == 'acc':
        threshold_chosen = best_threshold_acc[1]
    predicts_lm_train = [int(x <= threshold_chosen) for x in loglikelihoods]
    acc, precision, recall, F1 = metrics_cal(predicts_lm_train, tags_train)
    if show_result:
        print(Counter(predicts_lm_train))
        logger.info("average loglikelihood: {} max: {} min: {}".format(
            np.mean(loglikelihoods), np.max(loglikelihoods),
            np.min(loglikelihoods)))
        logger.info("threshold: {}".format(threshold_chosen))
        logger.info('Trigram Language Model train' +
                    ": acc: {} precision: {} recall: {} F1: {}".format(
                        round(acc, 2), round(precision, 2), round(recall, 2),
                        round(F1, 2)))
    if return_likelihood:
        return trigramProb, bigramProb, threshold_chosen, loglikelihoods, V
    else:
        return trigramProb, bigramProb, threshold_chosen, V

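# A minimal sketch (not in the original module) of how the three language-model
# helpers above fit together: fit the n-gram counts and a decision threshold on
# the training split, re-tune the threshold on the dev split, then report
# metrics on the test split. `train_pairs`, `dev_pairs` and `test_pairs` are
# hypothetical lists of (token_list, tag) pairs in the same format these
# functions already expect.
def run_trigram_lm(train_pairs, dev_pairs, test_pairs):
    trigramProb, bigramProb, train_threshold, V = languageModelTrain(
        train_pairs, ngram=3, k=1, metric='f1')
    # dev_threshold is the grid-searched threshold chosen on the dev split
    *_, dev_threshold = languageModelDev(
        dev_pairs, trigramProb, bigramProb, train_threshold, V, note='dev')
    # returns (acc, precision, recall, F1) on the test split
    return languageModelEval(
        test_pairs, trigramProb, bigramProb, dev_threshold, V, note='test')
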