Example #1
def languageModelEval(tokens_tags,
                      trigramProb,
                      bigramProb,
                      threshold_chosen,
                      vocab_size,
                      ngram=3,
                      smoothing=True,
                      k=1,
                      lengthAverage=True,
                      metric='f1',
                      criterion='gridSearch',
                      interval=0.05,
                      show_result=True,
                      return_likelihood=False,
                      note=''):

    tokens_test = [' '.join(x[0]).split() for x in tokens_tags]
    tags_test = [int(x[1]) for x in tokens_tags]
    tokens_trigram, _, _, _ = ngram_model(tokens_test,
                                          n=ngram,
                                          pad_left=True,
                                          pad_right=True,
                                          left_pad_symbol="<SOS>",
                                          right_pad_symbol="<EOS>")

    loglikelihoods = []

    for token_trigrams in tokens_trigram:
        token_score = 0
        for length, trigram in enumerate(token_trigrams):
            # add-k smoothed trigram log-probability:
            # log((C(w1 w2 w3) + k) / (C(w1 w2) + k * |V|))
            # `smoothing` acts as a 0/1 flag; with smoothing off, unseen
            # trigrams yield -inf
            token_score += np.log(
                (trigramProb.get(trigram, 0) + k * smoothing) /
                (bigramProb.get(' '.join(trigram.split()[:-1]), 0) +
                 k * smoothing * vocab_size))
        # `length` ends at the last index, so length + 1 is the trigram count
        if lengthAverage:
            loglikelihoods.append(token_score / (length + 1))
        else:
            loglikelihoods.append(token_score)

    predicts_lm_test = [int(x <= threshold_chosen) for x in loglikelihoods]
    acc, precision, recall, F1 = metrics_cal(predicts_lm_test, tags_test)

    if show_result:
        logger.info(Counter(predicts_lm_test))
        logger.info(
            "Trigram Language Model {}: acc: {} precision: {} recall: {} F1: {}"
            .format(note, round(acc, 2), round(precision, 2), round(recall, 2),
                    round(F1, 2)))

    if return_likelihood:
        return acc, precision, recall, F1, loglikelihoods
    else:
        return acc, precision, recall, F1
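
The helpers ngram_model and metrics_cal are defined elsewhere in the project (sketches of each appear after Examples #5 and #2 below). A hypothetical call pattern, with illustrative toy data, using the counts and threshold produced by languageModelTrain (Example #5):

# Illustrative usage sketch; the toy data and variable names are
# assumptions, not from the source project.
train_pairs = [(list('hello'), 0), (list('helo'), 1)]
dev_pairs = [(list('world'), 0), (list('wrold'), 1)]

trigramProb, bigramProb, threshold, V = languageModelTrain(train_pairs)
acc, precision, recall, F1 = languageModelEval(dev_pairs,
                                               trigramProb,
                                               bigramProb,
                                               threshold,
                                               vocab_size=V,
                                               note='dev')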
Example #2
def evaluate_wrapper(test_words,
                     labels,
                     vocab2index,
                     model,
                     Type,
                     args,
                     activation='lstm',
                     title="Evaluation",
                     print_result=True,
                     return_prob=False,
                     show_dist=False,
                     sklearn_mode='macro',
                     best_acc=0.0,
                     best_f1=0.0):
    # best_acc / best_f1 are the best scores seen so far, reported alongside
    # the current metrics (previously read as undefined free variables)
    pred_probs, predicts = evaluation(test_words, vocab2index, activation,
                                      model, Type, args)
    # decisions2Tag is a module-level mapping from class indexes to tag labels
    predicts_tags = [decisions2Tag[x] for x in predicts]
    if show_dist:
        print("labels: ", Counter(labels))
        print("predicts: ", Counter(predicts_tags))
        logger.info("labels: " + str(Counter(labels)))
        logger.info("predicts: " + str(Counter(predicts_tags)))

    acc, precision, recall, F1 = metrics_cal(predicts_tags,
                                             labels,
                                             sklearn_mode=sklearn_mode)

    if print_result:
        result_log = (
            title +
            ": acc: {} (best {}) precision: {} recall: {} F1: {} (best {})".
            format(round(acc, 2), round(best_acc, 2), round(precision, 2),
                   round(recall, 2), round(F1, 2), round(best_f1, 2)))
        print(result_log)
        logger.info(result_log)

    if return_prob:
        return acc, precision, recall, F1, pred_probs
    else:
        return acc, precision, recall, F1
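
metrics_cal is used throughout these examples but not shown. A minimal sketch, under the assumption that it wraps scikit-learn, matching the call signatures seen above (positional (predicts, tags), optional sklearn_mode and detail); the real implementation may differ:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def metrics_cal(predicts, tags, sklearn_mode='binary', detail=False):
    # Minimal sketch of the helper assumed by these examples.
    # Returns (accuracy, precision, recall, F1); `detail` is accepted for
    # API compatibility and ignored here.
    acc = accuracy_score(tags, predicts)
    precision, recall, f1, _ = precision_recall_fscore_support(
        tags, predicts, average=sklearn_mode, zero_division=0)
    return acc, precision, recall, f1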
Example #3
def tuning_model(model,
                 optimizer,
                 word2index,
                 new_word_set,
                 label,
                 minion_group,
                 summary,
                 tune_epoch,
                 Type,
                 criterion,
                 number=20,
                 batch=False,
                 early_stop=False,
                 batch_size=10,
                 protect_epoch=0,
                 weights=[0.5, 0.5],
                 eval_every_iter=50,
                 hard_sample_stop=False,
                 dev_set=(),
                 model_save=False,
                 lr_decay=1,
                 acc_gap=0.01,
                 early_break=True,
                 stop_strategy='early_stop',
                 log=False,
                 return_model=False,
                 show_model_status=False):

    # TODO rotate the member in the group
    max_length = 16
    print("Tuning model...")
    if log:
        logger.info("Tuning model...")
    model.train()
    new_word, new_word_correct = new_word_set
    minion_group_size = sum([len(x) for x in minion_group])
    if isinstance(new_word, list):
        data_size = minion_group_size + len(new_word)
    elif isinstance(new_word, str):
        data_size = minion_group_size + 1  # for new_word
    else:
        data_size = minion_group_size
    best_acc = 0
    best_dev_acc = 0
    early_stop_tolerance = 1
    losses = []
    accs = []
    position_accs = []
    test_accs = []
    test_accs_pos = []
    dev_f1s = []
    iter = 0  # iteration counter (note: shadows the builtin iter())
    keep_training = True
    do_dev = False

    if stop_strategy == 'oneEpoch':
        tune_epoch = 1

    lambda_mspl, lambda_pos = [x / sum(weights) for x in weights]

    if not batch:
        input_numerical = word_to_index(new_word,
                                        word2index,
                                        max_length,
                                        pad=True)

        if new_word_correct is not None and new_word != new_word_correct:
            correct_numerical = word_to_index(new_word_correct,
                                              word2index,
                                              max_length,
                                              pad=True)
        else:
            correct_numerical = input_numerical

        # mix the wrong example in with the hard samples
        batch_nove = random.sample(minion_group, min(number,
                                                     len(minion_group)))
        if label == 0:
            batch_nove.append(
                [input_numerical, label, correct_numerical, 0, 'tunePair'])
        else:
            # position label: index of the first character that differs
            # between the wrong word and its correction
            w_index = 0
            for w_index, (x, y) in enumerate(zip(new_word, new_word_correct)):
                if x != y:
                    break
            batch_nove.append([
                input_numerical, label, correct_numerical, w_index, 'tunePair'
            ])

        my_loss = 0
        for epoch in range(tune_epoch):
            optimizer.zero_grad()
            if show_model_status:
                print("epoch {} model training {} loss: {}".format(
                    epoch, model.training, round(float(my_loss), 4)))
                # probe the model on a known misspelling
                sample_eval('bicause', word2index, model)

            random.shuffle(batch_nove)
            train_tensor_tune = Variable(
                torch.tensor([x[0] for x in batch_nove]).type(Type))
            input_length_nove = [len(x[0]) for x in batch_nove]
            tags_train = [x[1] for x in batch_nove]
            pos_train = [x[3] for x in batch_nove]
            encoder_outputs, encoder_hidden = model(train_tensor_tune,
                                                    input_length_nove,
                                                    padded=False)
            # use the last time step's output as the sequence representation
            encoder_last_outputs = encoder_outputs[:, -1, :]
            scores = model.projection(encoder_last_outputs)
            position_scores = model.position_projection(encoder_last_outputs)
            predicts = scores.argmax(dim=1).cpu().numpy()
            position_predicts = position_scores.argmax(dim=1).cpu().numpy()
            my_loss = lambda_mspl * criterion(scores, torch.tensor(tags_train).type(Type)) + \
                      lambda_pos * criterion(position_scores, torch.tensor(pos_train).type(Type))
            my_loss.backward()
            optimizer.step()
            iter += 1

        print("Finish tuning of {}".format(new_word))
        print("avg_loss: {} acc: {} volume: {}".format(
            round(my_loss.item() / len(batch_nove), 6),
            round(sum(predicts == tags_train) / len(batch_nove), 4),
            len(batch_nove)))
        if log:
            logger.info("Finish tuning of {}".format(new_word))
            logger.info("avg_loss: {} acc: {} volume: {}".format(
                round(my_loss.item() / len(batch_nove), 6),
                round(sum(predicts == tags_train) / len(batch_nove), 4),
                len(batch_nove)))
    else:
        input_numerical = [
            word_to_index(x, word2index, max_length, pad=True)
            for x in new_word
        ]
        batch_nove = list(itertools.chain.from_iterable(minion_group))

        if new_word_correct is not None and new_word != new_word_correct:
            correct_numerical = [
                word_to_index(x, word2index, max_length, pad=True)
                for x in new_word_correct
            ]

            # TODO: add calculation of position
            batch_nove += [[
                input_, label, correct_,
                position_cal(input_, correct_, False, 'list'), 'tunePair'
            ] for input_, correct_ in zip(input_numerical, correct_numerical)]

        else:
            correct_numerical = input_numerical
            batch_nove += [[x, label, x, 0, 'tunePair']
                           for x in input_numerical]

        num_of_batches = len(batch_nove) // batch_size

        hard_samples = []
        inconfident_number = []

        if len(dev_set) > 0:
            do_dev = True
            test_words, test_tags, test_pos_tags = dev_set
            test_tensor = Variable(
                torch.tensor([
                    word_to_index(x, word2index, max_length, pad=True)
                    for x in test_words
                ]).type(Type))

        epoch_loss = 0
        my_loss = torch.tensor(0)
        for epoch in range(tune_epoch):
            if show_model_status:
                print("epoch {} model training {} loss: {}".format(
                    epoch, model.training, round(my_loss.item(), 4)))
                # sample_eval('because', vocab2index, model)
            random.shuffle(batch_nove)
            acc = 0
            position_acc = 0
            # epoch_loss = 0
            if not keep_training:
                break
            for i in range(num_of_batches):
                if not keep_training:
                    break
                minibatch = batch_nove[i * batch_size:(i + 1) * batch_size]
                input_lengths = [len(x[0]) for x in minibatch]
                train_tensor = Variable(
                    torch.tensor([x[0] for x in minibatch]).type(Type))
                # target_tensor is currently unused; kept for the
                # autoencoding auxiliary task noted in the TODO below
                target_tensor = Variable(
                    torch.tensor([x[2] for x in minibatch]).type(Type))
                optimizer.zero_grad()
                tags_train = [x[1] for x in minibatch]
                pos_train = [x[3] for x in minibatch]

                # encoder_outputs of [batch, max_seq_len, hidden_size]
                # encoder_hidden of [2*layer, max_seq_len, hidden_size]
                encoder_outputs, encoder_hidden = model(train_tensor,
                                                        input_lengths,
                                                        padded=False)
                encoder_last_outputs = encoder_outputs[:, -1, :]

                scores = model.projection(encoder_last_outputs)
                predicts = scores.argmax(dim=1).cpu().numpy()
                scores_prob = softmax(scores.detach().cpu().numpy(), axis=1)

                position_scores = model.position_projection(
                    encoder_last_outputs)
                position_predicts = position_scores.argmax(dim=1).cpu().numpy()

                # indexes of samples the classifier is unsure about
                # (P(class 1) within (0.45, 0.55))
                inconfident_indexes_high = np.where(
                    scores_prob[:, 1] > 0.45)[0].tolist()
                inconfident_indexes_low = np.where(
                    scores_prob[:, 1] < 0.55)[0].tolist()
                inconfident_indexes = [
                    x for x in inconfident_indexes_low
                    if x in inconfident_indexes_high
                ]
                inconfident_number.append(len(inconfident_indexes))
                if lambda_pos != 0:
                    my_loss = lambda_mspl * criterion(scores, torch.tensor(tags_train).type(Type)) + \
                              lambda_pos * criterion(position_scores, torch.tensor(pos_train).type(Type))
                else:
                    my_loss = criterion(scores,
                                        torch.tensor(tags_train).type(Type))
                my_loss.backward()
                optimizer.step()

                acc += sum(predicts == tags_train)
                position_acc += sum(
                    [x == y for x, y in zip(position_predicts, pos_train)])
                epoch_loss += my_loss.item()

                iter += 1
                if do_dev and iter % eval_every_iter == 0:
                    div = eval_every_iter
                    model.eval()
                    encoder_outputs_eval, encoder_hidden_eval = model(
                        test_tensor, len(test_words), padded=False)
                    encoder_last_outputs_eval = encoder_outputs_eval[:, -1, :]

                    # TODO: auxiliary task of autoencoding?
                    # decoder_input = Variable(torch.LongTensor([v2i['<UNK>']] * batch_size))
                    # decoder_hidden = encoder_hidden[:decoder.n_layers]  # Use last (forward) hidden state from encoder
                    scores_eval = model.projection(encoder_last_outputs_eval)
                    predicts_eval = scores_eval.argmax(dim=1).cpu().numpy()
                    test_acc = sum(
                        [x == y for x, y in zip(predicts_eval, test_tags)])

                    position_scores_eval = model.position_projection(
                        encoder_last_outputs_eval)
                    position_predicts_eval = position_scores_eval.argmax(
                        dim=1).cpu().numpy()
                    test_position_acc = sum([
                        x == y
                        for x, y in zip(position_predicts_eval, test_pos_tags)
                    ])
                    model.train()

                    test_accuracy, _, _, f1 = metrics_cal(
                        predicts=predicts_eval, tags=test_tags, detail=False)
                    losses.append(epoch_loss / (div * batch_size))
                    accs.append(acc / (div * batch_size))
                    test_accs.append(test_accuracy)
                    test_accs_pos.append(test_position_acc / len(test_words))
                    dev_f1s.append(f1)
                    print("#hard samples: ", inconfident_number[-1])
                    epoch_log = "[Epoch {}][Iter {}] avg_loss: {} acc: {} dev acc: {} f1: {} pos: train {} dev {} volume: {}".format(
                        epoch, iter, round(epoch_loss / (div * batch_size), 4),
                        round(acc / data_size, 4), round(test_accuracy, 4),
                        round(f1, 4), round(position_acc / data_size, 4),
                        round(test_position_acc / len(test_words), 4),
                        len(batch_nove))
                    print(epoch_log)

                    if log:
                        logger.info(epoch_log)

                    if hard_sample_stop and len(inconfident_indexes) == 0:
                        keep_training = False
                        print("[iter {}]Empty hard sample ....".format(iter))
                        break

                    if test_acc / len(test_words) >= best_dev_acc:
                        best_dev_acc = test_acc / len(test_words)

                    elif test_acc / len(test_words) < best_dev_acc - acc_gap and \
                            early_stop and epoch > protect_epoch:
                        keep_training = False
                        print(
                            test_acc / len(test_words), best_dev_acc,
                            test_acc / len(test_words) <
                            best_dev_acc - acc_gap)
                        if log:
                            logger.info("{} {} {}".format(
                                test_acc / len(test_words), best_dev_acc,
                                test_acc / len(test_words) <
                                best_dev_acc - acc_gap))
                        # `early_stop` doubles as a patience counter here
                        # (True + 1 == 2 on the first trigger)
                        early_stop += 1
                        if early_stop > early_stop_tolerance:
                            print("[iter{}][lr={}]Early stopping ...".format(
                                iter, optimizer.param_groups[0]['lr']))
                            if log:
                                logger.info(
                                    "[iter{}][lr={}]Early stopping ...".format(
                                        iter, optimizer.param_groups[0]['lr']))
                            # keep_training = False
                            early_stop = 1
                            best_dev_acc = test_acc / len(test_words)

                            for param_group in optimizer.param_groups:
                                curr_lr = param_group['lr']
                                param_group['lr'] = curr_lr * lr_decay

                            if model_save:
                                name = './model/incrementalTraining/{}_vol{}_batch{}_epoch{}_iter{}_devAcc{}_devF1{}_lr{}.pth.tar'.format(
                                    summary['langcode'], len(new_word),
                                    batch_size, epoch, iter,
                                    round(test_accuracy, 3), round(f1,
                                                                   3), curr_lr)
                                save_model(model, name)

                            if early_break:
                                break

                    epoch_loss = 0

        print("Finish tuning of {} tokens like {}".format(
            len(new_word), random.choice(new_word)))
        if log:
            logger.info("Finish tuning of {} tokens like {}".format(
                len(new_word), random.choice(new_word)))

    model.eval()
    summary['loss'].append(losses)
    summary['accuracy'].append(accs)
    if do_dev:
        summary['dev_acc'].append(test_accs)
        summary['dev_f1'].append(dev_f1s)
        summary['dev_acc_pos'].append(test_accs_pos)
    summary['trigger'].append(new_word)
    summary['protect_epoch'] = protect_epoch
    summary['epoch_stop'] = epoch
    if return_model:
        return minion_group, summary, model
    else:
        return minion_group, summary
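
word_to_index numericalizes a word before it is fed to the model. A minimal sketch, assuming a character-level vocabulary with '<PAD>' and '<UNK>' entries (both symbol names are assumptions, not from the source):

def word_to_index(word, word2index, max_length, pad=True):
    # Minimal sketch of the numericalization helper assumed above: map each
    # character to its vocabulary index, fall back to '<UNK>' for unknown
    # characters, and right-pad / truncate to max_length when pad=True.
    indexes = [word2index.get(ch, word2index['<UNK>']) for ch in word]
    if pad:
        indexes = indexes[:max_length]
        indexes += [word2index['<PAD>']] * (max_length - len(indexes))
    return indexes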
Example #4
def languageModelDev(tokens_tags,
                     trigramProb,
                     bigramProb,
                     threshold_chosen,
                     vocab_size,
                     ngram=3,
                     smoothing=True,
                     k=1,
                     lengthAverage=True,
                     metric='f1',
                     criterion='gridSearch',
                     interval=0.05,
                     show_result=True,
                     return_likelihood=False,
                     note=''):

    tokens_test = [' '.join(x[0]).split() for x in tokens_tags]
    tags_test = [int(x[1]) for x in tokens_tags]
    tokens_trigram, _, _, _ = ngram_model(tokens_test,
                                          n=ngram,
                                          pad_left=True,
                                          pad_right=True,
                                          left_pad_symbol="<SOS>",
                                          right_pad_symbol="<EOS>")

    thresholds_acc = []
    thresholds_f1 = []
    best_threshold_acc = [0, 0]
    best_threshold_f1 = [0, 0]

    loglikelihoods = []
    for token_trigrams in tokens_trigram:
        token_score = 0
        for length, trigram in enumerate(token_trigrams):
            # add-k smoothed trigram log-probability:
            # log((C(w1 w2 w3) + k) / (C(w1 w2) + k * |V|))
            # `smoothing` acts as a 0/1 flag; with smoothing off, unseen
            # trigrams yield -inf
            token_score += np.log(
                (trigramProb.get(trigram, 0) + k * smoothing) /
                (bigramProb.get(' '.join(trigram.split()[:-1]), 0) +
                 k * smoothing * vocab_size))
        if lengthAverage:
            loglikelihoods.append(token_score / (length + 1))
        else:
            loglikelihoods.append(token_score)

    # use the threshold chosen on the training set to report dev performance
    predicts_lm_test = [int(x <= threshold_chosen) for x in loglikelihoods]
    acc, precision, recall, F1 = metrics_cal(predicts_lm_test, tags_test)
    if show_result:
        logger.info("Training Set Threshold")
        logger.info("Dev: {}".format(str(Counter(predicts_lm_test))))
        logger.info(
            "Trigram Language Model {}: acc: {} precision: {} recall: {} F1: {}"
            .format(note, round(acc, 2), round(precision, 2), round(recall, 2),
                    round(F1, 2)))

    # grid-search the threshold that gives the best F1 (or accuracy) here
    for threshold_sample in np.arange(np.min(loglikelihoods),
                                      np.max(loglikelihoods), interval):
        predicts_lm = [int(x <= threshold_sample) for x in loglikelihoods]
        acc, precision, recall, F1 = metrics_cal(predicts_lm, tags_test)

        if acc > best_threshold_acc[0]:
            best_threshold_acc[0] = acc
            best_threshold_acc[1] = threshold_sample

        if F1 > best_threshold_f1[0]:
            best_threshold_f1[0] = F1
            best_threshold_f1[1] = threshold_sample

        thresholds_acc.append(acc)
        thresholds_f1.append(F1)

    if metric == 'f1':
        dev_threshold = best_threshold_f1[1]
    elif metric == 'acc':
        dev_threshold = best_threshold_acc[1]

    predicts_lm_dev = [int(x <= dev_threshold) for x in loglikelihoods]
    acc, precision, recall, F1 = metrics_cal(predicts_lm_dev, tags_test)

    if show_result:
        print("Dev Set Threshold")
        logger.info("Dev Set Threshold")
        logger.info(Counter(predicts_lm_dev))
        logger.info(
            "Trigram Language Model {}: acc: {} precision: {} recall: {} F1: {}"
            .format(note, round(acc, 2), round(precision, 2), round(recall, 2),
                    round(F1, 2)))

    if return_likelihood:
        return acc, precision, recall, F1, dev_threshold, loglikelihoods
    else:
        return acc, precision, recall, F1, dev_threshold
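
Examples #4 and #5 repeat the same threshold grid search inline. The logic can be read in isolation as the sketch below (the name choose_threshold is illustrative; a token is predicted as misspelled, label 1, when its log-likelihood falls at or below the threshold):

def choose_threshold(loglikelihoods, tags, interval=0.05, metric='f1'):
    # Standalone sketch of the grid search used in Examples #4 and #5:
    # scan candidate thresholds between the min and max log-likelihood
    # and keep the one maximizing F1 (or accuracy).
    best_score, best_threshold = 0, 0
    for threshold in np.arange(np.min(loglikelihoods),
                               np.max(loglikelihoods), interval):
        predicts = [int(x <= threshold) for x in loglikelihoods]
        acc, _, _, f1 = metrics_cal(predicts, tags)
        score = f1 if metric == 'f1' else acc
        if score > best_score:
            best_score, best_threshold = score, threshold
    return best_threshold, best_score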
Example #5
def languageModelTrain(
    tokens_tags,
    ngram=3,
    smoothing=True,
    k=1,
    lengthAverage=True,
    metric='f1',
    criterion='gridSearch',
    interval=0.05,
    show_result=True,
    return_likelihood=False,
    setting=None):
    # avoid a mutable default argument: a default dict literal here would be
    # shared (and its 'loglikelihoods' mutated) across calls
    if setting is None:
        setting = {
            'setting': 'train',
            'ngramInfo': {
                2: {},
                3: {},
                'vocab': []
            },
            'loglikelihoods': [],
            'tokens': [],
            'tags': []
        }
    tokens = tokens_tags
    tokens_train = [' '.join(x[0]).split() for x in tokens]
    tags_train = [int(x[1]) for x in tokens]

    # the language model recomputes every log-likelihood from scratch
    # (unless accuracy is traded off for an approximation)
    loglikelihoods = setting['loglikelihoods']
    print('length of loglikelihoods: ', len(loglikelihoods))
    if setting['setting'] == 'train':
        tokens_trigram, trigramProb, bigramProb, vocab = ngram_model(
            tokens_train,
            n=ngram,
            pad_left=True,
            pad_right=True,
            left_pad_symbol="<SOS>",
            right_pad_symbol="<EOS>")
        V = len(vocab)
    elif setting['setting'] == 'update':
        tokens_trigram, trigramProb, bigramProb, vocab = ngram_model_update(
            tokens_train,
            n=ngram,
            pad_left=True,
            pad_right=True,
            left_pad_symbol="<SOS>",
            right_pad_symbol="<EOS>",
            ngramProbs=setting['ngramInfo'])
        # combine with the n-grams of the original corpus
        corpus_ngrams = text2ngram([[x for x in y] for y in setting['tokens']],
                                   n=ngram,
                                   pad_left=True,
                                   pad_right=True,
                                   left_pad_symbol="<SOS>",
                                   right_pad_symbol="<EOS>")
        tokens_trigram += corpus_ngrams
        tags_train += setting['tags']
        vocab = setting['ngramInfo']['vocab']
        V = len(vocab)

    for token_trigrams in tokens_trigram:
        token_score = 0

        for length, trigram in enumerate(token_trigrams):
            # add-k smoothed trigram log-probability:
            # log((C(w1 w2 w3) + k) / (C(w1 w2) + k * |V|))
            # `smoothing` acts as a 0/1 flag; with smoothing off, unseen
            # trigrams yield -inf
            token_score += np.log(
                (trigramProb.get(trigram, 0) + k * smoothing) /
                (bigramProb.get(' '.join(trigram.split()[:-1]), 0) +
                 k * smoothing * V))

        if lengthAverage:
            loglikelihoods.append(token_score / (length + 1))
        else:
            loglikelihoods.append(token_score)

    thresholds_acc = []
    thresholds_f1 = []
    best_threshold_acc = [0, 0]
    best_threshold_f1 = [0, 0]

    # grid-search the threshold that gives the best F1 (or accuracy)
    for threshold_sample in np.arange(np.min(loglikelihoods),
                                      np.max(loglikelihoods), interval):
        predicts_lm = [int(x <= threshold_sample) for x in loglikelihoods]
        acc, precision, recall, F1 = metrics_cal(predicts_lm, tags_train)

        if acc > best_threshold_acc[0]:
            best_threshold_acc[0] = acc
            best_threshold_acc[1] = threshold_sample

        if F1 > best_threshold_f1[0]:
            best_threshold_f1[0] = F1
            best_threshold_f1[1] = threshold_sample

        thresholds_acc.append(acc)
        thresholds_f1.append(F1)

    if metric == 'f1':
        threshold_chosen = best_threshold_f1[1]
    elif metric == 'acc':
        threshold_chosen = best_threshold_acc[1]

    predicts_lm_train = [int(x <= threshold_chosen) for x in loglikelihoods]
    acc, precision, recall, F1 = metrics_cal(predicts_lm_train, tags_train)

    if show_result:
        print(Counter(predicts_lm_train))
        logger.info("average loglikelihood: {} max: {} min: {}".format(
            np.mean(loglikelihoods), np.max(loglikelihoods),
            np.min(loglikelihoods)))
        logger.info("threshold: {}".format(threshold_chosen))
        logger.info('Trigram Language Model train' +
                    ": acc: {} precision: {} recall: {} F1: {}".format(
                        round(acc, 2), round(precision, 2), round(recall, 2),
                        round(F1, 2)))

    if return_likelihood:
        return trigramProb, bigramProb, threshold_chosen, loglikelihoods, V
    else:
        return trigramProb, bigramProb, threshold_chosen, V
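
ngram_model is the other helper these functions rely on. Judging from how its outputs are used, trigramProb/bigramProb are raw counts keyed by space-joined n-grams (smoothing happens in the callers), and the first return value holds each token's n-grams. A minimal sketch consistent with that usage (the real helper, along with ngram_model_update and text2ngram, is defined elsewhere and may differ):

from collections import Counter

def ngram_model(tokens, n=3, pad_left=True, pad_right=True,
                left_pad_symbol="<SOS>", right_pad_symbol="<EOS>"):
    # Minimal sketch: `tokens` is a list of character sequences. Despite
    # their names, the returned "Prob" dicts hold raw counts.
    ngram_counts = Counter()
    context_counts = Counter()
    vocab = set()
    per_token_ngrams = []
    for token in tokens:
        seq = list(token)
        vocab.update(seq)
        if pad_left:
            seq = [left_pad_symbol] * (n - 1) + seq
        if pad_right:
            seq = seq + [right_pad_symbol] * (n - 1)
        grams = [' '.join(seq[i:i + n]) for i in range(len(seq) - n + 1)]
        per_token_ngrams.append(grams)
        ngram_counts.update(grams)
        context_counts.update(' '.join(seq[i:i + n - 1])
                              for i in range(len(seq) - n + 2))
    return per_token_ngrams, dict(ngram_counts), dict(context_counts), vocab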