Example 1
    def __init__(self, jaccard_thresh=0.5, neg_pos=3, focal=False, device='cpu'):
        super(MultiBoxLoss, self).__init__()
        self.jaccard_thresh = jaccard_thresh  # 0.5: Jaccard-overlap threshold used by the match function
        self.negpos_ratio = neg_pos  # 3:1 negative-to-positive ratio for Hard Negative Mining
        self.device = device  # whether to run the computation on CPU or GPU
        self.floss = focal
        if focal:
            from utils.focalloss import FocalLoss
            self.focal = FocalLoss()
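Every example in this collection imports a project-local FocalLoss whose definition is not shown. For orientation only, here is a minimal sketch of a multi-class focal loss in the usual formulation FL(p_t) = -(1 - p_t)^gamma * log(p_t); the gamma and reduce arguments mirror the constructors used in these snippets, but the actual utils.focalloss.FocalLoss may differ.

import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Minimal focal-loss sketch: FL(p_t) = -(1 - p_t)^gamma * log(p_t)."""

    def __init__(self, gamma=2.0, reduce=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, logits, target):
        # unreduced cross entropy gives -log(p_t) per sample
        ce = F.cross_entropy(logits, target, reduction='none')
        pt = torch.exp(-ce)  # probability assigned to the true class
        loss = (1.0 - pt) ** self.gamma * ce  # down-weight easy examples
        return loss.mean() if self.reduce else loss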
Example 2
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    num_labels = NUM_EMO

    vocab_size = VOCAB_SIZE

    print('NUM of VOCAB: ' + str(vocab_size))
    train_data = EmotionDataLoader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

    dev_data = EmotionDataLoader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data,
                            batch_size=int(BATCH_SIZE / 3) + 2,
                            shuffle=False)

    test_data = EmotionDataLoader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data,
                             batch_size=int(BATCH_SIZE / 3) + 2,
                             shuffle=False)

    model = AttentionLSTMClassifier(EMBEDDING_DIM,
                                    HIDDEN_DIM,
                                    vocab_size,
                                    num_labels,
                                    BATCH_SIZE,
                                    att_mode=opt.attention,
                                    soft_last=False,
                                    use_glove=USE_GLOVE,
                                    add_linear=ADD_LINEAR,
                                    max_pool=MAX_POOLING)

    if USE_GLOVE:
        model.load_embedding(tokenizer.get_embeddings())
    # multi-GPU
    # model = nn.DataParallel(model)
    model.cuda()

    if opt.loss == 'ce':
        loss_criterion = nn.CrossEntropyLoss()
        print('Using ce loss')
    elif opt.loss == 'focal':
        loss_criterion = FocalLoss(gamma=opt.focal, reduce=True)
        print('Using focal loss, gamma=', opt.focal)
    else:
        raise Exception('loss option not recognised')

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    es = EarlyStopping(patience=PATIENCE)

    old_model = None
    for epoch in range(1, 300):
        print('Epoch: ' + str(epoch) + '===================================')
        train_loss = 0
        model.train()
        for i, (data, seq_len,
                label) in tqdm(enumerate(train_loader),
                               total=len(train_data) / BATCH_SIZE):
            optimizer.zero_grad()

            data_text = [tokenizer.decode_ids(x) for x in data]
            with torch.no_grad():
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

                emoji_tokenized, _, _ = st.tokenize_sentences(
                    [' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(
                    torch.LongTensor(emoji_tokenized.astype(np.int32)))

            y_pred = model(data.cuda(), seq_len, elmo_emb,
                           emoji_encoding.cuda())
            loss = loss_criterion(y_pred, label.view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            train_loss += loss.data.cpu().numpy() * data.shape[0]
            del y_pred, loss

        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            with torch.no_grad():

                data_text = [tokenizer.decode_ids(x) for x in _data]
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

                emoji_tokenized, _, _ = st.tokenize_sentences(
                    [' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(
                    torch.LongTensor(emoji_tokenized.astype(np.int32)))

                y_pred = model(_data.cuda(), _seq_len, elmo_emb,
                               emoji_encoding.cuda())
                loss = loss_criterion(y_pred, _label.view(-1).cuda())
                test_loss += loss.data.cpu().numpy() * _data.shape[0]
                del y_pred, loss

        print("Train Loss: " + str(train_loss / len(train_data)) + \
              " Evaluation: " + str(test_loss / len(dev_data)))

        if es.step(test_loss):  # overfitting
            del model
            print('overfitting, loading best model ...')
            model = old_model
            break
        else:
            if es.is_best():
                if old_model is not None:
                    del old_model
                print('saving best model ...')
                old_model = deepcopy(model)
            else:
                print('not best model, ignoring ...')
                if old_model is None:
                    old_model = deepcopy(model)

    with open(f'lstm_elmo_deepmoji_{opt.dataset}_model.pt', 'wb') as f:
        torch.save(model.state_dict(), f)

    pred_list = []
    model.eval()
    for _, (_data, _seq_len, _label) in enumerate(test_loader):
        with torch.no_grad():
            data_text = [tokenizer.decode_ids(x) for x in _data]
            character_ids = batch_to_ids(data_text).cuda()
            elmo_emb = elmo(character_ids)['elmo_representations']
            elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

            emoji_tokenized, _, _ = st.tokenize_sentences(
                [' '.join(x) for x in data_text])
            emoji_encoding = emoji_model(
                torch.LongTensor(emoji_tokenized.astype(np.int32)))

            y_pred = model(_data.cuda(), _seq_len, elmo_emb,
                           emoji_encoding.cuda())
            pred_list.append(
                y_pred.data.cpu().numpy())  # x[np.where( x > 3.0 )]
            del y_pred

    pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)

    return pred_list
Example 3
    def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        train_data_set = DataSet(X_train, y_train, SENT_PAD_LEN)
        train_data_loader = DataLoader(train_data_set,
                                       batch_size=BATCH_SIZE,
                                       shuffle=True)

        dev_data_set = DataSet(X_dev, y_dev, SENT_PAD_LEN)
        dev_data_loader = DataLoader(dev_data_set,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        gradient_accumulation_steps = 1
        num_train_steps = int(
            len(train_data_set) / BATCH_SIZE / gradient_accumulation_steps *
            MAX_EPOCH)

        pred_list_test_best = None
        final_pred_best = None
        # If the model diverges, retrain from scratch
        while True:
            is_diverged = False
            model = BERT_classifer.from_pretrained(BERT_MODEL)
            model.add_output_layer(BERT_MODEL, NUM_EMO)
            model = nn.DataParallel(model)
            model.cuda()

            # BERT optimizer
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'gamma', 'beta']
            optimizer_grouped_parameters = [
                {
                    'params': [p for n, p in param_optimizer
                               if not any(nd in n for nd in no_decay)],
                    'weight_decay_rate': 0.01
                },
                {
                    'params': [p for n, p in param_optimizer
                               if any(nd in n for nd in no_decay)],
                    'weight_decay_rate': 0.0
                },
            ]

            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=learning_rate,
                                 warmup=0.1,
                                 t_total=num_train_steps)

            if opt.w == 1:
                weight_list = [0.3, 0.3, 0.3, 1.7]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            elif opt.w == 2:
                weight_list = [
                    0.3198680179, 0.246494733, 0.2484349259, 1.74527696
                ]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            else:
                raise ValueError

            weight_list = [x**FLAT for x in weight_list]
            weight_label = torch.Tensor(weight_list).cuda()

            weight_list_binary = [x**FLAT for x in weight_list_binary]
            weight_binary = torch.Tensor(weight_list_binary).cuda()
            print('binary loss reweight:', weight_list_binary)
            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            if opt.loss == 'focal':
                loss_criterion = FocalLoss(gamma=opt.focal, reduce=False)
                loss_criterion_binary = FocalLoss(gamma=opt.focal, reduce=False)
            elif opt.loss == 'ce':
                loss_criterion = nn.CrossEntropyLoss(reduce=False)
                loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)
            else:
                raise Exception('loss option not recognised')

            loss_criterion_emo_only = nn.MSELoss()

            # es = EarlyStopping(min_delta=0.005, patience=EARLY_STOP_PATIENCE)
            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            final_pred_best = None
            final_pred_list_test = None
            pred_list_test = None
            for num_epoch in range(MAX_EPOCH):
                print('Begin training epoch:', num_epoch)
                sys.stdout.flush()
                train_loss = 0
                model.train()
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in tqdm(enumerate(train_data_loader),
                                         total=len(train_data_set) /
                                         BATCH_SIZE):
                    optimizer.zero_grad()

                    if USE_TOKEN_TYPE:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(),
                                                   segments.cuda())
                    else:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda())

                    loss_label = loss_criterion(pred,
                                                e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                 e_c.view(-1).shape[0]

                    loss_binary = loss_criterion_binary(
                        pred2,
                        e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(
                        torch.gather(weight_binary, 0,
                                     e_c_binary.view(-1).cuda()),
                        loss_binary) / e_c.view(-1).shape[0]

                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())

                    loss = (loss_label + LAMBDA1 * loss_binary +
                            LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                    # training trilogy
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss.data.cpu().numpy() * tokens.shape[0]

                    del loss, pred

                # Evaluate
                model.eval()
                dev_loss = 0
                # pred_list = []
                # gold_list = []
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in enumerate(dev_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, pred2, pred3 = model(tokens.cuda(),
                                                       masks.cuda(),
                                                       segments.cuda())
                        else:
                            pred, pred2, pred3 = model(tokens.cuda(),
                                                       masks.cuda())

                        loss_label = loss_criterion(
                            pred,
                            e_c.view(-1).cuda()).cuda()
                        loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                     e_c.view(-1).shape[0]

                        loss_binary = loss_criterion_binary(
                            pred2,
                            e_c_binary.view(-1).cuda()).cuda()
                        loss_binary = torch.matmul(
                            torch.gather(weight_binary, 0,
                                         e_c_binary.view(-1).cuda()),
                            loss_binary) / e_c.view(-1).shape[0]

                        loss_emo = loss_criterion_emo_only(
                            pred3, e_c_emo.cuda())

                        loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 *
                                loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                        dev_loss += loss.data.cpu().numpy() * tokens.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss

                # pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)
                # gold_list = np.concatenate(gold_list, axis=0)
                print('Training loss:',
                      train_loss / len(train_data_set),
                      end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))
                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)
                # checking diverge
                if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    if num_epoch == 1:
                        is_diverged = True
                        final_pred_best = deepcopy(final_pred_list_test)
                        pred_list_test_best = deepcopy(pred_list_test)
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is not None:
                            del pred_list_test_best
                        pred_list_test_best = deepcopy(pred_list_test)
                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is None:
                            pred_list_test_best = deepcopy(pred_list_test)

                print('Gold Dev ...')
                pred_list_test = []
                model.eval()
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in enumerate(gold_dev_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, _, _ = model(tokens.cuda(), masks.cuda(),
                                               segments.cuda())
                        else:
                            pred, _, _ = model(tokens.cuda(), masks.cuda())
                        pred_list_test.append(pred.data.cpu().numpy())

                pred_list_test = np.argmax(np.concatenate(pred_list_test,
                                                          axis=0),
                                           axis=1)
                # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

                print('Gold Test ...')
                final_pred_list_test = []
                model.eval()
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in enumerate(gold_test_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, _, _ = model(tokens.cuda(), masks.cuda(),
                                               segments.cuda())
                        else:
                            pred, _, _ = model(tokens.cuda(), masks.cuda())
                        final_pred_list_test.append(pred.data.cpu().numpy())

                final_pred_list_test = np.argmax(np.concatenate(
                    final_pred_list_test, axis=0),
                                                 axis=1)
                # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

            if is_diverged:
                print("Reinitialize model ...")
                del model
                continue
            all_fold_results.append(pred_list_test_best)
            real_test_results.append(final_pred_best)

            del model
            break
Example 4
    def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        train_data_set = TrainDataSet(X_train,
                                      y_train,
                                      CONV_PAD_LEN,
                                      SENT_PAD_LEN,
                                      word2id,
                                      use_unk=True)

        dev_data_set = TrainDataSet(X_dev,
                                    y_dev,
                                    CONV_PAD_LEN,
                                    SENT_PAD_LEN,
                                    word2id,
                                    use_unk=True)
        dev_data_loader = DataLoader(dev_data_set,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        pred_list_test_best = None
        final_pred_best = None
        # If the model diverges, retrain from scratch
        while True:
            is_diverged = False
            # Model is defined in HierarchicalPredictor
            model = HierarchicalPredictor(SENT_EMB_DIM,
                                          SENT_HIDDEN_SIZE,
                                          num_of_vocab,
                                          USE_ELMO=True,
                                          ADD_LINEAR=False)
            model.load_embedding(emb)
            model.deepmoji_model.load_specific_weights(
                PRETRAINED_PATH, exclude_names=['output_layer'])
            model.cuda()
            # model = nn.DataParallel(model)
            # model.to(device)
            optimizer = optim.Adam(model.parameters(),
                                   lr=learning_rate,
                                   amsgrad=True)  #
            # optimizer = optim.SGD(model.parameters(), lr=learning_rate)
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                               gamma=opt.gamma)

            if opt.w == 1:
                weight_list = [0.3, 0.3, 0.3, 1.7]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            elif opt.w == 2:
                weight_list = [
                    0.3198680179, 0.246494733, 0.2484349259, 1.74527696
                ]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            else:
                raise ValueError

            weight_list = [x**FLAT for x in weight_list]
            weight_label = torch.Tensor(weight_list).cuda()

            weight_list_binary = [x**FLAT for x in weight_list_binary]
            weight_binary = torch.Tensor(weight_list_binary).cuda()
            print('classification reweight: ', weight_list)
            print('binary loss reweight:', weight_list_binary)
            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            if opt.loss == 'focal':
                loss_criterion = FocalLoss(gamma=opt.focal, reduce=False)
                loss_criterion_binary = FocalLoss(gamma=opt.focal, reduce=False)
            elif opt.loss == 'ce':
                loss_criterion = nn.CrossEntropyLoss(reduce=False)
                loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)
            else:
                raise Exception('loss option not recognised')

            loss_criterion_emo_only = nn.MSELoss()

            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            # best_model = None
            final_pred_list_test = None
            pred_list_test = None
            for num_epoch in range(MAX_EPOCH):
                # re-create the loader to ensure shuffling at every epoch
                train_data_loader = DataLoader(train_data_set,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

                print('Begin training epoch:', num_epoch, end='...\t')
                sys.stdout.flush()

                # stepping scheduler
                scheduler.step(num_epoch)
                print('Current learning rate', scheduler.get_lr())

                train_loss = 0
                model.train()
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo) \
                        in tqdm(enumerate(train_data_loader), total=len(train_data_set)/BATCH_SIZE):
                    optimizer.zero_grad()
                    elmo_a = elmo_encode(a)
                    elmo_b = elmo_encode(b)
                    elmo_c = elmo_encode(c)

                    pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(),
                                               b_len, c.cuda(), c_len,
                                               emoji_a.cuda(), emoji_b.cuda(),
                                               emoji_c.cuda(), elmo_a, elmo_b,
                                               elmo_c)

                    loss_label = loss_criterion(pred,
                                                e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                 e_c.view(-1).shape[0]

                    loss_binary = loss_criterion_binary(
                        pred2,
                        e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(
                        torch.gather(weight_binary, 0,
                                     e_c_binary.view(-1).cuda()),
                        loss_binary) / e_c.view(-1).shape[0]

                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())

                    loss = (loss_label + LAMBDA1 * loss_binary +
                            LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                    # loss = torch.matmul(torch.gather(weight, 0, trg.view(-1).cuda()), loss) / trg.view(-1).shape[0]

                    # training trilogy
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss.data.cpu().numpy() * a.shape[0]
                    del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo

                # Evaluate
                model.eval()
                dev_loss = 0
                # pred_list = []
                # gold_list = []
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo)\
                        in enumerate(dev_data_loader):
                    with torch.no_grad():

                        elmo_a = elmo_encode(a)
                        elmo_b = elmo_encode(b)
                        elmo_c = elmo_encode(c)

                        pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(),
                                                   b_len, c.cuda(), c_len,
                                                   emoji_a.cuda(),
                                                   emoji_b.cuda(),
                                                   emoji_c.cuda(), elmo_a,
                                                   elmo_b, elmo_c)

                        loss_label = loss_criterion(
                            pred,
                            e_c.view(-1).cuda()).cuda()
                        loss_label = torch.matmul(
                            torch.gather(weight_label, 0,
                                         e_c.view(-1).cuda()),
                            loss_label) / e_c.view(-1).shape[0]

                        loss_binary = loss_criterion_binary(
                            pred2,
                            e_c_binary.view(-1).cuda()).cuda()
                        loss_binary = torch.matmul(
                            torch.gather(weight_binary, 0,
                                         e_c_binary.view(-1).cuda()),
                            loss_binary) / e_c.view(-1).shape[0]

                        loss_emo = loss_criterion_emo_only(
                            pred3, e_c_emo.cuda())

                        loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 *
                                loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                        dev_loss += loss.data.cpu().numpy() * a.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo

                print('Training loss:',
                      train_loss / len(train_data_set),
                      end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))
                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)
                if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is not None:
                            del pred_list_test_best
                        pred_list_test_best = deepcopy(pred_list_test)
                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is None:
                            pred_list_test_best = deepcopy(pred_list_test)

                # Gold Dev testing...
                print('Gold Dev testing....')
                pred_list_test = []
                model.eval()
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b,
                        emoji_c) in enumerate(gold_dev_data_loader):
                    with torch.no_grad():
                        elmo_a = elmo_encode(a)  # , __id2word=ex_id2word
                        elmo_b = elmo_encode(b)
                        elmo_c = elmo_encode(c)

                        pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len,
                                           c.cuda(), c_len, emoji_a.cuda(),
                                           emoji_b.cuda(), emoji_c.cuda(),
                                           elmo_a, elmo_b, elmo_c)

                        pred_list_test.append(pred.data.cpu().numpy())
                    del elmo_a, elmo_b, elmo_c, a, b, c, pred
                pred_list_test = np.argmax(np.concatenate(pred_list_test,
                                                          axis=0),
                                           axis=1)
                # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

                # Testing
                print('Gold test testing...')
                final_pred_list_test = []
                model.eval()
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b,
                        emoji_c) in enumerate(test_data_loader):
                    with torch.no_grad():
                        elmo_a = elmo_encode(a)  # , __id2word=ex_id2word
                        elmo_b = elmo_encode(b)
                        elmo_c = elmo_encode(c)

                        pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len,
                                           c.cuda(), c_len, emoji_a.cuda(),
                                           emoji_b.cuda(), emoji_c.cuda(),
                                           elmo_a, elmo_b, elmo_c)

                        final_pred_list_test.append(pred.data.cpu().numpy())
                    del elmo_a, elmo_b, elmo_c, a, b, c, pred
                final_pred_list_test = np.argmax(np.concatenate(
                    final_pred_list_test, axis=0),
                                                 axis=1)
                # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

            if is_diverged:
                print("Reinitialize model ...")
                del model
                continue

            all_fold_results.append(pred_list_test_best)
            real_test_results.append(final_pred_best)
            del model
            break
Example 5
    def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        # for one fold, test data comes from k fold split.
        train_data_set = TrainDataSet(X_train,
                                      y_train,
                                      EMAI_PAD_LEN,
                                      SENT_PAD_LEN,
                                      word2id,
                                      use_unk=True)

        dev_data_set = TrainDataSet(X_dev,
                                    y_dev,
                                    EMAI_PAD_LEN,
                                    SENT_PAD_LEN,
                                    word2id,
                                    use_unk=True)
        dev_data_loader = DataLoader(dev_data_set,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        final_pred_best = None

        # If the model diverges, retrain from scratch
        while True:
            is_diverged = False
            # Model is defined in HierarchicalPredictor
            model = HierarchicalAttPredictor(SENT_EMB_DIM,
                                             SENT_HIDDEN_SIZE,
                                             CTX_LSTM_DIM,
                                             num_of_vocab,
                                             SENT_PAD_LEN,
                                             id2word,
                                             USE_ELMO=True,
                                             ADD_LINEAR=False)
            model.load_embedding(emb)
            model.deepmoji_model.load_specific_weights(
                PRETRAINED_PATH, exclude_names=['output_layer'])
            model.cuda()

            # model = nn.DataParallel(model)
            # model.to(device)

            optimizer = optim.Adam(model.parameters(),
                                   lr=learning_rate,
                                   amsgrad=True)  #
            # optimizer = optim.SGD(model.parameters(), lr=learning_rate)
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                               gamma=GAMMA)

            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            if opt.loss == 'focal':
                loss_criterion = FocalLoss(gamma=opt.focal)
            elif opt.loss == 'ce':
                loss_criterion = nn.BCELoss()
            else:
                raise Exception('loss option not recognised')

            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            final_pred_list_test = None

            result_print = {}

            for num_epoch in range(MAX_EPOCH):

                # re-create the loader to ensure shuffling at every epoch
                train_data_loader = DataLoader(train_data_set,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

                print('Begin training epoch:', num_epoch, end='...\t')
                sys.stdout.flush()

                # stepping scheduler
                scheduler.step(num_epoch)
                print('Current learning rate', scheduler.get_lr())

                ## Training step
                train_loss = 0
                model.train()

                for i, (a, a_len, emoji_a, e_c) \
                        in tqdm(enumerate(train_data_loader), total=len(train_data_set)/BATCH_SIZE):

                    optimizer.zero_grad()
                    e_c = e_c.type(torch.float)
                    pred = model(a.cuda(), a_len, emoji_a.cuda())
                    loss_label = loss_criterion(pred.squeeze(1),
                                                e_c.view(-1).cuda()).cuda()

                    # training trilogy
                    loss_label.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss_label.data.cpu().numpy() * a.shape[0]
                    del pred, loss_label

                ## Evaluation step
                model.eval()
                dev_loss = 0
                # pred_list = []
                for i, (a, a_len, emoji_a, e_c) in enumerate(dev_data_loader):

                    with torch.no_grad():
                        e_c = e_c.type(torch.float)
                        pred = model(a.cuda(), a_len, emoji_a.cuda())

                        loss_label = loss_criterion(
                            pred.squeeze(1),
                            e_c.view(-1).cuda()).cuda()

                        dev_loss += loss_label.data.cpu().numpy() * a.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss_label

                print('Training loss:',
                      train_loss / len(train_data_set),
                      end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))

                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)

                # Gold Test testing
                print('Final test testing...')
                final_pred_list_test = []
                model.eval()

                for i, (a, a_len,
                        emoji_a) in enumerate(final_test_data_loader):

                    with torch.no_grad():

                        pred = model(a.cuda(), a_len, emoji_a.cuda())

                        final_pred_list_test.append(pred.data.cpu().numpy())
                    del a, pred
                print("final_pred_list_test", len(final_pred_list_test))
                final_pred_list_test = np.concatenate(final_pred_list_test,
                                                      axis=0)
                final_pred_list_test = np.squeeze(final_pred_list_test, axis=1)
                print("final_pred_list_test_concat", len(final_pred_list_test))

                accuracy, precision, recall, f1 = get_metrics(
                    np.asarray(final_test_target_list),
                    np.asarray(final_pred_list_test))

                result_print.update(
                    {num_epoch: [accuracy, precision, recall, f1]})

                if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)

                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)

            with open(result_path, 'wb') as w:
                pkl.dump(result_print, w)

            if is_diverged:
                print("Reinitialize model ...")
                del model

                continue

            real_test_results.append(np.asarray(final_pred_best))
            # saving model for inference
            torch.save(model.state_dict(), opt.out_path)
            del model
            break
Example 6
# settings for the training phase
model = PCB(class_num=4768)
# load pretrained parameters, excluding the classifier

if use_gpu:
    model = model.cuda()

# set the criterion
triplet_selector = SemihardNegativeTripletSelector(opt.margin)
criterion_tri = OnlineTripletLoss(opt.margin, triplet_selector)

criterion_part = nn.CrossEntropyLoss()
# criterion_part=CrossEntropyLabelSmooth(4768)
criterion_center = CenterLoss(4768)
criterion_focal = FocalLoss(gamma=2)

# parameter update rule: keep classifier/fc params out of the base group
ignored_params = list(map(id, model.model.fc.parameters()))
ignored_params += (
    list(map(id, model.classifier0.parameters())) +
    list(map(id, model.classifier1.parameters())) +
    list(map(id, model.classifier2.parameters())) +
    list(map(id, model.classifier3.parameters())) +
    list(map(id, model.classifier4.parameters())) +
    list(map(id, model.classifier5.parameters())) +
    list(map(id, model.classifier6.parameters()))
    # +list(map(id, model.classifier7.parameters() ))
)
base_params = filter(lambda p: id(p) not in ignored_params, model.parameters())
optimizer_ft = optim.SGD(
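The snippet above is truncated in the middle of the optim.SGD( call, so its actual argument list is not shown. Purely as a hypothetical illustration of how such a backbone/classifier parameter split is commonly fed to SGD (the learning rates, momentum, and weight decay below are assumptions, not values from this example):

# Hypothetical completion, for illustration only; the original arguments are truncated above.
optimizer_ft = optim.SGD(
    [
        {'params': base_params, 'lr': 0.01},                  # assumed backbone lr
        {'params': model.model.fc.parameters(), 'lr': 0.1},   # assumed head lr
        {'params': model.classifier0.parameters(), 'lr': 0.1},
        # ... one group per part classifier collected in ignored_params
    ],
    lr=0.01,                  # assumed default for groups that do not set their own lr
    momentum=0.9, weight_decay=5e-4, nesterov=True)           # assumed hyperparameters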
Example 7
def train(args):
    # step0: parse config
    best_acc = 0
    new_config = {
        "model": args.model,
        "num_workers": args.num_workers,
        "batch_size": args.batch_size,
        "load_model_path": args.load_model_path
    }
    opt.parse(new_config)
    # step1:model
    model = getattr(models, opt.model)()

    # step2: data
    dataset = getattr(datasets, opt.dataset)
    train_data = dataset(opt.train_data_root, train=True)
    val_data = dataset(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  pin_memory=True,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                pin_memory=True,
                                shuffle=False,
                                num_workers=opt.num_workers)
    # step3: criterion and optimizer
    #criterion = torch.nn.CrossEntropyLoss()
    criterion = FocalLoss(gamma=2.0)
    lr = opt.lr
    optimizer = torch.optim.Adam(model.parameters(),
                                 opt.lr,
                                 weight_decay=opt.weight_decay)
    # step4: meters
    loss_meter = meter.AverageValueMeter()
    acc_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10
    if opt.load_model_path is None:
        opt.load_model_path = get_lastest_model(prefix=opt.model)
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model).cuda()
    else:
        model.to(opt.device)
    if opt.load_model_path:
        model.load_state_dict(torch.load(opt.load_model_path))
        model.eval()
        _, best_acc = val(model, val_dataloader)
        logging.info("Resuming from " + opt.load_model_path + " with acc: " +
                     str(best_acc))
    prefix = 'output/' + opt.model
    # train
    for epoch in range(opt.max_epoch):
        model.train()
        loss_meter.reset()
        acc_meter.reset()
        confusion_matrix.reset()
        nIters = len(train_dataloader)
        pbar = tqdm(train_dataloader)
        start = time.time()
        for iter, (data, label) in enumerate(pbar):
            # train model
            input = data.to(opt.device)
            target = label.to(opt.device)
            optimizer.zero_grad()
            y_pred = model(input)
            loss = criterion(y_pred, target)
            prec1 = accuracy(y_pred.data, target)
            loss.backward()
            optimizer.step()
            # meters update
            loss_meter.add(loss.item())
            acc_meter.add(prec1[0].item())
            confusion_matrix.add(y_pred.detach(), target.detach())
            if sys.stderr.isatty():
                log_str = "{epoch}: Loss:{loss.val:.5f} Acc:{acc.val:.3f}".format(
                    epoch=epoch, loss=loss_meter, acc=acc_meter)
                pbar.set_description(log_str)
            else:
                if iter % opt.print_freq == 0:
                    log_str = "{iter}/{len}: Loss:{loss.val:.5f} Acc:{acc.val:.3f}".format(
                        iter=iter, len=nIters, loss=loss_meter, acc=acc_meter)
                    logging.info(log_str)
        logging.info(log_str)
        # validate and visualize
        end = time.time()
        if not sys.stderr.isatty():
            logging.info(str(epoch) + ": time " + str(end - start) + "s")
        val_cm, val_accuracy = val(model, val_dataloader)
        if val_accuracy > best_acc:
            best_acc = val_accuracy
            #name = time.strftime(prefix + '_%m%d_%H:%M:%S.pth')
            name = prefix + "_best.pth"
            torch.save(model.state_dict(), name)
        torch.save(model.state_dict(), prefix + "_last.pth")
        logging.info("Val {epoch}: Loss: {loss}, Acc: {acc}, lr: {lr}".format(
            epoch=epoch, acc=val_accuracy, loss=loss_meter.value()[0], lr=lr))
        #logging.info("confusion_matrix:{val_cm}".format(val_cm = str(val_cm.value())))
        # update learning rate
        if loss_meter.value()[0] > previous_loss:
            if lr > 1e-5:
                lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
Example 8
                                       weight_decay=weigth_decay)
            elif optim_type.lower() == 'sgd':
                optimizer = optim.SGD(model.parameters(),
                                      lr=lr,
                                      weight_decay=weigth_decay)
            else:
                raise Exception('Other optimizer is not supported')

            if loss_type.lower() == 'cross_entropy':
                if torch.cuda.is_available():
                    criterion = nn.CrossEntropyLoss().cuda()
                else:
                    criterion = nn.CrossEntropyLoss()
            elif loss_type.lower() == 'focal':
                if torch.cuda.is_available():
                    criterion = FocalLoss(n_label).cuda()
                else:
                    criterion = FocalLoss(n_label)
            elif loss_type.lower() == 'binary_cross_entropy':
                if torch.cuda.is_available():
                    criterion = nn.BCEWithLogitsLoss().cuda()
                else:
                    criterion = nn.BCEWithLogitsLoss()
            else:
                raise Exception('%s loss is not supported' % loss_type.lower())
            print_progress('TRAIN config Done')
        else:
            raise Exception('TRAIN should be configured in config file')
    else:
        if 'DECODE' in sessions:
            print_progress('Start DECODE config')
Example 9
        model.backbone.apply(weights_init)  # TODO@LYC: Init Header
        # model.head.apply(weights_init_without_kaiming) # not very effective
        print("model initiated without pretrain")
    for p in model.parameters():
        p.requires_grad = True

    print("\tLearning Rate:", LEARNING_RATE)
    print("\tBatch Size:", batch_size)
    print()

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = WarmupMultiStepLR(optimizer,
                                  milestones=[],
                                  warmup_iters=len(train_loader))

    criterion = FocalLoss()

    USE_NORMAL_LOSS = True  # clip abnormally large losses

    for epoch in range(max_epochs):
        if USE_NORMAL_LOSS:
            normal_loss = -1
        for it, images in enumerate(train_loader):
            layout_image = images[0].to(device)
            heat_image = images[1].to(device)
            m = model(heat_image)

            # loss = F.binary_cross_entropy(m, layout_image, reduction="mean")
            loss = criterion(m, layout_image)

            if USE_NORMAL_LOSS and epoch > 0: