# Standard dependencies for both examples below; project-local helpers
# (loadpickle, AverageMeter, shuffle, CNN / TextCNN, test, multilabelTop1,
# multilabelHits) are assumed to be importable from the surrounding repo.
import copy
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from torch.autograd import Variable
from torch.optim import lr_scheduler


def train(train_set, val_set, data_info, params, embedding_size=300):
    # print("loading word2vec...")
    pretrained_w2v = loadpickle('/home/zwei/Dev/TextClassifications/z_implementations/pre_extract_w2v/params/selftrained_extracted_w2v_wordnet_synsets_py3.pl')
    # pretrained_w2v = loadpickle('/home/zwei/Dev/TextClassifications/z_implementations/pre_extract_w2v/params/googlenews_extracted_w2v_wordnet_synsets_py3.pl')

    # Build the embedding matrix: use the pretrained vector when available, otherwise a small random init.
    wv_matrix = []
    words_not_found = []
    for i in range(len(data_info["vocab"])):
        word = data_info["idx2tag"][i]
        if word in pretrained_w2v:
            wv_matrix.append(pretrained_w2v[word])
        else:
            words_not_found.append(word)
            # print("{} not found in dictrionary, will use random".format(word))
            wv_matrix.append(np.random.uniform(-0.01, 0.01, embedding_size).astype("float32"))
    print("{} words were not found".format(len(words_not_found)))
    # one for UNK and one for zero padding
    wv_matrix.append(np.random.uniform(-0.01, 0.01, embedding_size).astype("float32"))
    wv_matrix.append(np.zeros(embedding_size).astype("float32"))
    wv_matrix = np.array(wv_matrix)
    params["WV_MATRIX"] = wv_matrix

    model = CNN(**params).cuda(params["GPU"])

    # materialize the filter into a list so it can be reused by clip_grad_norm_ below
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    criterion = nn.CrossEntropyLoss()
    scheduler = lr_scheduler.MultiStepLR(
        optimizer, params['Learning_SCHEDULE'], gamma=0.1)
    max_dev_acc = 0
    best_model = None
    model.train()
    for e in range(params["EPOCH"]):
        train_set = shuffle(train_set)
        train_losses = AverageMeter()
        train_accuracies = AverageMeter()
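        # epoch-level LR decay (MultiStepLR), stepped at the start of the epoch (pre-1.1 PyTorch convention)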
        scheduler.step()
        current_lr = optimizer.param_groups[0]['lr']
        model.train()

        for i in tqdm.tqdm(range(0, len(train_set), params["BATCH_SIZE"])):
            batch_range = min(params["BATCH_SIZE"], len(train_set) - i)

            # word dropout: randomly drop each word with probability drop_thre
            batch_x = []

            for sent in train_set[i:i + batch_range]:
                x_sent = sent[0]
                drop_thre = 0.2
                x_collected_words = []
                for x_word in x_sent:
                    p = random.uniform(0, 1)
                    if p >= drop_thre:
                        x_collected_words.append(data_info["tag2idx"][x_word])
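                # truncate to MAX_SENT_LEN, or pad with the zero-embedding index (VOCAB_SIZE + 1)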
                if len(x_collected_words) >= params["MAX_SENT_LEN"]:
                    batch_x.append(x_collected_words[:params["MAX_SENT_LEN"]])
                else:
                    batch_x.append(x_collected_words + [params["VOCAB_SIZE"] + 1] * (params["MAX_SENT_LEN"] - len(x_collected_words)))

            batch_y_numpy = [c[1] for c in train_set[i:i + batch_range]]

            batch_x = Variable(torch.LongTensor(batch_x)).cuda(params["GPU"])
            batch_y = Variable(torch.FloatTensor(batch_y_numpy)).cuda(params["GPU"])

            optimizer.zero_grad()
            pred = model(batch_x)
            # loss = criterion(pred, batch_y)

            log_softmax_output = F.log_softmax(pred, dim=1)

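            # soft-target cross-entropy: batch_y holds (multi-)label weights; average the weighted NLL over the batch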
            loss = - torch.sum(log_softmax_output * batch_y) / pred.shape[0]

            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=params["NORM_LIMIT"])
            optimizer.step()
            pred = np.argmax(pred.cpu().data.numpy(), axis=1)

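            # a prediction counts as correct when its argmax class has nonzero weight in the label vector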
            acc = sum([1 if y[p] > 0 else 0 for p, y in zip(pred, batch_y_numpy)]) / len(pred)
            train_losses.update(loss.item(), batch_range)
            train_accuracies.update(acc, batch_range)


        dev_acc, dev_loss = test(val_set, data_info, model, params, criterion)

        # if params["EARLY_STOPPING"] and dev_acc <= pre_dev_acc:
        #     print("early stopping by dev_acc!")
        #     break
        # else:
        #     pre_dev_acc = dev_acc

        if dev_acc > max_dev_acc:
            max_dev_acc = dev_acc
            best_model = copy.deepcopy(model)

        print("epoch:", e + 1, ' lr:', '{:.6f}'.format(current_lr),  " dev_acc:", dev_acc, ' dev_loss:', dev_loss, " train_acc:", train_accuracies.avg, ' train_losses:', train_losses.avg, ' max_dev_acc ', max_dev_acc)

    print("max dev acc:", max_dev_acc, "test acc:", max_test_acc)
    return best_model
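
A minimal, hypothetical usage sketch for the train() above. Only the params keys this function itself reads are shown; CNN(**params) additionally needs whatever model hyperparameters the CNN class defines, and data_info / the datasets normally come from the repo's preprocessing pipeline, so the values below are toy stand-ins.

vocab = ["awe", "contentment", "fear"]                       # toy vocabulary
data_info = {
    "vocab": vocab,
    "idx2tag": {i: w for i, w in enumerate(vocab)},          # row i of WV_MATRIX is the vector of idx2tag[i]
    "tag2idx": {w: i for i, w in enumerate(vocab)},
}
params = {
    "GPU": 0,                        # CUDA device id
    "LEARNING_RATE": 0.1,            # Adadelta learning rate
    "Learning_SCHEDULE": [20, 40],   # MultiStepLR milestones (epochs)
    "EPOCH": 50,
    "BATCH_SIZE": 64,
    "MAX_SENT_LEN": 30,              # sentences are truncated / padded to this length
    "VOCAB_SIZE": len(vocab),        # padding index is VOCAB_SIZE + 1
    "NORM_LIMIT": 3.0,               # gradient-norm clipping threshold
    # ... plus the CNN's own hyperparameters (filter sizes, class count, dropout, ...)
}
# Each sample is (token_list, label_vector); label_vector is a soft / multi-hot target
# over the output classes, as consumed by the soft cross-entropy loss above.
train_set = [(["awe", "contentment"], [1.0, 0.0]), (["fear"], [0.0, 1.0])]
val_set = [(["fear", "awe"], [0.0, 1.0])]
# best_model = train(train_set, val_set, data_info, params)  # requires the repo-local CNN and w2v pickle
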
Example #2
def train(train_set, val_set, args_data, args_model, args_hyper):
    wordvec_type = 'selftrained'  # 'selftrained' or 'googlenews'
    wordvec_file = '/home/zwei/Dev/TextClassifications/z_implementations/pre_extract_w2v/params/' \
                   '{}_extracted_w2v_wordnet_synsets_py3.pl'.format(wordvec_type)

    print("Loading from {}".format(wordvec_type))
    pretrained_w2v = loadpickle(wordvec_file)

    # Build embedding rows only for the vocabulary that exists in the pretrained word vectors; words not found get a random init.
    wv_matrix = []
    words_not_found = []
    for i in range(len(args_data.vocab)):
        word = args_data.idx2tag[i]
        if word in pretrained_w2v:
            wv_matrix.append(pretrained_w2v[word])
        else:
            words_not_found.append(word)
            # print("{} not found in dictrionary, will use random".format(word))
            wv_matrix.append(np.random.uniform(-0.01, 0.01, args_hyper.word_dim).astype("float32"))
    print("{} words were not found".format(len(words_not_found)))
    # one for UNK and one for zero padding
    wv_matrix.append(np.random.uniform(-0.01, 0.01, args_hyper.word_dim).astype("float32"))
    wv_matrix.append(np.zeros(args_hyper.word_dim).astype("float32"))
    wv_matrix = np.array(wv_matrix)

    model = TextCNN(args_model, init_wv=wv_matrix).cuda()

    # materialize the filter into a list so it can be reused by clip_grad_norm_ below
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.Adadelta(parameters, args_hyper.learning_rate)


    scheduler = lr_scheduler.MultiStepLR(
        optimizer, [int(x) for x in args_hyper.lr_schedule.split(',')], gamma=0.1)
    max_dev_top1 = 0
    max_dev_hits = 0
    max_test_acc = 0
    best_model = None
    model.train()
    for e in range(args_hyper.epoch):
        train_set = shuffle(train_set)
        train_losses = AverageMeter()
        train_top1 = AverageMeter()
        train_hits = AverageMeter()
        scheduler.step()
        current_lr = optimizer.param_groups[0]['lr']
        model.train()

        for i in tqdm.tqdm(range(0, len(train_set), args_hyper.batch_size)):
            batch_range = min(args_hyper.batch_size, len(train_set) - i)

            # word dropout: randomly drop each word with probability drop_thre
            batch_x = []

            for sent in train_set[i:i + batch_range]:
                x_sent = sent[0]
                drop_thre = 0.2
                x_collected_words = []
                for x_word in x_sent:
                    p = random.uniform(0, 1)
                    if p >= drop_thre:
                        x_collected_words.append(args_data.tag2idx[x_word])
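                # truncate to max_len, or pad with the zero-embedding index (vocab_size + 1)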
                if len(x_collected_words) >= args_model.max_len:
                    batch_x.append(x_collected_words[:args_model.max_len])
                else:
                    batch_x.append(x_collected_words + [args_model.vocab_size + 1] * (args_model.max_len - len(x_collected_words)))

            batch_y_numpy = [c[1] for c in train_set[i:i + batch_range]]

            batch_x = Variable(torch.LongTensor(batch_x)).cuda()
            batch_y = Variable(torch.FloatTensor(batch_y_numpy)).cuda()

            optimizer.zero_grad()
            model_output = model(batch_x)
            pred = model_output[0]
            raw_feature = model_output[-1]
            trs_feature = model_output[-2]
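            # total loss: soft-target cross-entropy plus an L2 (MSE) penalty between the raw and transformed features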
            log_softmax_output = F.log_softmax(pred, dim=1)
            loss_cls = - torch.sum(log_softmax_output * batch_y) / pred.shape[0]
            loss_l2 = ((raw_feature - trs_feature) ** 2).mean()
            loss = loss_cls + loss_l2
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=args_hyper.max_norm)
            optimizer.step()
            pred_idx = np.argmax(pred.cpu().data.numpy(), axis=1)
            train_losses.update(loss.item(), batch_range)

            # multi-label metrics: top-1 accuracy and hit rate
            top1_batch, _, _ = multilabelTop1(pred_idx, batch_y_numpy)
            train_top1.update(top1_batch, len(pred_idx))

            hits_batch, _, _ = multilabelHits(pred_idx, batch_y_numpy)
            train_hits.update(hits_batch, len(pred_idx))


        dev_top1, dev_hits, dev_loss = test(val_set, model, args_data, args_model)

        if dev_top1 > max_dev_top1:
            max_dev_top1 = dev_top1
            best_model = copy.deepcopy(model)

        if dev_hits > max_dev_hits:
            max_dev_hits = dev_hits

        print('epoch: {} lr: {:.6f}, dev_top1: {:.2f}, dev_hits: {:.2f}, dev_loss: {:.2f}, '
               'train_top1: {:.2f}, train_hits: {:.2f}, train_loss:{:.2f}, max_dev_top1: {:.2f}, '
               'max_dev_hits: {:.2f}'.format(e+1, current_lr,
              dev_top1*100, dev_hits*100, dev_loss, train_top1.avg*100, train_hits.avg*100,
              train_losses.avg,  max_dev_top1*100, max_dev_hits*100))

    print("max dev top1: {:.2f},\tmax dev hits: {:.2f}".format(max_dev_top1, max_dev_hits))
    return best_model
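
A similar hypothetical sketch for the second variant, which takes three config objects instead of a flat dict. argparse.Namespace is used here purely for illustration; TextCNN, the metric helpers, and the datasets are repo-local, so this only documents the attributes the function reads.

from argparse import Namespace

vocab = ["awe", "contentment", "fear"]                       # toy vocabulary
args_data = Namespace(
    vocab=vocab,
    idx2tag={i: w for i, w in enumerate(vocab)},
    tag2idx={w: i for i, w in enumerate(vocab)},
)
args_model = Namespace(
    max_len=30,                      # truncation / padding length
    vocab_size=len(vocab),           # padding index is vocab_size + 1
    # ... plus whatever TextCNN itself requires (filter sizes, class count, ...)
)
args_hyper = Namespace(
    word_dim=300,                    # embedding size for the random / zero rows
    learning_rate=0.1,
    lr_schedule='20,40',             # comma-separated MultiStepLR milestones
    epoch=50,
    batch_size=64,
    max_norm=3.0,
)
# best_model = train(train_set, val_set, args_data, args_model, args_hyper)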