Example #1
def train(rawdata, charcounts, maxlens, unique_onehotvals):
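    """Train a contrastive encoder/decoder on the tabular data.

    One-hot and text columns are embedded, each record is encoded to a latent
    vector and decoded back, and a contrastive loss compares the latent codes of
    records, their reconstructions, and their discretized reconstructions.
    The data is split into 6 folds grouped into train / early-stopping ('es') /
    validation ('val') sets.
    """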
    mb_size = 256
    lr = 2.0e-4
    cnt = 0
    latent_dim = 32
    recurrent_hidden_size = 24

    epoch_len = 8
    max_veclen = 0.0
    patience = 12 * epoch_len
    patience_duration = 0

    # mnist = input_data.read_data_sets('../../MNIST_data', one_hot=True)

    input_dict = {}
    input_dict['discrete'] = discrete_cols
    input_dict['continuous'] = continuous_cols

    input_dict['onehot'] = {}
    for k in onehot_cols:
        dim = int(np.ceil(np.log(len(unique_onehotvals[k])) / np.log(2.0)))
        input_dict['onehot'][k] = dim

    if len(charcounts) > 0:
        text_dim = int(np.ceil(np.log(len(charcounts)) / np.log(2.0)))
        input_dict['text'] = {t: text_dim for t in text_cols}
    else:
        text_dim = 0
        input_dict['text'] = {}

    data = Dataseq(rawdata, charcounts, input_dict, unique_onehotvals, maxlens)
    data_idx = np.arange(len(data))
    np.random.shuffle(data_idx)
    n_folds = 6
    fold_size = len(data) / n_folds
    folds = [data_idx[int(i * fold_size):int((i + 1) * fold_size)] for i in range(n_folds)]

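    # Each fold group uses four folds for training, one for early stopping ('es')
    # and one for validation ('val').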
    fold_groups = {}
    fold_groups[0] = {'train': [0, 1, 2, 4], 'es': [3], 'val': [5]}
    fold_groups[1] = {'train': [0, 2, 3, 5], 'es': [1], 'val': [4]}
    fold_groups[2] = {'train': [1, 3, 4, 5], 'es': [2], 'val': [0]}
    fold_groups[3] = {'train': [0, 2, 3, 4], 'es': [5], 'val': [1]}
    fold_groups[4] = {'train': [0, 1, 3, 5], 'es': [4], 'val': [2]}
    fold_groups[5] = {'train': [1, 2, 4, 5], 'es': [0], 'val': [3]}

    for fold in range(1):  # only the first fold group is run here

        train_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['train']])))
        es_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['es']])))
        val_idx = np.array(folds[fold_groups[fold]['val'][0]])

        train = Subset(data, train_idx)
        es = Subset(data, es_idx)
        val = Subset(data, val_idx)

        kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
        train_iter = torch.utils.data.DataLoader(train, batch_size=mb_size, shuffle=True, **kwargs)
        es_iter = torch.utils.data.DataLoader(es, batch_size=mb_size, shuffle=True, **kwargs)
        val_iter = torch.utils.data.DataLoader(val, batch_size=mb_size, shuffle=True, **kwargs)

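        # Embedding tables for the one-hot columns (text columns share a single table, set up below);
        # EmbeddingToIndex is the inverse lookup, mapping a decoder output vector back to the
        # index of the nearest embedding.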
        embeddings = {}
        reverse_embeddings = {}
        onehot_embedding_weights = {}
        for k in onehot_cols:
            dim = input_dict['onehot'][k]
            onehot_embedding_weights[k] = net.get_embedding_weight(len(unique_onehotvals[k]), dim)
            if use_cuda:
                onehot_embedding_weights[k] = onehot_embedding_weights[k].cuda()
            #embeddings[k] = nn.Embedding(len(unique_onehotvals[k]), dim, max_norm=1.0)
            embeddings[k] = nn.Embedding(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k], max_norm=1.0)
            reverse_embeddings[k] = net.EmbeddingToIndex(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k])

        if text_dim > 0:
            text_embedding_weights = net.get_embedding_weight(len(charcounts) + 1, text_dim)
            if use_cuda:
                text_embedding_weights = text_embedding_weights.cuda()
            #text_embedding = nn.Embedding(len(charcounts)+1, text_dim, max_norm=1.0)
            text_embedding = nn.Embedding(len(charcounts) + 1, text_dim, _weight=text_embedding_weights, max_norm=1.0)
            text_embeddingtoindex = net.EmbeddingToIndex(len(charcounts) + 1, text_dim, _weight=text_embedding_weights)
            for k in text_cols:
                embeddings[k] = text_embedding
                reverse_embeddings[k] = text_embeddingtoindex

        enc = net.Encoder(input_dict, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)
        dec = net.Decoder(input_dict, maxlens, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)

        if use_cuda:
            embeddings = {k: embeddings[k].cuda() for k in embeddings.keys()}
            reverse_embeddings = {k: reverse_embeddings[k].cuda() for k in embeddings.keys()}
            enc.cuda()
            dec.cuda()


        #print(enc.parameters)
        #print(dec.parameters)


        contrastivec = contrastive.ContrastiveLoss(margin=margin)


        #solver = optim.RMSprop([p for em in embeddings.values() for p in em.parameters()] +  [p for p in enc.parameters()] + [p for p in dec.parameters()], lr=lr)
        solver = optim.Adam(
            [p for em in embeddings.values() for p in em.parameters()]
            + list(enc.parameters()) + list(dec.parameters()),
            lr=lr)

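        # Fixed diagnostic batch (first 128 rows) drawn from the early-stopping split.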
        Tsample = next(iter(es_iter))
        if use_cuda:
            Tsample = {col: Variable(tt[0:128]).cuda() for col, tt in Tsample.items()}
        else:
            Tsample = {col: Variable(tt[0:128]) for col, tt in Tsample.items()}

        print({col: tt[0] for col, tt in Tsample.items()})

        print('starting training')
        loss = 0.0
        for it in range(1000000):
            # X = Variable(torch.tensor(np.array([[1,2,4], [4,1,9]]))).cuda()
            T = next(iter(train_iter))  # a fresh shuffled batch each step
            if use_cuda:
                T = {col: Variable(tt).cuda() for col, tt in T.items()}
            else:
                T = {col: Variable(tt) for col, tt in T.items()}

            X = {}
            for col, tt in T.items():
                if col in embeddings.keys():
                    X[col] = embeddings[col](tt)
                else:
                    X[col] = tt.float()

            mu = enc(X)
            X2 = dec(mu)

            T2 = {}
            X2d = {col: (1.0 * tt).detach() for col, tt in X2.items()}


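            # Snap each reconstructed column back to its nearest embedding index (T2), then mix the raw
            # decoder output with the re-embedded value; X2d is the fully discretized, detached variant.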
            for col, embedding in embeddings.items():
                T2[col] = reverse_embeddings[col](X2[col])
                X2[col] = 0.5*X2[col] + 0.5*embeddings[col](T2[col])
                X2d[col] = embeddings[col](T2[col].detach())



            '''
            X2d = {col: (1.0*tt).detach() for col, tt in X2.items()}
            T2 = discretize(X2d, embeddings, maxlens)
            for col, embedding in embeddings.items():
                X2d[col] = embeddings[col](T2[col].detach())
            '''
            '''
            T2 = discretize(X2, embeddings, maxlens)
            X2d = {col: (1.0*tt).detach() for col, tt in X2.items()}

            for col, embedding in embeddings.items():
                X2[col] = embeddings[col](T2[col]) #+0.05 X2[col]
                X2d[col] = embeddings[col](T2[col].detach())
            '''


            mu2 = enc(X2).view(mb_size, -1)
            mu2d = enc(X2d).view(mb_size, -1)
            mu = mu.view(mb_size, -1)

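            # Contrastive targets: even/odd batch rows are labelled by whether the raw records match;
            # assuming a target of 1 marks a positive pair, mu is pulled towards the reconstruction
            # encoding mu2 and pushed away from the discretized-reconstruction encoding mu2d.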
            are_same = are_equal({col: x[::2] for col, x in T.items()}, {col: x[1::2] for col, x in T.items()})
            #print('f same ', torch.mean(torch.mean(are_same, 1)))
            #enc_loss = contrastivec(mu2[::2], mu2[1::2], torch.zeros(int(mb_size / 2)).cuda())
            enc_loss = contrastivec(mu[::2], mu[1::2], are_same)
            #enc_loss += 0.5*contrastivec(mu2[::2], mu2[1::2], are_same)
            #enc_loss += 0.5 * contrastivec(mu[::2], mu2[1::2], are_same)
            enc_loss += 1.0 * contrastivec(mu, mu2, torch.ones(mb_size, device=mu.device))
            enc_loss += 2.0 * contrastivec(mu, mu2d, torch.zeros(mb_size, device=mu.device))
            #enc_loss += 1.0 * contrastivec(mu2d[0::2], mu2d[1::2], torch.ones(int(mb_size/2)).cuda())
            #enc_loss += 1.0 * contrastivec(mu2d[::2], mu2d[1::2], torch.ones(int(mb_size / 2)).cuda())
            #enc_loss += 0.5 * contrastivec(mu2d[::2], mu2d[1::2], torch.ones(int(mb_size/2)).cuda())

            '''
            adotb = torch.matmul(mu, mu.permute(1, 0))  # batch_size x batch_size
            adota = torch.matmul(mu.view(-1, 1, latent_dim), mu.view(-1, latent_dim, 1))  # batch_size x 1 x 1
            diffsquares = (adota.view(-1, 1).repeat(1, mb_size) + adota.view(1, -1).repeat(mb_size, 1) - 2 * adotb) / latent_dim

            # did I f**k up something here? diffsquares can apparently be less than 0....
            mdist = torch.sqrt(torch.clamp(torch.triu(diffsquares, diagonal=1),  min=0.0))
            mdist = torch.clamp(margin - mdist, min=0.0)
            number_of_pairs = mb_size * (mb_size - 1) / 2

            enc_loss = 0.5 * torch.sum(torch.triu(torch.pow(mdist, 2), diagonal=1)) / number_of_pairs

            target = torch.ones(mu.size(0), 1)
            if use_cuda:
                target.cuda()
            enc_loss += contrastivec(mu, mu2, target.cuda())

            target = torch.zeros(mu.size(0), 1)
            if use_cuda:
                target.cuda()
            enc_loss += 2.0 * contrastivec(mu, mu2d, target.cuda())
            '''


            enc_loss.backward()
            solver.step()

            enc.zero_grad()
            dec.zero_grad()
            for col in embeddings.keys():
                embeddings[col].zero_grad()

            loss += enc_loss.data.cpu().numpy()
            veclen = torch.mean(torch.pow(mu, 2))
            if it % epoch_len == 0:
                print(it, loss/epoch_len, veclen.data.cpu().numpy()) #enc_loss.data.cpu().numpy(),

                Xsample = {}
                for col, tt in Tsample.items():
                    if col in embeddings.keys():
                        Xsample[col] = embeddings[col](tt)
                    else:
                        Xsample[col] = tt.float()

                mu = enc(Xsample)
                X2sample = dec(mu)
                X2sampled = {col: tt.detach() for col, tt in X2sample.items()}
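                # discretize() maps the decoded continuous representations back to discrete indices per column.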
                T2sample = discretize(X2sample, embeddings, maxlens)

                mu2 = enc(X2sample)
                mu2d = enc(X2sampled)


                if 'Fare' in continuous_cols and 'Age' in continuous_cols:
                    print([np.mean(np.abs(Xsample[col].data.cpu().numpy()-X2sample[col].data.cpu().numpy())) for col in ['Fare', 'Age']])

                print({col: tt[0:2].data.cpu().numpy() for col, tt in T2sample.items()})

                if 'Survived' in onehot_cols:
                    print('% survived correct: ', np.mean(T2sample['Survived'].data.cpu().numpy()==Tsample['Survived'].data.cpu().numpy()), np.mean(Tsample['Survived'].data.cpu().numpy()==np.ones_like(Tsample['Survived'].data.cpu().numpy())))

                if 'Cabin' in text_cols:
                    print(embeddings['Cabin'].weight[data.charindex['1']])



                are_same = are_equal({col: x[::2] for col, x in Tsample.items()}, {col: x[1::2] for col, x in Tsample.items()})
                # print('f same ', torch.mean(torch.mean(are_same, 1)))
                # enc_loss = contrastivec(mu2[::2], mu2[1::2], torch.zeros(int(mb_size / 2)).cuda())
                #es_loss = contrastivec(mu[::2], mu[1::2], are_same)
                # enc_loss += 0.25*contrastivec(mu2[::2], mu2[1::2], are_same)
                # enc_loss += 0.5 * contrastivec(mu[::2], mu2[1::2], are_same)
                es_loss = 1.0 * contrastivec(mu, mu2, torch.ones(mu.size(0), device=mu.device))
                #es_loss += 2.0 * contrastivec(mu, mu2d, torch.zeros(mu.size(0)).cuda())

                #print('mean mu ', torch.mean(torch.pow(mu, 2)))
                print('es loss ', es_loss)

                loss = 0.0
                #print(T2.data.cpu()[0, 0:30].numpy())
Example #2
def do_train(rawdata, charcounts, maxlens, unique_onehotvals):
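    """Variant of train(): uses RMSprop with momentum and a GaussianOverlap loss,
    moves the encode/decode round trip into the calc_mus/calc_losses helpers, and
    logs per-epoch train and early-stopping metrics to a CSV file.
    """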
    n_batches = 2000
    mb_size = 128
    lr = 2.0e-4
    momentum = 0.5
    cnt = 0
    latent_dim = 32  # alternative: 24
    recurrent_hidden_size = 24

    epoch_len = 8
    max_veclen = 0.0
    patience = 12 * epoch_len
    patience_duration = 0

    # mnist = input_data.read_data_sets('../../MNIST_data', one_hot=True)

    input_dict = {}
    input_dict['discrete'] = discrete_cols
    input_dict['continuous'] = continuous_cols

    input_dict['onehot'] = {}
    for k in onehot_cols:
        dim = int(np.ceil(np.log(len(unique_onehotvals[k])) / np.log(2.0)))
        input_dict['onehot'][k] = dim

    if len(charcounts) > 0:
        text_dim = int(np.ceil(np.log(len(charcounts)) / np.log(2.0)))
        input_dict['text'] = {t: text_dim for t in text_cols}
    else:
        text_dim = 0
        input_dict['text'] = {}

    data = Dataseq(rawdata, charcounts, input_dict, unique_onehotvals, maxlens)
    data_idx = np.arange(len(data))
    np.random.shuffle(data_idx)
    n_folds = 6
    fold_size = len(data) / n_folds
    folds = [data_idx[int(i * fold_size):int((i + 1) * fold_size)] for i in range(n_folds)]

    fold_groups = {}
    fold_groups[0] = {'train': [0, 1, 2, 4], 'es': [3], 'val': [5]}
    fold_groups[1] = {'train': [0, 2, 3, 5], 'es': [1], 'val': [4]}
    fold_groups[2] = {'train': [1, 3, 4, 5], 'es': [2], 'val': [0]}
    fold_groups[3] = {'train': [0, 2, 3, 4], 'es': [5], 'val': [1]}
    fold_groups[4] = {'train': [0, 1, 3, 5], 'es': [4], 'val': [2]}
    fold_groups[5] = {'train': [1, 2, 4, 5], 'es': [0], 'val': [3]}

    for fold in range(1):  # only the first fold group is run here

        train_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['train']])))
        es_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['es']])))
        val_idx = np.array(folds[fold_groups[fold]['val'][0]])

        train = Subset(data, train_idx)
        es = Subset(data, es_idx)
        val = Subset(data, val_idx)

        kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
        train_iter = torch.utils.data.DataLoader(train, batch_size=mb_size, shuffle=True, **kwargs)
        train_iter_unshuffled = torch.utils.data.DataLoader(train, batch_size=mb_size, shuffle=False, **kwargs)
        es_iter = torch.utils.data.DataLoader(es, batch_size=mb_size, shuffle=True, **kwargs)
        val_iter = torch.utils.data.DataLoader(val, batch_size=mb_size, shuffle=True, **kwargs)

        embeddings = {}
        reverse_embeddings = {}
        onehot_embedding_weights = {}
        onehot_embedding_spread = {}
        for k in onehot_cols:
            dim = input_dict['onehot'][k]
            onehot_embedding_weights[k] = net.get_embedding_weight(len(unique_onehotvals[k]), dim, use_cuda=use_cuda)
            embeddings[k] = nn.Embedding(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k])
            reverse_embeddings[k] = net.EmbeddingToIndex(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k])

        if text_dim > 0:
            text_embedding_weights = net.get_embedding_weight(len(charcounts) + 1, text_dim, use_cuda=use_cuda)
            text_embedding = nn.Embedding(len(charcounts) + 1, text_dim, _weight=text_embedding_weights)
            text_embeddingtoindex = net.EmbeddingToIndex(len(charcounts) + 1, text_dim, _weight=text_embedding_weights)
            for k in text_cols:
                embeddings[k] = text_embedding
                reverse_embeddings[k] = text_embeddingtoindex

        enc = net.Encoder(input_dict, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)
        dec = net.Decoder(input_dict, maxlens, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)

        if use_cuda:
            embeddings = {k: embeddings[k].cuda() for k in embeddings.keys()}
            enc.cuda()
            dec.cuda()


        #print(enc.parameters)
        #print(dec.parameters)


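        # GaussianOverlap presumably scores the overlap of Gaussians parameterized by (mu, logvar);
        # it replaces the margin-based ContrastiveLoss used in train() above.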
        #contrastivec = contrastive.ContrastiveLoss(margin=margin)
        logloss = contrastive.GaussianOverlap()


        #solver = optim.RMSprop([p for em in embeddings.values() for p in em.parameters()] +  [p for p in enc.parameters()] + [p for p in dec.parameters()], lr=lr)
        #solver = optim.Adam(
        #    [p for em in embeddings.values() for p in em.parameters()] + [p for p in enc.parameters()] + [p for p in
        #                                                                                                  dec.parameters()],
        #    lr=lr)

        solver = optim.RMSprop(
            [p for em in embeddings.values() for p in em.parameters()]
            + list(enc.parameters()) + list(dec.parameters()),
            lr=lr, momentum=momentum)

        Tsample = next(iter(es_iter))
        if use_cuda:
            Tsample = {col: Variable(tt).cuda() for col, tt in Tsample.items()}
        else:
            Tsample = {col: Variable(tt) for col, tt in Tsample.items()}

        print({col: tt[0] for col, tt in Tsample.items()})

        print('starting training')
        loss = 0.0
        loss0 = 0.0
        loss1 = 0.0
        loss2 = 0.0
        loss3 = 0.0

        logger_df = pd.DataFrame(columns=['iter', 'train_loss', 'train_veclen', 'es_veclen', 'Survived_correct', 'Survived_false'])

        for it in range(n_batches):
            # X = Variable(torch.tensor(np.array([[1,2,4], [4,1,9]]))).cuda()
            T = next(iter(train_iter))
            #for col, val in T.items():
            #    T[col] = torch.cat((val, val, val, val), 0)

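            # calc_mus runs the encode/decode/re-encode round trip; calc_losses combines the
            # resulting latent statistics into the total encoder loss and its components.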
            T, X, X2, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d = calc_mus(T, embeddings, reverse_embeddings, enc, dec)
            enc_loss, enc_loss0, enc_loss1, enc_loss2, enc_loss3 = calc_losses(T, embeddings, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d, logloss)

            enc_loss.backward()
            solver.step()

            enc.zero_grad()
            dec.zero_grad()
            for col in embeddings.keys():
                embeddings[col].zero_grad()

            loss += enc_loss.data.cpu().numpy()
            loss0 += enc_loss0.data.cpu().numpy()
            loss1 += enc_loss1.data.cpu().numpy()
            loss2 += enc_loss2.data.cpu().numpy()
            loss3 += enc_loss3.data.cpu().numpy()
            veclen = torch.mean(torch.pow(mu, 2))
            if it % epoch_len == 0:
                print(it, loss/epoch_len, loss0/epoch_len, loss1/epoch_len, loss2/epoch_len, loss3/epoch_len, veclen.data.cpu().numpy()) #enc_loss.data.cpu().numpy(),


                if use_cuda:
                    mu = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar = torch.zeros(len(train), mu.size(1)).cuda()
                    mu2 = torch.zeros(len(train), mu.size(1)).cuda()
                    mu2d = torch.zeros(len(train), mu.size(1)).cuda()
                    mu_tm = torch.zeros((len(train),) + mu_tm.size()[1:]).cuda()
                    logvar2 = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar2d = torch.zeros(len(train), mu.size(1)).cuda()
                else:
                    mu = torch.zeros(len(train), mu.size(1))
                    logvar = torch.zeros(len(train), mu.size(1))
                    mu2 = torch.zeros(len(train), mu.size(1))
                    mu2d = torch.zeros(len(train), mu.size(1))
                    mu_tm = torch.zeros((len(train),) + mu_tm.size()[1:])
                    logvar2 = torch.zeros(len(train), mu.size(1))
                    logvar2d = torch.zeros(len(train), mu.size(1))

                s = 0
                for T0 in train_iter_unshuffled:
                    e = s + T0[to_predict[0]].size(0)
                    if s == 0:
                        T = {col: torch.zeros((len(train),) + v.size()[1:], dtype=v.dtype) for col, v in T0.items()}

                    T0, _, _, mu[s:e], logvar[s:e], mu2[s:e], mu2d[s:e], mu_tm[s:e], logvar2[s:e], logvar2d[s:e] = calc_mus(T0, embeddings, reverse_embeddings, enc, dec, mode='val')
                    for col, val in T0.items():
                        T[col][s:e] = T0[col]

                    s = e

                enc_loss, enc_loss0, enc_loss1, enc_loss2, enc_loss3 = calc_losses(T, embeddings, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d, logloss, lookfordups=False)
                vl = torch.mean(torch.pow(mu, 2))

                print(f'train enc loss {enc_loss}')
                print(f'train veclen {vl}')
                print(f'mean train logvar {torch.mean(logvar)}')
                logger_df.loc[int(it/epoch_len), ['iter', 'train_loss', 'train_veclen']] = [it, enc_loss.data.cpu().numpy(), vl.data.cpu().numpy()]

                if use_cuda:
                    mu = torch.zeros(len(es), mu.size(1)).cuda()
                    logvar = torch.zeros(len(es), mu.size(1)).cuda()
                    mu2 = torch.zeros(len(es), mu.size(1)).cuda()
                    mu2d = torch.zeros(len(es), mu.size(1)).cuda()
                else:
                    mu = torch.zeros(len(es), mu.size(1))
                    logvar = torch.zeros(len(es), mu.size(1))
                    mu2 = torch.zeros(len(es), mu.size(1))
                    mu2d = torch.zeros(len(es), mu.size(1))

                s = 0
                targets = {}
                for T0 in es_iter:
                    e = s + T0[to_predict[0]].size(0)
                    if s == 0:
                        T = {col: torch.zeros((len(es),) + v.size()[1:], dtype=v.dtype) for col, v in T0.items()}
                        correct = {col: np.zeros((len(es),) + v.size()[1:]) for col, v in T0.items()}
                        actual = {col: np.zeros((len(es),) + v.size()[1:]) for col, v in T0.items()}

                    Xsample = {}
                    for col, tt in T0.items():
                        if use_cuda:
                            tt = Variable(tt).cuda()
                        else:
                            tt = Variable(tt)

                        if col in embeddings.keys():
                            Xsample[col] = embeddings[col](tt)
                        else:
                            Xsample[col] = tt.float()

                        # keep the true values of the prediction target(s) and mask them in the input
                        if col in to_predict:
                            targets[col] = tt
                            Xsample[col] = 0.0 * Xsample[col]


                    mu[s:e], logvar[s:e] = enc(Xsample)

                    X2sample = dec(mu[s:e])
                    T2sample = discretize(X2sample, embeddings, maxlens)

                    mu2[s:e], _ = enc(X2sample)


                    T2 = {}
                    X2dsample = {col: (1.0 * tt).detach() for col, tt in X2sample.items()}
                    for col in continuous_cols:
                        if col in to_predict:
                            correct[col][s:e] = np.abs(X2sample[col].data.cpu().numpy().reshape(-1) - targets[
                                col].data.cpu().numpy().reshape(-1))
                            actual[col][s:e] = targets[col].data.cpu().numpy().reshape(-1)
                        else:
                            correct[col][s:e] = np.abs(X2sample[col].data.cpu().numpy().reshape(-1) - T0[
                                col].data.cpu().numpy().reshape(-1))
                            actual[col][s:e] = T0[col].data.cpu().numpy().reshape(-1)


                    for col, embedding in embeddings.items():
                        # T2[col] = reverse_embeddings[col](X2sample[col])
                        X2dsample[col] = embeddings[col](T2sample[col].detach())

                        if col in to_predict:
                            correct[col][s:e] = np.abs(T2sample[col].data.cpu().numpy() == targets[col].data.cpu().numpy())
                            actual[col][s:e] = targets[col].data.cpu().numpy().reshape(-1)
                        else:
                            correct[col][s:e] = np.abs(T2sample[col].data.cpu().numpy() == T0[col].data.cpu().numpy())
                            actual[col][s:e] = T0[col].data.cpu().numpy().reshape(-1)

                    mu2d[s:e], _ = enc(X2dsample)

                    s = e

                #enc_loss, enc_loss0, enc_loss1, enc_loss3, enc_loss3 = calc_losses(T, embeddings, mu, logvar, mu2, mu2d, mu_tm, logvar2, logloss, lookfordups=False)
                #print(f'es enc loss {enc_loss}')
                vl = torch.mean(torch.pow(mu, 2))

                print(f'es veclen {vl}')
                print(f'mean es logvar {torch.mean(logvar)}')
                logger_df.loc[int(it/epoch_len), ['es_veclen', 'Survived_correct', 'Survived_false']] = vl.data.cpu().numpy(), np.mean(correct['Survived']), np.mean(actual['Survived']==0)


                for col in continuous_cols:
                    #print(np.abs(T0[col].data.cpu().numpy().reshape(-1) - T2sample[col].data.cpu().numpy().reshape(-1)))
                    print(f'% {col} mae: {np.mean(correct[col])}')

                for col in onehot_cols:
                    print(f'% {col} correct: {np.mean(correct[col])} {np.mean(actual[col]==0)}')



                '''
                for col in continuous_cols:
                    mae = np.mean(np.abs(X[col].data.cpu().numpy() - X2[col].data.cpu().numpy()))
                    mse = np.mean(np.square(X[col].data.cpu().numpy() - X2[col].data.cpu().numpy()))
                    print(f'train mae, mse {col} {mae} {mse}')
                    mae = np.mean(np.abs(Xsample[col].data.cpu().numpy() - X2sample[col].data.cpu().numpy()))
                    mse = np.mean(np.square(Xsample[col].data.cpu().numpy() - X2sample[col].data.cpu().numpy()))
                    print(f'val mae, mse {col} {mae} {mse}')

                print({col: tt[0:2].data.cpu().numpy() for col, tt in T2sample.items()})

                if 'Survived' in onehot_cols:
                    print('% survived correct: ', np.mean(T2sample['Survived'].data.cpu().numpy()==Tsample['Survived'].data.cpu().numpy()), np.mean(Tsample['Survived'].data.cpu().numpy()==np.ones_like(Tsample['Survived'].data.cpu().numpy())))

                if 'Sex' in onehot_cols:
                    print('% sex correct: ', np.mean(T2sample['Sex'].data.cpu().numpy()==Tsample['Sex'].data.cpu().numpy()), np.mean(Tsample['Sex'].data.cpu().numpy()==np.ones_like(Tsample['Sex'].data.cpu().numpy())))

                if 'Embarked' in onehot_cols:
                    print('% Embarked correct: ', np.mean(T2sample['Embarked'].data.cpu().numpy()==Tsample['Embarked'].data.cpu().numpy()) )
                    print(onehot_embedding_weights['Embarked'])

                if 'Pclass' in onehot_cols:
                    print('% Pclass correct: ',
                          np.mean(T2sample['Pclass'].data.cpu().numpy() == Tsample['Pclass'].data.cpu().numpy()))

                if 'Cabin' in text_cols:
                    print(embeddings['Cabin'].weight[data.charindex['1']])

                
                if 'Pclass' in onehot_cols:
                    diff = torch.mean(torch.pow(embeddings['Pclass'].weight - reverse_embeddings['Pclass'].weight, 2)).data.cpu().numpy()
                    print(f'diff plcass emb and reverse_emb: {diff}')
                    print(embeddings['Pclass'].weight.data.cpu().numpy())
                '''





                loss = 0.0
                loss0 = 0.0
                loss1 = 0.0
                loss2 = 0.0
                loss3 = 0.0
                #print(T2.data.cpu()[0, 0:30].numpy())

        logger_df.to_csv('logger_'+str(fold)+'.csv', index=False)
Example #3
def do_train(rawdata, charcounts, maxlens, unique_onehotvals):
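    """Variant of do_train() that also scores target predictions: the to_predict
    column(s) are masked, each record is re-encoded once per candidate target
    value (mu_tm), class probabilities are derived from the latent distances,
    and train/validation loss and accuracy are logged per fold.
    """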
    train_f_labeled = 0.2
    n_batches = 2800
    mb_size = 128
    lr = 2.0e-4
    momentum = 0.5
    cnt = 0
    latent_dim = 32  # alternative: 24
    recurrent_hidden_size = 24

    epoch_len = 8
    max_veclen = 0.0
    patience = 12 * epoch_len
    patience_duration = 0

    input_dict = {}
    input_dict['discrete'] = discrete_cols
    input_dict['continuous'] = continuous_cols

    input_dict['onehot'] = {}
    for k in onehot_cols:
        dim = int(np.ceil(np.log(len(unique_onehotvals[k])) / np.log(2.0)))
        input_dict['onehot'][k] = dim

    if len(charcounts) > 0:
        text_dim = int(np.ceil(np.log(len(charcounts)) / np.log(2.0)))
        input_dict['text'] = {t: text_dim for t in text_cols}
    else:
        text_dim = 0
        input_dict['text'] = {}

    #data = Dataseq(rawdata, charcounts, input_dict, unique_onehotvals, maxlens)
    #data_idx = np.arange(data.__len__())
    data_idx = np.arange(rawdata.shape[0])
    np.random.shuffle(data_idx)
    n_folds = 6
    fold_size = 1.0 * rawdata.shape[0] / n_folds  #data.__len__() / n_folds
    folds = [
        data_idx[int(i * fold_size):int((i + 1) * fold_size)]
        for i in range(n_folds)
    ]

    fold_groups = {}
    fold_groups[0] = {'train': [0, 1, 2, 3], 'val': [4]}
    fold_groups[1] = {'train': [1, 2, 3, 4], 'val': [0]}
    fold_groups[2] = {'train': [0, 2, 3, 4], 'val': [1]}
    fold_groups[3] = {'train': [0, 1, 3, 4], 'val': [2]}
    fold_groups[4] = {'train': [0, 1, 2, 4], 'val': [3]}
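    # Note: six folds are created above, but these groups only reference folds 0-4, so fold 5 is never used.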

    for fold in range(1):  # only the first fold group is run here

        train_idx = np.array(
            list(
                itertools.chain.from_iterable(
                    [folds[i] for i in fold_groups[fold]['train']])))
        val_idx = np.array(
            list(
                itertools.chain.from_iterable(
                    [folds[i] for i in fold_groups[fold]['val']])))

        np.random.shuffle(train_idx)
        # labeled/unlabeled split (computed here, but not used further below)
        train_labeled_idx = train_idx[0:int(train_f_labeled * len(train_idx))]
        train_unlabeled_idx = train_idx[int(train_f_labeled * len(train_idx)):]

        data = Dataseq(rawdata, charcounts, input_dict, unique_onehotvals,
                       maxlens)
        train = Subset(data, train_idx)
        val = Subset(data, val_idx)

        kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
        train_iter = torch.utils.data.DataLoader(train,
                                                 batch_size=mb_size,
                                                 shuffle=True,
                                                 **kwargs)
        train_iter_unshuffled = torch.utils.data.DataLoader(train,
                                                            batch_size=mb_size,
                                                            shuffle=False,
                                                            **kwargs)
        val_iter = torch.utils.data.DataLoader(val,
                                               batch_size=mb_size,
                                               shuffle=False,
                                               **kwargs)

        embeddings = {}
        reverse_embeddings = {}
        onehot_embedding_weights = {}
        for k in onehot_cols:
            dim = input_dict['onehot'][k]
            onehot_embedding_weights[k] = net.get_embedding_weight(
                len(unique_onehotvals[k]), dim, use_cuda=use_cuda)
            embeddings[k] = nn.Embedding(len(unique_onehotvals[k]),
                                         dim,
                                         _weight=onehot_embedding_weights[k])
            reverse_embeddings[k] = net.EmbeddingToIndex(
                len(unique_onehotvals[k]),
                dim,
                _weight=onehot_embedding_weights[k])

        if text_dim > 0:
            text_embedding_weights = net.get_embedding_weight(
                len(charcounts) + 1, text_dim, use_cuda=use_cuda)
            text_embedding = nn.Embedding(len(charcounts) + 1,
                                          text_dim,
                                          _weight=text_embedding_weights)
            text_embeddingtoindex = net.EmbeddingToIndex(
                len(charcounts) + 1, text_dim, _weight=text_embedding_weights)
            for k in text_cols:
                embeddings[k] = text_embedding
                reverse_embeddings[k] = text_embeddingtoindex

        enc = net.Encoder(input_dict,
                          dim=latent_dim,
                          recurrent_hidden_size=recurrent_hidden_size)
        dec = net.Decoder(input_dict,
                          maxlens,
                          dim=latent_dim,
                          recurrent_hidden_size=recurrent_hidden_size)

        if use_cuda:
            embeddings = {k: embeddings[k].cuda() for k in embeddings.keys()}
            enc.cuda()
            dec.cuda()

        logloss = contrastive.GaussianOverlap()

        solver = optim.RMSprop(
            [p for em in embeddings.values() for p in em.parameters()] +
            [p for p in enc.parameters()] + [p for p in dec.parameters()],
            lr=lr,
            momentum=momentum)

        print('starting training')
        loss = 0.0
        loss0 = 0.0
        loss1 = 0.0
        loss2 = 0.0
        loss3 = 0.0

        logger_df = pd.DataFrame(columns=[
            'iter', 'train_loss', 'train_veclen', 'val_veclen', 'val_loss',
            'val_acc'
        ] + [t + '_correct'
             for t in to_predict] + [t + '_false' for t in to_predict])

        for it in range(n_batches):
            T = next(iter(train_iter))
            # for col, value in T.items():
            #    T[col] = torch.cat((value, value, value, value), 0)

            T, X, X2, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d, logvar_tm = calc_mus(
                T, embeddings, reverse_embeddings, enc, dec)
            enc_loss, enc_loss0, enc_loss1, enc_loss2, enc_loss3 = calc_losses(
                T, embeddings, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d,
                logvar_tm, logloss)

            enc_loss.backward()
            solver.step()

            enc.zero_grad()
            dec.zero_grad()
            for col in embeddings.keys():
                embeddings[col].zero_grad()

            loss += enc_loss.data.cpu().numpy()
            loss0 += enc_loss0.data.cpu().numpy()
            loss1 += enc_loss1.data.cpu().numpy()
            loss2 += enc_loss2.data.cpu().numpy()
            loss3 += enc_loss3.data.cpu().numpy()
            veclen = torch.mean(torch.pow(mu, 2))
            if it % epoch_len == 0:
                print(
                    it, loss / epoch_len, loss0 / epoch_len, loss1 / epoch_len,
                    loss2 / epoch_len, loss3 / epoch_len,
                    veclen.data.cpu().numpy())  # enc_loss.data.cpu().numpy(),

                n_targetvals = embeddings[to_predict[0]].weight.size(0)
                if use_cuda:
                    mu = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar = torch.zeros(len(train), mu.size(1)).cuda()
                    mu2 = torch.zeros(len(train), mu.size(1)).cuda()
                    mu2d = torch.zeros(len(train), mu.size(1)).cuda()
                    mu_tm = torch.zeros((len(train), ) +
                                        mu_tm.size()[1:]).cuda()
                    logvar2 = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar2d = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar_tm = torch.zeros(len(train), 1 + n_targetvals,
                                            mu.size(1)).cuda()
                    train_loss = torch.zeros(len(train)).cuda()
                else:
                    mu = torch.zeros(len(train), mu.size(1))
                    logvar = torch.zeros(len(train), mu.size(1))
                    mu2 = torch.zeros(len(train), mu.size(1))
                    mu2d = torch.zeros(len(train), mu.size(1))
                    mu_tm = torch.zeros((len(train), ) + mu_tm.size()[1:])
                    logvar2 = torch.zeros(len(train), mu.size(1))
                    logvar2d = torch.zeros(len(train), mu.size(1))
                    logvar_tm = torch.zeros(len(train), 1 + n_targetvals,
                                            mu.size(1))
                    train_loss = torch.zeros(len(train))

                s = 0
                for T0 in train_iter_unshuffled:
                    e = s + T0[to_predict[0]].size(0)
                    if s == 0:
                        T = {
                            col: torch.zeros((len(train), ) + value.size()[1:],
                                             dtype=value.dtype)
                            for col, value in T0.items()
                        }

                    T0, Xsample, _, mu[s:e], logvar[s:e], mu2[s:e], mu2d[
                        s:e], mu_tm[s:e], logvar2[s:e], logvar2d[
                            s:e], logvar_tm[s:e] = calc_mus(T0,
                                                            embeddings,
                                                            reverse_embeddings,
                                                            enc,
                                                            dec,
                                                            mode='val')
                    for col, value in T0.items():
                        T[col][s:e] = T0[col]

                    n_targetvals = embeddings[to_predict[0]].weight.size(0)
                    mu_tm[s:e, 0, :] = 1.0 * mu[s:e]
                    if use_cuda:
                        p = torch.zeros((e - s), n_targetvals).cuda()
                    else:
                        p = torch.zeros((e - s), n_targetvals)

                    # encodings for all the possible target embedding values
                    for i in range(n_targetvals):
                        if use_cuda:
                            t = {
                                col: Xsample[col]
                                if not col in to_predict else embeddings[col](
                                    i * torch.ones_like(T0[col]).cuda())
                                for col in Xsample.keys()
                            }
                            mu_tm[s:e, i + 1, :], _ = enc(t)
                        else:
                            mu_tm[s:e, i + 1, :], _ = enc({
                                col: Xsample[col] if not col in to_predict else
                                embeddings[col](i * torch.ones_like(T0[col]))
                                for col in Xsample.keys()
                            })
                        diffsquares = torch.sqrt(
                            torch.mean(
                                torch.pow(
                                    mu_tm[s:e, 0, :] - mu_tm[s:e, i + 1, :],
                                    2), 1))
                        p[:, i] = 1.0 - torch.abs(torch.erf(diffsquares / 2.0))

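                    # One-hot encode the true label; p holds the per-candidate overlap scores, which are
                    # normalized below and scored with a binary cross-entropy style per-record loss.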
                    labels = T0[to_predict[0]]
                    target = torch.zeros(e - s, n_targetvals)
                    target[torch.arange(e - s), labels] = 1
                    if use_cuda:
                        target = target.cuda()

                    #print(target[0:5])
                    #print(p[0:5])
                    p = p / torch.sum(p, 1).view(-1, 1).repeat(1, n_targetvals)

                    train_loss[s:e] += -torch.mean(
                        target * torch.log(torch.clamp(p, 1e-8, 1.0)) +
                        (1 - target) *
                        torch.log(torch.clamp(1 - p, 1e-8, 1.0)), 1)

                    s = e

                enc_loss, enc_loss0, enc_loss1, enc_loss2, enc_loss3 = calc_losses(
                    T,
                    embeddings,
                    mu,
                    logvar,
                    mu2,
                    mu2d,
                    mu_tm,
                    logvar2,
                    logvar2d,
                    logvar_tm,
                    logloss,
                    lookfordups=False)
                vl = torch.mean(torch.pow(mu, 2))

                print(f'train enc loss {enc_loss}')
                print(f'train veclen {vl}')
                print(f'mean train logvar {torch.mean(logvar)}')
                print(f'mean train_loss {torch.mean(train_loss)}')
                logger_df.loc[
                    int(it / epoch_len),
                    ['iter', 'train_veclen', 'train_loss']] = [
                        it,
                        vl.data.cpu().numpy(),
                        torch.mean(train_loss).data.cpu().numpy()
                    ]

                if use_cuda:
                    mu = torch.zeros(len(val), mu.size(1)).cuda()
                    logvar = torch.zeros(len(val), mu.size(1)).cuda()
                    mu2 = torch.zeros(len(val), mu.size(1)).cuda()
                    mu2d = torch.zeros(len(val), mu.size(1)).cuda()
                    n_targetvals = embeddings[to_predict[0]].weight.size(0)
                    mu_tm = torch.zeros(len(val), 1 + n_targetvals,
                                        mu.size(1)).cuda()
                    val_loss = torch.zeros(len(val)).cuda()
                    val_accuracy = torch.zeros(len(val)).cuda()
                else:
                    mu = torch.zeros(len(val), mu.size(1))
                    logvar = torch.zeros(len(val), mu.size(1))
                    mu2 = torch.zeros(len(val), mu.size(1))
                    mu2d = torch.zeros(len(val), mu.size(1))
                    n_targetvals = embeddings[to_predict[0]].weight.size(0)
                    mu_tm = torch.zeros(len(val), 1 + n_targetvals, mu.size(1))
                    val_loss = torch.zeros(len(val))
                    val_accuracy = torch.zeros(len(val))

                s = 0
                targets = {}
                for T0 in val_iter:
                    e = s + T0[to_predict[0]].size(0)
                    print(s, e)

                    if s == 0:
                        correct = {
                            col: np.zeros((len(val), ) + v.size()[1:])
                            for col, v in T0.items()
                        }
                        actual = {
                            col: np.zeros((len(val), ) + v.size()[1:])
                            for col, v in T0.items()
                        }

                    Xsample = {}
                    for col, tt in T0.items():
                        if use_cuda:
                            tt = Variable(tt).cuda()
                        else:
                            tt = Variable(tt)

                        if col in embeddings.keys():
                            Xsample[col] = embeddings[col](tt)
                        else:
                            Xsample[col] = tt.float()

                        if col in to_predict:
                            targets[col] = tt
                            Xsample[col] = 0.0 * Xsample[col]

                    mu[s:e], logvar[s:e] = enc(Xsample)

                    X2sample = dec(mu[s:e])
                    T2sample = discretize(X2sample, embeddings, maxlens)

                    mu2[s:e], _ = enc(X2sample)

                    X2dsample = {
                        col: (1.0 * tt).detach()
                        for col, tt in X2sample.items()
                    }
                    for col in continuous_cols:
                        if col in to_predict:
                            correct[col][s:e] = np.abs(
                                X2sample[col].data.cpu().numpy().reshape(-1) -
                                targets[col].data.cpu().numpy().reshape(-1))
                            actual[col][s:e] = targets[col].data.cpu().numpy(
                            ).reshape(-1)
                        else:
                            correct[col][s:e] = np.abs(
                                X2sample[col].data.cpu().numpy().reshape(-1) -
                                T0[col].data.cpu().numpy().reshape(-1))
                            actual[col][s:e] = T0[col].data.cpu().numpy(
                            ).reshape(-1)

                    for col, embedding in embeddings.items():
                        # T2[col] = reverse_embeddings[col](X2sample[col])
                        X2dsample[col] = embeddings[col](
                            T2sample[col].detach())

                        if col in to_predict:
                            correct[col][s:e] = np.abs(T2sample[col].data.cpu(
                            ).numpy() == targets[col].data.cpu().numpy())
                            actual[col][s:e] = targets[col].data.cpu().numpy(
                            ).reshape(-1)
                        else:
                            correct[col][s:e] = np.abs(T2sample[col].data.cpu(
                            ).numpy() == T0[col].data.cpu().numpy())
                            actual[col][s:e] = T0[col].data.cpu().numpy(
                            ).reshape(-1)

                    mu2d[s:e], _ = enc(X2dsample)
                    # calculate target predictions for the validation data

                    n_targetvals = embeddings[to_predict[0]].weight.size(0)
                    mu_tm[s:e, 0, :] = 1.0 * mu[s:e]
                    if use_cuda:
                        p = torch.zeros((e - s), n_targetvals).cuda()
                    else:
                        p = torch.zeros((e - s), n_targetvals)

                    # generate encodings for all the possible target embedding values
                    for i in range(n_targetvals):
                        if use_cuda:
                            t = {
                                col: Xsample[col]
                                if not col in to_predict else embeddings[col](
                                    i * torch.ones_like(T0[col]).cuda())
                                for col in Xsample.keys()
                            }
                            mu_tm[s:e, i + 1, :], _ = enc(t)
                        else:
                            mu_tm[s:e, i + 1, :], _ = enc({
                                col: Xsample[col] if not col in to_predict else
                                embeddings[col](i * torch.ones_like(T0[col]))
                                for col in Xsample.keys()
                            })
                        diffsquares = torch.sqrt(
                            torch.mean(
                                torch.pow(
                                    mu_tm[s:e, 0, :] - mu_tm[s:e, i + 1, :],
                                    2), 1))
                        p[:, i] = 1.0 - torch.abs(torch.erf(diffsquares / 2.0))

                        #print(mu_tm[s:s+5, i + 1, 0:5])
                        print(diffsquares[0:5])

                    labels = T0[to_predict[0]]
                    target = torch.zeros(e - s, n_targetvals)
                    target[torch.arange(e - s), labels] = 1
                    if use_cuda:
                        target = target.cuda()
                        labels = labels.cuda()

                    p = p / torch.sum(p, 1).view(-1, 1).repeat(1, n_targetvals)
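                    # Predicted class = candidate target value whose encoding is closest to the
                    # masked record's encoding (largest normalized score).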
                    val_accuracy[s:e] = torch.eq(labels,
                                                 torch.max(p, 1)[1]).float()

                    val_loss[s:e] += -torch.mean(
                        target * torch.log(torch.clamp(p, 1e-8, 1.0)) +
                        (1 - target) *
                        torch.log(torch.clamp(1 - p, 1e-8, 1.0)), 1)

                    s = e

                vl = torch.mean(torch.pow(mu, 2))

                print(f'val veclen {vl}')
                print(f'mean val logvar {torch.mean(logvar)}')
                print(f'mean val_loss {torch.mean(val_loss)}')
                print(f'mean val_accuracy {torch.mean(val_accuracy)}')

                logger_df.loc[
                    int(it / epoch_len),
                    ['val_veclen', 'val_loss', 'val_acc']] = [
                        vl.data.cpu().numpy(),
                        torch.mean(val_loss).data.cpu().numpy(),
                        torch.mean(val_accuracy).data.cpu().numpy()
                    ]
                for target_col in to_predict:
                    logger_df.loc[
                        int(it / epoch_len),
                        [target_col + '_correct', target_col +
                         '_false']] = np.mean(correct[target_col]), np.mean(
                             actual[target_col] == 0)

                for col in continuous_cols:
                    # print(np.abs(T0[col].data.cpu().numpy().reshape(-1) - T2sample[col].data.cpu().numpy().reshape(-1)))
                    print(f'% {col} mae: {np.mean(correct[col])}')

                for col in onehot_cols:
                    print(
                        f'% {col} correct: {np.mean(correct[col])} {np.mean(actual[col]==0)}'
                    )

                loss = 0.0
                loss0 = 0.0
                loss1 = 0.0
                loss2 = 0.0
                loss3 = 0.0
                # print(T2.data.cpu()[0, 0:30].numpy())

        logger_df.to_csv('logger_' + str(fold) + '.csv', index=False)