def train(rawdata, charcounts, maxlens, unique_onehotvals):
    """Train embeddings, encoder and decoder with a pairwise contrastive objective on one fold split."""
    mb_size = 256
    lr = 2.0e-4
    cnt = 0
    latent_dim = 32
    recurrent_hidden_size = 24

    epoch_len = 8
    max_veclen = 0.0
    patience = 12 * epoch_len
    patience_duration = 0

    # mnist = input_data.read_data_sets('../../MNIST_data', one_hot=True)

    # Describe each input column: discrete, continuous, one-hot (with embedding dim) or text.
    input_dict = {}
    input_dict['discrete'] = discrete_cols
    input_dict['continuous'] = continuous_cols

    input_dict['onehot'] = {}
    for k in onehot_cols:
        dim = int(np.ceil(np.log(len(unique_onehotvals[k])) / np.log(2.0)))
        input_dict['onehot'][k] = dim

    if len(charcounts) > 0:
        text_dim = int(np.ceil(np.log(len(charcounts)) / np.log(2.0)))
        input_dict['text'] = {t: text_dim for t in text_cols}
    else:
        text_dim = 0
        input_dict['text'] = {}

    data = Dataseq(rawdata, charcounts, input_dict, unique_onehotvals, maxlens)
    data_idx = np.arange(data.__len__())
    np.random.shuffle(data_idx)

    # Split the data into six folds; each fold group defines train / early-stopping / validation folds.
    n_folds = 6
    fold_size = 1.0 * data.__len__() / n_folds
    folds = [data_idx[int(i * fold_size):int((i + 1) * fold_size)] for i in range(n_folds)]

    fold_groups = {}
    fold_groups[0] = {'train': [0, 1, 2, 4], 'es': [3], 'val': [5]}
    fold_groups[1] = {'train': [0, 2, 3, 5], 'es': [1], 'val': [4]}
    fold_groups[2] = {'train': [1, 3, 4, 5], 'es': [2], 'val': [0]}
    fold_groups[3] = {'train': [0, 2, 3, 4], 'es': [5], 'val': [1]}
    fold_groups[4] = {'train': [0, 1, 3, 5], 'es': [4], 'val': [2]}
    fold_groups[5] = {'train': [1, 2, 4, 5], 'es': [0], 'val': [3]}

    for fold in range(1):
        train_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['train']])))
        es_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['es']])))
        val_idx = np.array(folds[fold_groups[fold]['val'][0]])

        train = Subset(data, train_idx)
        es = Subset(data, es_idx)
        val = Subset(data, val_idx)

        kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
        train_iter = torch.utils.data.DataLoader(train, batch_size=mb_size, shuffle=True, **kwargs)
        es_iter = torch.utils.data.DataLoader(es, batch_size=mb_size, shuffle=True, **kwargs)
        val_iter = torch.utils.data.DataLoader(val, batch_size=mb_size, shuffle=True, **kwargs)

        # One embedding (and its inverse, EmbeddingToIndex) per one-hot column; text columns share one embedding.
        embeddings = {}
        reverse_embeddings = {}
        onehot_embedding_weights = {}
        for k in onehot_cols:
            dim = input_dict['onehot'][k]
            onehot_embedding_weights[k] = net.get_embedding_weight(len(unique_onehotvals[k]), dim)
            if use_cuda:
                onehot_embedding_weights[k] = onehot_embedding_weights[k].cuda()
            #embeddings[k] = nn.Embedding(len(unique_onehotvals[k]), dim, max_norm=1.0)
            embeddings[k] = nn.Embedding(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k], max_norm=1.0)
            reverse_embeddings[k] = net.EmbeddingToIndex(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k])

        if text_dim > 0:
            text_embedding_weights = net.get_embedding_weight(len(charcounts) + 1, text_dim)
            if use_cuda:
                text_embedding_weights = text_embedding_weights.cuda()
            #text_embedding = nn.Embedding(len(charcounts)+1, text_dim, max_norm=1.0)
            text_embedding = nn.Embedding(len(charcounts) + 1, text_dim, _weight=text_embedding_weights, max_norm=1.0)
            text_embeddingtoindex = net.EmbeddingToIndex(len(charcounts) + 1, text_dim, _weight=text_embedding_weights)
            for k in text_cols:
                embeddings[k] = text_embedding
                reverse_embeddings[k] = text_embeddingtoindex

        enc = net.Encoder(input_dict, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)
        dec = net.Decoder(input_dict, maxlens, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)

        if use_cuda:
            embeddings = {k: embeddings[k].cuda() for k in embeddings.keys()}
            reverse_embeddings = {k: reverse_embeddings[k].cuda() for k in embeddings.keys()}
            enc.cuda()
            dec.cuda()

        #print(enc.parameters)
        #print(dec.parameters)

        contrastivec = contrastive.ContrastiveLoss(margin=margin)

        #solver = optim.RMSprop([p for em in embeddings.values() for p in em.parameters()] + [p for p in enc.parameters()] + [p for p in dec.parameters()], lr=lr)
        solver = optim.Adam(
            [p for em in embeddings.values() for p in em.parameters()]
            + [p for p in enc.parameters()]
            + [p for p in dec.parameters()],
            lr=lr)

        Tsample = next(iter(es_iter))
        if use_cuda:
            Tsample = {col: Variable(tt[0:128]).cuda() for col, tt in Tsample.items()}
        else:
            Tsample = {col: Variable(tt[0:128]) for col, tt in Tsample.items()}
        print({col: tt[0] for col, tt in Tsample.items()})

        print('starting training')
        loss = 0.0
        for it in range(1000000):
            # X = Variable(torch.tensor(np.array([[1,2,4], [4,1,9]]))).cuda()
            batch_idx, T = next(enumerate(train_iter))
            if use_cuda:
                T = {col: Variable(tt).cuda() for col, tt in T.items()}
            else:
                T = {col: Variable(tt) for col, tt in T.items()}

            # Embed discrete/one-hot/text columns; continuous columns pass through as floats.
            X = {}
            for col, tt in T.items():
                if col in embeddings.keys():
                    X[col] = embeddings[col](tt)
                else:
                    X[col] = tt.float()

            mu = enc(X)
            X2 = dec(mu)

            # T2: decoded values snapped back to the nearest embedding index;
            # X2d: the re-embedded (detached) version of that discretized reconstruction.
            T2 = {}
            X2d = {col: (1.0 * tt).detach() for col, tt in X2.items()}
            for col, embedding in embeddings.items():
                T2[col] = reverse_embeddings[col](X2[col])
                X2[col] = 0.5 * X2[col] + 0.5 * embeddings[col](T2[col])
                X2d[col] = embeddings[col](T2[col].detach())

            '''
            X2d = {col: (1.0*tt).detach() for col, tt in X2.items()}
            T2 = discretize(X2d, embeddings, maxlens)
            for col, embedding in embeddings.items():
                X2d[col] = embeddings[col](T2[col].detach())
            '''

            '''
            T2 = discretize(X2, embeddings, maxlens)
            X2d = {col: (1.0*tt).detach() for col, tt in X2.items()}
            for col, embedding in embeddings.items():
                X2[col] = embeddings[col](T2[col])  #+0.05 X2[col]
                X2d[col] = embeddings[col](T2[col].detach())
            '''

            mu2 = enc(X2)
            mu2 = mu2.view(mb_size, -1)
            mu2d = enc(X2d)
            mu2d = mu2d.view(mb_size, -1)
            mu = mu.view(mb_size, -1)

            are_same = are_equal({col: x[::2] for col, x in T.items()}, {col: x[1::2] for col, x in T.items()})
            #print('f same ', torch.mean(torch.mean(are_same, 1)))

            # Contrastive objective: keep distinct rows apart, keep each row close to its
            # reconstruction, and push it away from the detached discretized reconstruction.
            #enc_loss = contrastivec(mu2[::2], mu2[1::2], torch.zeros(int(mb_size / 2)).cuda())
            enc_loss = contrastivec(mu[::2], mu[1::2], are_same)
            #enc_loss += 0.5*contrastivec(mu2[::2], mu2[1::2], are_same)
            #enc_loss += 0.5 * contrastivec(mu[::2], mu2[1::2], are_same)
            enc_loss += 1.0 * contrastivec(mu, mu2, torch.ones(mb_size).cuda())
            enc_loss += 2.0 * contrastivec(mu, mu2d, torch.zeros(mb_size).cuda())
            #enc_loss += 1.0 * contrastivec(mu2d[::2], mu2d[1::2], torch.ones(int(mb_size / 2)).cuda())
            #enc_loss += 0.5 * contrastivec(mu2d[::2], mu2d[1::2], torch.ones(int(mb_size/2)).cuda())

            '''
            adotb = torch.matmul(mu, mu.permute(1, 0))  # batch_size x batch_size
            adota = torch.matmul(mu.view(-1, 1, latent_dim), mu.view(-1, latent_dim, 1))  # batch_size x 1 x 1
            diffsquares = (adota.view(-1, 1).repeat(1, mb_size) + adota.view(1, -1).repeat(mb_size, 1) - 2 * adotb) / latent_dim
            # did I f**k up something here? diffsquares can apparently be less than 0....
            mdist = torch.sqrt(torch.clamp(torch.triu(diffsquares, diagonal=1), min=0.0))
            mdist = torch.clamp(margin - mdist, min=0.0)
            number_of_pairs = mb_size * (mb_size - 1) / 2
            enc_loss = 0.5 * torch.sum(torch.triu(torch.pow(mdist, 2), diagonal=1)) / number_of_pairs

            target = torch.ones(mu.size(0), 1)
            if use_cuda:
                target.cuda()
            enc_loss += contrastivec(mu, mu2, target.cuda())

            target = torch.zeros(mu.size(0), 1)
            if use_cuda:
                target.cuda()
            enc_loss += 2.0 * contrastivec(mu, mu2d, target.cuda())
            '''

            enc_loss.backward()
            solver.step()

            enc.zero_grad()
            dec.zero_grad()
            for col in embeddings.keys():
                embeddings[col].zero_grad()

            loss += enc_loss.data.cpu().numpy()
            veclen = torch.mean(torch.pow(mu, 2))

            if it % epoch_len == 0:
                print(it, loss / epoch_len, veclen.data.cpu().numpy())  #enc_loss.data.cpu().numpy(),

                # Monitor reconstruction quality on the held-out early-stopping sample.
                Xsample = {}
                for col, tt in Tsample.items():
                    if col in embeddings.keys():
                        Xsample[col] = embeddings[col](tt)
                    else:
                        Xsample[col] = tt.float()

                mu = enc(Xsample)
                X2sample = dec(mu)
                X2sampled = {col: tt.detach() for col, tt in X2sample.items()}
                T2sample = discretize(X2sample, embeddings, maxlens)
                mu2 = enc(X2sample)
                mu2d = enc(X2sampled)

                if 'Fare' in continuous_cols and 'Age' in continuous_cols:
                    print([np.mean(np.abs(Xsample[col].data.cpu().numpy() - X2sample[col].data.cpu().numpy())) for col in ['Fare', 'Age']])
                print({col: tt[0:2].data.cpu().numpy() for col, tt in T2sample.items()})

                if 'Survived' in onehot_cols:
                    print('% survived correct: ',
                          np.mean(T2sample['Survived'].data.cpu().numpy() == Tsample['Survived'].data.cpu().numpy()),
                          np.mean(Tsample['Survived'].data.cpu().numpy() == np.ones_like(Tsample['Survived'].data.cpu().numpy())))

                if 'Cabin' in text_cols:
                    print(embeddings['Cabin'].weight[data.charindex['1']])

                are_same = are_equal({col: x[::2] for col, x in Tsample.items()}, {col: x[1::2] for col, x in Tsample.items()})
                # print('f same ', torch.mean(torch.mean(are_same, 1)))
                # enc_loss = contrastivec(mu2[::2], mu2[1::2], torch.zeros(int(mb_size / 2)).cuda())
                #es_loss = contrastivec(mu[::2], mu[1::2], are_same)
                # enc_loss += 0.25*contrastivec(mu2[::2], mu2[1::2], are_same)
                # enc_loss += 0.5 * contrastivec(mu[::2], mu2[1::2], are_same)
                es_loss = 1.0 * contrastivec(mu, mu2, torch.ones(mu.size(0)).cuda())
                #es_loss += 2.0 * contrastivec(mu, mu2d, torch.zeros(mu.size(0)).cuda())
                #print('mean mu ', torch.mean(torch.pow(mu, 2)))
                print('es loss ', es_loss)

                loss = 0.0

            #print(T2.data.cpu()[0, 0:30].numpy())
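

# A minimal sketch (not this repo's actual implementation) of the margin-based pairwise
# contrastive loss that `contrastive.ContrastiveLoss(margin=margin)` is assumed to provide
# in train() above: pairs labelled 1 are pulled together, pairs labelled 0 are pushed at
# least `margin` apart in latent space. The real module lives in contrastive.py and may
# differ in detail (e.g. how it handles the per-column `are_same` matrix). Relies on the
# module-level torch / nn imports.
class _ContrastiveLossSketch(nn.Module):
    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, a, b, same):
        # a, b: (batch, latent_dim); same: 1.0 where the pair should coincide, 0.0 otherwise.
        d = torch.sqrt(torch.clamp(torch.sum((a - b) ** 2, dim=1), min=1e-12))
        pull = same * d ** 2
        push = (1.0 - same) * torch.clamp(self.margin - d, min=0.0) ** 2
        return 0.5 * torch.mean(pull + push)
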
def do_train(rawdata, charcounts, maxlens, unique_onehotvals):
    """Train with the GaussianOverlap loss and monitor reconstruction accuracy on the early-stopping split."""
    n_batches = 2000
    mb_size = 128
    lr = 2.0e-4
    momentum = 0.5
    cnt = 0
    latent_dim = 32  #24#
    recurrent_hidden_size = 24

    epoch_len = 8
    max_veclen = 0.0
    patience = 12 * epoch_len
    patience_duration = 0

    # mnist = input_data.read_data_sets('../../MNIST_data', one_hot=True)

    input_dict = {}
    input_dict['discrete'] = discrete_cols
    input_dict['continuous'] = continuous_cols

    input_dict['onehot'] = {}
    for k in onehot_cols:
        dim = int(np.ceil(np.log(len(unique_onehotvals[k])) / np.log(2.0)))
        input_dict['onehot'][k] = dim

    if len(charcounts) > 0:
        text_dim = int(np.ceil(np.log(len(charcounts)) / np.log(2.0)))
        input_dict['text'] = {t: text_dim for t in text_cols}
    else:
        text_dim = 0
        input_dict['text'] = {}

    data = Dataseq(rawdata, charcounts, input_dict, unique_onehotvals, maxlens)
    data_idx = np.arange(data.__len__())
    np.random.shuffle(data_idx)

    n_folds = 6
    fold_size = 1.0 * data.__len__() / n_folds
    folds = [data_idx[int(i * fold_size):int((i + 1) * fold_size)] for i in range(n_folds)]

    fold_groups = {}
    fold_groups[0] = {'train': [0, 1, 2, 4], 'es': [3], 'val': [5]}
    fold_groups[1] = {'train': [0, 2, 3, 5], 'es': [1], 'val': [4]}
    fold_groups[2] = {'train': [1, 3, 4, 5], 'es': [2], 'val': [0]}
    fold_groups[3] = {'train': [0, 2, 3, 4], 'es': [5], 'val': [1]}
    fold_groups[4] = {'train': [0, 1, 3, 5], 'es': [4], 'val': [2]}
    fold_groups[5] = {'train': [1, 2, 4, 5], 'es': [0], 'val': [3]}

    for fold in range(1):
        train_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['train']])))
        es_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['es']])))
        val_idx = np.array(folds[fold_groups[fold]['val'][0]])

        train = Subset(data, train_idx)
        es = Subset(data, es_idx)
        val = Subset(data, val_idx)

        kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
        train_iter = torch.utils.data.DataLoader(train, batch_size=int(mb_size / 1), shuffle=True, **kwargs)
        train_iter_unshuffled = torch.utils.data.DataLoader(train, batch_size=mb_size, shuffle=False, **kwargs)
        es_iter = torch.utils.data.DataLoader(es, batch_size=mb_size, shuffle=True, **kwargs)
        val_iter = torch.utils.data.DataLoader(val, batch_size=mb_size, shuffle=True, **kwargs)

        embeddings = {}
        reverse_embeddings = {}
        onehot_embedding_weights = {}
        onehot_embedding_spread = {}
        for k in onehot_cols:
            dim = input_dict['onehot'][k]
            onehot_embedding_weights[k] = net.get_embedding_weight(len(unique_onehotvals[k]), dim, use_cuda=use_cuda)
            embeddings[k] = nn.Embedding(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k])
            reverse_embeddings[k] = net.EmbeddingToIndex(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k])

        if text_dim > 0:
            text_embedding_weights = net.get_embedding_weight(len(charcounts) + 1, text_dim, use_cuda=use_cuda)
            text_embedding = nn.Embedding(len(charcounts) + 1, text_dim, _weight=text_embedding_weights)
            text_embeddingtoindex = net.EmbeddingToIndex(len(charcounts) + 1, text_dim, _weight=text_embedding_weights)
            for k in text_cols:
                embeddings[k] = text_embedding
                reverse_embeddings[k] = text_embeddingtoindex

        enc = net.Encoder(input_dict, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)
        dec = net.Decoder(input_dict, maxlens, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)

        if use_cuda:
            embeddings = {k: embeddings[k].cuda() for k in embeddings.keys()}
            enc.cuda()
            dec.cuda()

        #print(enc.parameters)
        #print(dec.parameters)

        #contrastivec = contrastive.ContrastiveLoss(margin=margin)
        logloss = contrastive.GaussianOverlap()

        #solver = optim.RMSprop([p for em in embeddings.values() for p in em.parameters()] + [p for p in enc.parameters()] + [p for p in dec.parameters()], lr=lr)
        #solver = optim.Adam(
        #    [p for em in embeddings.values() for p in em.parameters()] + [p for p in enc.parameters()] + [p for p in
        #     dec.parameters()],
        #    lr=lr)
        solver = optim.RMSprop(
            [p for em in embeddings.values() for p in em.parameters()]
            + [p for p in enc.parameters()]
            + [p for p in dec.parameters()],
            lr=lr, momentum=momentum)

        Tsample = next(iter(es_iter))
        if use_cuda:
            Tsample = {col: Variable(tt).cuda() for col, tt in Tsample.items()}
        else:
            Tsample = {col: Variable(tt) for col, tt in Tsample.items()}
        print({col: tt[0] for col, tt in Tsample.items()})

        print('starting training')
        loss = 0.0
        loss0 = 0.0
        loss1 = 0.0
        loss2 = 0.0
        loss3 = 0.0
        logger_df = pd.DataFrame(columns=['iter', 'train_loss', 'train_veclen', 'es_veclen', 'Survived_correct', 'Survived_false'])

        for it in range(n_batches):
            # X = Variable(torch.tensor(np.array([[1,2,4], [4,1,9]]))).cuda()
            T = next(iter(train_iter))
            #for col, value in T.items():
            #    T[col] = torch.cat((value, value, value, value), 0)

            T, X, X2, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d = calc_mus(T, embeddings, reverse_embeddings, enc, dec)
            enc_loss, enc_loss0, enc_loss1, enc_loss2, enc_loss3 = calc_losses(T, embeddings, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d, logloss)

            enc_loss.backward()
            solver.step()

            enc.zero_grad()
            dec.zero_grad()
            for col in embeddings.keys():
                embeddings[col].zero_grad()

            loss += enc_loss.data.cpu().numpy()
            loss0 += enc_loss0.data.cpu().numpy()
            loss1 += enc_loss1.data.cpu().numpy()
            loss2 += enc_loss2.data.cpu().numpy()
            loss3 += enc_loss3.data.cpu().numpy()
            veclen = torch.mean(torch.pow(mu, 2))

            if it % epoch_len == 0:
                print(it, loss / epoch_len, loss0 / epoch_len, loss1 / epoch_len, loss2 / epoch_len, loss3 / epoch_len,
                      veclen.data.cpu().numpy())  #enc_loss.data.cpu().numpy(),

                # Re-encode the whole (unshuffled) training split to monitor the loss on it.
                if use_cuda:
                    mu = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar = torch.zeros(len(train), mu.size(1)).cuda()
                    mu2 = torch.zeros(len(train), mu.size(1)).cuda()
                    mu2d = torch.zeros(len(train), mu.size(1)).cuda()
                    mu_tm = torch.zeros((len(train),) + mu_tm.size()[1:]).cuda()
                    logvar2 = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar2d = torch.zeros(len(train), mu.size(1)).cuda()
                else:
                    mu = torch.zeros(len(train), mu.size(1))
                    logvar = torch.zeros(len(train), mu.size(1))
                    mu2 = torch.zeros(len(train), mu.size(1))
                    mu2d = torch.zeros(len(train), mu.size(1))
                    mu_tm = torch.zeros((len(train),) + mu_tm.size()[1:])
                    logvar2 = torch.zeros(len(train), mu.size(1))
                    logvar2d = torch.zeros(len(train), mu.size(1))

                s = 0
                for T0 in train_iter_unshuffled:
                    e = s + T0[to_predict[0]].size(0)
                    if s == 0:
                        T = {col: torch.zeros((len(train),) + value.size()[1:], dtype=value.dtype) for col, value in T0.items()}
                    T0, _, _, mu[s:e], logvar[s:e], mu2[s:e], mu2d[s:e], mu_tm[s:e], logvar2[s:e], logvar2d[s:e] = calc_mus(T0, embeddings, reverse_embeddings, enc, dec, mode='val')
                    for col, value in T0.items():
                        T[col][s:e] = T0[col]
                    s = e

                enc_loss, enc_loss0, enc_loss1, enc_loss2, enc_loss3 = calc_losses(T, embeddings, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d, logloss, lookfordups=False)
                vl = torch.mean(torch.pow(mu, 2))
                print(f'train enc loss {enc_loss}')
                print(f'train veclen {vl}')
                print(f'mean train logvar {torch.mean(logvar)}')
                logger_df.loc[int(it / epoch_len), ['iter', 'train_loss', 'train_veclen']] = [it, enc_loss.data.cpu().numpy(), vl.data.cpu().numpy()]

                # Evaluate reconstruction accuracy on the early-stopping split, with the target column masked.
                if use_cuda:
                    mu = torch.zeros(len(es), mu.size(1)).cuda()
                    logvar = torch.zeros(len(es), mu.size(1)).cuda()
                    mu2 = torch.zeros(len(es), mu.size(1)).cuda()
                    mu2d = torch.zeros(len(es), mu.size(1)).cuda()
                else:
                    mu = torch.zeros(len(es), mu.size(1))
                    logvar = torch.zeros(len(es), mu.size(1))
                    mu2 = torch.zeros(len(es), mu.size(1))
                    mu2d = torch.zeros(len(es), mu.size(1))

                s = 0
                targets = {}
                for T0 in es_iter:
                    e = s + T0[to_predict[0]].size(0)
                    if s == 0:
                        T = {col: torch.zeros((len(es),) + value.size()[1:], dtype=value.dtype) for col, value in T0.items()}
                        correct = {col: np.zeros((len(es),) + value.size()[1:]) for col, value in T0.items()}
                        actual = {col: np.zeros((len(es),) + value.size()[1:]) for col, value in T0.items()}

                    Xsample = {}
                    for col, tt in T0.items():
                        if use_cuda:
                            tt = Variable(tt).cuda()
                        else:
                            tt = Variable(tt)
                        if col in embeddings.keys():
                            Xsample[col] = embeddings[col](tt)
                        else:
                            Xsample[col] = tt.float()
                        if col in to_predict:
                            targets[col] = tt
                            Xsample[col] = 0.0 * Xsample[col]

                    mu[s:e], logvar[s:e] = enc(Xsample)
                    X2sample = dec(mu[s:e])
                    T2sample = discretize(X2sample, embeddings, maxlens)
                    mu2[s:e], _ = enc(X2sample)

                    T2 = {}
                    X2dsample = {col: (1.0 * tt).detach() for col, tt in X2sample.items()}

                    for col in continuous_cols:
                        if col in to_predict:
                            correct[col][s:e] = np.abs(X2sample[col].data.cpu().numpy().reshape(-1) - targets[col].data.cpu().numpy().reshape(-1))
                            actual[col][s:e] = targets[col].data.cpu().numpy().reshape(-1)
                        else:
                            correct[col][s:e] = np.abs(X2sample[col].data.cpu().numpy().reshape(-1) - T0[col].data.cpu().numpy().reshape(-1))
                            actual[col][s:e] = T0[col].data.cpu().numpy().reshape(-1)

                    for col, embedding in embeddings.items():
                        # T2[col] = reverse_embeddings[col](X2sample[col])
                        X2dsample[col] = embeddings[col](T2sample[col].detach())
                        if col in to_predict:
                            correct[col][s:e] = np.abs(T2sample[col].data.cpu().numpy() == targets[col].data.cpu().numpy())
                            actual[col][s:e] = targets[col].data.cpu().numpy().reshape(-1)
                        else:
                            correct[col][s:e] = np.abs(T2sample[col].data.cpu().numpy() == T0[col].data.cpu().numpy())
                            actual[col][s:e] = T0[col].data.cpu().numpy().reshape(-1)

                    mu2d[s:e], _ = enc(X2dsample)
                    s = e

                #enc_loss, enc_loss0, enc_loss1, enc_loss2, enc_loss3 = calc_losses(T, embeddings, mu, logvar, mu2, mu2d, mu_tm, logvar2, logloss, lookfordups=False)
                #print(f'es enc loss {enc_loss}')
                vl = torch.mean(torch.pow(mu, 2))
                print(f'es veclen {vl}')
                print(f'mean es logvar {torch.mean(logvar)}')
                logger_df.loc[int(it / epoch_len), ['es_veclen', 'Survived_correct', 'Survived_false']] = vl.data.cpu().numpy(), np.mean(correct['Survived']), np.mean(actual['Survived'] == 0)

                for col in continuous_cols:
                    #print(np.abs(T0[col].data.cpu().numpy().reshape(-1) - T2sample[col].data.cpu().numpy().reshape(-1)))
                    print(f'% {col} mae: {np.mean(correct[col])}')
                for col in onehot_cols:
                    print(f'% {col} correct: {np.mean(correct[col])} {np.mean(actual[col]==0)}')

                '''
                for col in continuous_cols:
                    mae = np.mean(np.abs(X[col].data.cpu().numpy() - X2[col].data.cpu().numpy()))
                    mse = np.mean(np.square(X[col].data.cpu().numpy() - X2[col].data.cpu().numpy()))
                    print(f'train mae, mse {col} {mae} {mse}')
                    mae = np.mean(np.abs(Xsample[col].data.cpu().numpy() - X2sample[col].data.cpu().numpy()))
                    mse = np.mean(np.square(Xsample[col].data.cpu().numpy() - X2sample[col].data.cpu().numpy()))
                    print(f'val mae, mse {col} {mae} {mse}')

                print({col: tt[0:2].data.cpu().numpy() for col, tt in T2sample.items()})
                if 'Survived' in onehot_cols:
                    print('% survived correct: ',
                          np.mean(T2sample['Survived'].data.cpu().numpy() == Tsample['Survived'].data.cpu().numpy()),
                          np.mean(Tsample['Survived'].data.cpu().numpy() == np.ones_like(Tsample['Survived'].data.cpu().numpy())))
                if 'Sex' in onehot_cols:
                    print('% sex correct: ',
                          np.mean(T2sample['Sex'].data.cpu().numpy() == Tsample['Sex'].data.cpu().numpy()),
                          np.mean(Tsample['Sex'].data.cpu().numpy() == np.ones_like(Tsample['Sex'].data.cpu().numpy())))
                if 'Embarked' in onehot_cols:
                    print('% Embarked correct: ',
                          np.mean(T2sample['Embarked'].data.cpu().numpy() == Tsample['Embarked'].data.cpu().numpy()))
                    print(onehot_embedding_weights['Embarked'])
                if 'Pclass' in onehot_cols:
                    print('% Pclass correct: ',
                          np.mean(T2sample['Pclass'].data.cpu().numpy() == Tsample['Pclass'].data.cpu().numpy()))
                if 'Cabin' in text_cols:
                    print(embeddings['Cabin'].weight[data.charindex['1']])
                if 'Pclass' in onehot_cols:
                    diff = torch.mean(torch.pow(embeddings['Pclass'].weight - reverse_embeddings['Pclass'].weight, 2)).data.cpu().numpy()
                    print(f'diff Pclass emb and reverse_emb: {diff}')
                    print(embeddings['Pclass'].weight.data.cpu().numpy())
                '''

                loss = 0.0
                loss0 = 0.0
                loss1 = 0.0
                loss2 = 0.0
                loss3 = 0.0

            #print(T2.data.cpu()[0, 0:30].numpy())

        logger_df.to_csv('logger_' + str(fold) + '.csv', index=False)
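

# Example invocation (a sketch, not from this repo): `prepare_titanic_data` is a hypothetical
# stand-in for whatever preprocessing builds rawdata, charcounts, maxlens and unique_onehotvals
# from the Titanic CSV. The module-level globals (discrete_cols, continuous_cols, onehot_cols,
# text_cols, to_predict, use_cuda, net, contrastive, ...) are assumed to be set before calling.
#
#   rawdata, charcounts, maxlens, unique_onehotvals = prepare_titanic_data('train.csv')
#   do_train(rawdata, charcounts, maxlens, unique_onehotvals)
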
def do_train(rawdata, charcounts, maxlens, unique_onehotvals):
    """Train with the GaussianOverlap loss and evaluate target-column predictions (loss and accuracy) on the validation split."""
    train_f_labeled = 0.2
    n_batches = 2800
    mb_size = 128
    lr = 2.0e-4
    momentum = 0.5
    cnt = 0
    latent_dim = 32  #24#
    recurrent_hidden_size = 24

    epoch_len = 8
    max_veclen = 0.0
    patience = 12 * epoch_len
    patience_duration = 0

    input_dict = {}
    input_dict['discrete'] = discrete_cols
    input_dict['continuous'] = continuous_cols

    input_dict['onehot'] = {}
    for k in onehot_cols:
        dim = int(np.ceil(np.log(len(unique_onehotvals[k])) / np.log(2.0)))
        input_dict['onehot'][k] = dim

    if len(charcounts) > 0:
        text_dim = int(np.ceil(np.log(len(charcounts)) / np.log(2.0)))
        input_dict['text'] = {t: text_dim for t in text_cols}
    else:
        text_dim = 0
        input_dict['text'] = {}

    #data = Dataseq(rawdata, charcounts, input_dict, unique_onehotvals, maxlens)
    #data_idx = np.arange(data.__len__())
    data_idx = np.arange(rawdata.shape[0])
    np.random.shuffle(data_idx)

    n_folds = 6
    fold_size = 1.0 * rawdata.shape[0] / n_folds  #data.__len__() / n_folds
    folds = [data_idx[int(i * fold_size):int((i + 1) * fold_size)] for i in range(n_folds)]

    fold_groups = {}
    fold_groups[0] = {'train': [0, 1, 2, 3], 'val': [4]}
    fold_groups[1] = {'train': [1, 2, 3, 4], 'val': [0]}
    fold_groups[2] = {'train': [0, 2, 3, 4], 'val': [1]}
    fold_groups[3] = {'train': [0, 1, 3, 4], 'val': [2]}
    fold_groups[4] = {'train': [0, 1, 2, 4], 'val': [3]}

    for fold in range(1):
        train_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['train']])))
        val_idx = np.array(list(itertools.chain.from_iterable([folds[i] for i in fold_groups[fold]['val']])))

        np.random.shuffle(train_idx)
        train_labeled_idx = train_idx[0:int(train_f_labeled * len(train_idx))]
        train_unlabeled_idx = train_idx[int(train_f_labeled * len(train_idx)):]

        data = Dataseq(rawdata, charcounts, input_dict, unique_onehotvals, maxlens)
        train = Subset(data, train_idx)
        val = Subset(data, val_idx)

        kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
        train_iter = torch.utils.data.DataLoader(train, batch_size=int(mb_size / 1), shuffle=True, **kwargs)
        train_iter_unshuffled = torch.utils.data.DataLoader(train, batch_size=mb_size, shuffle=False, **kwargs)
        val_iter = torch.utils.data.DataLoader(val, batch_size=mb_size, shuffle=False, **kwargs)

        embeddings = {}
        reverse_embeddings = {}
        onehot_embedding_weights = {}
        for k in onehot_cols:
            dim = input_dict['onehot'][k]
            onehot_embedding_weights[k] = net.get_embedding_weight(len(unique_onehotvals[k]), dim, use_cuda=use_cuda)
            embeddings[k] = nn.Embedding(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k])
            reverse_embeddings[k] = net.EmbeddingToIndex(len(unique_onehotvals[k]), dim, _weight=onehot_embedding_weights[k])

        if text_dim > 0:
            text_embedding_weights = net.get_embedding_weight(len(charcounts) + 1, text_dim, use_cuda=use_cuda)
            text_embedding = nn.Embedding(len(charcounts) + 1, text_dim, _weight=text_embedding_weights)
            text_embeddingtoindex = net.EmbeddingToIndex(len(charcounts) + 1, text_dim, _weight=text_embedding_weights)
            for k in text_cols:
                embeddings[k] = text_embedding
                reverse_embeddings[k] = text_embeddingtoindex

        enc = net.Encoder(input_dict, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)
        dec = net.Decoder(input_dict, maxlens, dim=latent_dim, recurrent_hidden_size=recurrent_hidden_size)

        if use_cuda:
            embeddings = {k: embeddings[k].cuda() for k in embeddings.keys()}
            enc.cuda()
            dec.cuda()

        logloss = contrastive.GaussianOverlap()

        solver = optim.RMSprop(
            [p for em in embeddings.values() for p in em.parameters()]
            + [p for p in enc.parameters()]
            + [p for p in dec.parameters()],
            lr=lr, momentum=momentum)

        print('starting training')
        loss = 0.0
        loss0 = 0.0
        loss1 = 0.0
        loss2 = 0.0
        loss3 = 0.0
        logger_df = pd.DataFrame(columns=['iter', 'train_loss', 'train_veclen', 'val_veclen', 'val_loss', 'val_acc']
                                 + [t + '_correct' for t in to_predict]
                                 + [t + '_false' for t in to_predict])

        for it in range(n_batches):
            T = next(iter(train_iter))
            # for col, value in T.items():
            #     T[col] = torch.cat((value, value, value, value), 0)

            T, X, X2, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d, logvar_tm = calc_mus(T, embeddings, reverse_embeddings, enc, dec)
            enc_loss, enc_loss0, enc_loss1, enc_loss2, enc_loss3 = calc_losses(T, embeddings, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d, logvar_tm, logloss)

            enc_loss.backward()
            solver.step()

            enc.zero_grad()
            dec.zero_grad()
            for col in embeddings.keys():
                embeddings[col].zero_grad()

            loss += enc_loss.data.cpu().numpy()
            loss0 += enc_loss0.data.cpu().numpy()
            loss1 += enc_loss1.data.cpu().numpy()
            loss2 += enc_loss2.data.cpu().numpy()
            loss3 += enc_loss3.data.cpu().numpy()
            veclen = torch.mean(torch.pow(mu, 2))

            if it % epoch_len == 0:
                print(it, loss / epoch_len, loss0 / epoch_len, loss1 / epoch_len, loss2 / epoch_len, loss3 / epoch_len,
                      veclen.data.cpu().numpy())  # enc_loss.data.cpu().numpy(),

                # Re-encode the whole (unshuffled) training split to monitor the loss and
                # the target-prediction log loss on it.
                n_targetvals = embeddings[to_predict[0]].weight.size(0)
                if use_cuda:
                    mu = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar = torch.zeros(len(train), mu.size(1)).cuda()
                    mu2 = torch.zeros(len(train), mu.size(1)).cuda()
                    mu2d = torch.zeros(len(train), mu.size(1)).cuda()
                    mu_tm = torch.zeros((len(train),) + mu_tm.size()[1:]).cuda()
                    logvar2 = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar2d = torch.zeros(len(train), mu.size(1)).cuda()
                    logvar_tm = torch.zeros(len(train), 1 + n_targetvals, mu.size(1)).cuda()
                    train_loss = torch.zeros(len(train)).cuda()
                else:
                    mu = torch.zeros(len(train), mu.size(1))
                    logvar = torch.zeros(len(train), mu.size(1))
                    mu2 = torch.zeros(len(train), mu.size(1))
                    mu2d = torch.zeros(len(train), mu.size(1))
                    mu_tm = torch.zeros((len(train),) + mu_tm.size()[1:])
                    logvar2 = torch.zeros(len(train), mu.size(1))
                    logvar2d = torch.zeros(len(train), mu.size(1))
                    logvar_tm = torch.zeros(len(train), 1 + n_targetvals, mu.size(1))
                    train_loss = torch.zeros(len(train))

                s = 0
                for T0 in train_iter_unshuffled:
                    e = s + T0[to_predict[0]].size(0)
                    if s == 0:
                        T = {col: torch.zeros((len(train),) + value.size()[1:], dtype=value.dtype) for col, value in T0.items()}
                    T0, Xsample, _, mu[s:e], logvar[s:e], mu2[s:e], mu2d[s:e], mu_tm[s:e], logvar2[s:e], logvar2d[s:e], logvar_tm[s:e] = calc_mus(T0, embeddings, reverse_embeddings, enc, dec, mode='val')
                    for col, value in T0.items():
                        T[col][s:e] = T0[col]

                    n_targetvals = embeddings[to_predict[0]].weight.size(0)
                    mu_tm[s:e, 0, :] = 1.0 * mu[s:e]
                    if use_cuda:
                        p = torch.zeros((e - s), n_targetvals).cuda()
                    else:
                        p = torch.zeros((e - s), n_targetvals)
                    # encodings for all the possible target embedding values
                    for i in range(n_targetvals):
                        if use_cuda:
                            t = {col: Xsample[col] if not col in to_predict
                                 else embeddings[col](i * torch.ones_like(T0[col]).cuda())
                                 for col in Xsample.keys()}
                            mu_tm[s:e, i + 1, :], _ = enc(t)
                        else:
                            mu_tm[s:e, i + 1, :], _ = enc({col: Xsample[col] if not col in to_predict
                                                           else embeddings[col](i * torch.ones_like(T0[col]))
                                                           for col in Xsample.keys()})
                        diffsquares = torch.sqrt(torch.mean(torch.pow(mu_tm[s:e, 0, :] - mu_tm[s:e, i + 1, :], 2), 1))
                        p[:, i] = 1.0 - torch.abs(torch.erf(diffsquares / 2.0))

                    labels = T0[to_predict[0]]
                    target = torch.zeros(e - s, n_targetvals)
                    target[torch.arange(e - s), labels] = 1
                    if use_cuda:
                        target = target.cuda()
                    #print(target[0:5])
                    #print(p[0:5])
                    p = p / torch.sum(p, 1).view(-1, 1).repeat(1, n_targetvals)
                    train_loss[s:e] += -torch.mean(target * torch.log(torch.clamp(p, 1e-8, 1.0))
                                                   + (1 - target) * torch.log(torch.clamp(1 - p, 1e-8, 1.0)), 1)
                    s = e

                enc_loss, enc_loss0, enc_loss1, enc_loss2, enc_loss3 = calc_losses(T, embeddings, mu, logvar, mu2, mu2d, mu_tm, logvar2, logvar2d, logvar_tm, logloss, lookfordups=False)
                vl = torch.mean(torch.pow(mu, 2))
                print(f'train enc loss {enc_loss}')
                print(f'train veclen {vl}')
                print(f'mean train logvar {torch.mean(logvar)}')
                print(f'mean train_loss {torch.mean(train_loss)}')
                logger_df.loc[int(it / epoch_len), ['iter', 'train_loss', 'train_veclen', 'train_loss']] = [
                    it, enc_loss.data.cpu().numpy(), vl.data.cpu().numpy(), torch.mean(train_loss).data.cpu().numpy()]

                # Evaluate reconstruction and target prediction on the validation split.
                if use_cuda:
                    mu = torch.zeros(len(val), mu.size(1)).cuda()
                    logvar = torch.zeros(len(val), mu.size(1)).cuda()
                    mu2 = torch.zeros(len(val), mu.size(1)).cuda()
                    mu2d = torch.zeros(len(val), mu.size(1)).cuda()
                    n_targetvals = embeddings[to_predict[0]].weight.size(0)
                    mu_tm = torch.zeros(len(val), 1 + n_targetvals, mu.size(1)).cuda()
                    val_loss = torch.zeros(len(val)).cuda()
                    val_accuracy = torch.zeros(len(val)).cuda()
                else:
                    mu = torch.zeros(len(val), mu.size(1))
                    logvar = torch.zeros(len(val), mu.size(1))
                    mu2 = torch.zeros(len(val), mu.size(1))
                    mu2d = torch.zeros(len(val), mu.size(1))
                    n_targetvals = embeddings[to_predict[0]].weight.size(0)
                    mu_tm = torch.zeros(len(val), 1 + n_targetvals, mu.size(1))
                    val_loss = torch.zeros(len(val))
                    val_accuracy = torch.zeros(len(val))

                s = 0
                targets = {}
                for T0 in val_iter:
                    e = s + T0[to_predict[0]].size(0)
                    print(s, e)
                    if s == 0:
                        correct = {col: np.zeros((len(val),) + v.size()[1:]) for col, v in T0.items()}
                        actual = {col: np.zeros((len(val),) + v.size()[1:]) for col, v in T0.items()}

                    Xsample = {}
                    for col, tt in T0.items():
                        if use_cuda:
                            tt = Variable(tt).cuda()
                        else:
                            tt = Variable(tt)
                        if col in embeddings.keys():
                            Xsample[col] = embeddings[col](tt)
                        else:
                            Xsample[col] = tt.float()
                        if col in to_predict:
                            targets[col] = tt
                            Xsample[col] = 0.0 * Xsample[col]

                    mu[s:e], logvar[s:e] = enc(Xsample)
                    X2sample = dec(mu[s:e])
                    T2sample = discretize(X2sample, embeddings, maxlens)
                    mu2[s:e], _ = enc(X2sample)

                    X2dsample = {col: (1.0 * tt).detach() for col, tt in X2sample.items()}

                    for col in continuous_cols:
                        if col in to_predict:
                            correct[col][s:e] = np.abs(X2sample[col].data.cpu().numpy().reshape(-1) - targets[col].data.cpu().numpy().reshape(-1))
                            actual[col][s:e] = targets[col].data.cpu().numpy().reshape(-1)
                        else:
                            correct[col][s:e] = np.abs(X2sample[col].data.cpu().numpy().reshape(-1) - T0[col].data.cpu().numpy().reshape(-1))
                            actual[col][s:e] = T0[col].data.cpu().numpy().reshape(-1)

                    for col, embedding in embeddings.items():
                        # T2[col] = reverse_embeddings[col](X2sample[col])
                        X2dsample[col] = embeddings[col](T2sample[col].detach())
                        if col in to_predict:
                            correct[col][s:e] = np.abs(T2sample[col].data.cpu().numpy() == targets[col].data.cpu().numpy())
                            actual[col][s:e] = targets[col].data.cpu().numpy().reshape(-1)
                        else:
                            correct[col][s:e] = np.abs(T2sample[col].data.cpu().numpy() == T0[col].data.cpu().numpy())
                            actual[col][s:e] = T0[col].data.cpu().numpy().reshape(-1)

                    mu2d[s:e], _ = enc(X2dsample)

                    # calculate target predictions for validation data
                    n_targetvals = embeddings[to_predict[0]].weight.size(0)
                    mu_tm[s:e, 0, :] = 1.0 * mu[s:e]
                    if use_cuda:
                        p = torch.zeros((e - s), n_targetvals).cuda()
                    else:
                        p = torch.zeros((e - s), n_targetvals)

                    # generate encodings for all the possible target embedding values
                    for i in range(n_targetvals):
                        if use_cuda:
                            t = {col: Xsample[col] if not col in to_predict
                                 else embeddings[col](i * torch.ones_like(T0[col]).cuda())
                                 for col in Xsample.keys()}
                            mu_tm[s:e, i + 1, :], _ = enc(t)
                        else:
                            mu_tm[s:e, i + 1, :], _ = enc({col: Xsample[col] if not col in to_predict
                                                           else embeddings[col](i * torch.ones_like(T0[col]))
                                                           for col in Xsample.keys()})
                        diffsquares = torch.sqrt(torch.mean(torch.pow(mu_tm[s:e, 0, :] - mu_tm[s:e, i + 1, :], 2), 1))
                        p[:, i] = 1.0 - torch.abs(torch.erf(diffsquares / 2.0))
                        #print(mu_tm[s:s+5, i + 1, 0:5])
                        print(diffsquares[0:5])

                    labels = T0[to_predict[0]]
                    target = torch.zeros(e - s, n_targetvals)
                    target[torch.arange(e - s), labels] = 1
                    if use_cuda:
                        target = target.cuda()
                        labels = labels.cuda()

                    p = p / torch.sum(p, 1).view(-1, 1).repeat(1, n_targetvals)
                    val_accuracy[s:e] = torch.eq(labels, torch.max(p, 1)[1]).float()
                    val_loss[s:e] += -torch.mean(target * torch.log(torch.clamp(p, 1e-8, 1.0))
                                                 + (1 - target) * torch.log(torch.clamp(1 - p, 1e-8, 1.0)), 1)
                    s = e

                vl = torch.mean(torch.pow(mu, 2))
                print(f'val veclen {vl}')
                print(f'mean val logvar {torch.mean(logvar)}')
                print(f'mean val_loss {torch.mean(val_loss)}')
                print(f'mean val_accuracy {torch.mean(val_accuracy)}')
                logger_df.loc[int(it / epoch_len), ['val_veclen', 'val_loss', 'val_acc']] = (
                    vl.data.cpu().numpy(), torch.mean(val_loss).data.cpu().numpy(), torch.mean(val_accuracy).data.cpu().numpy())
                for target_col in to_predict:
                    logger_df.loc[int(it / epoch_len), [target_col + '_correct', target_col + '_false']] = (
                        np.mean(correct[target_col]), np.mean(actual[target_col] == 0))

                for col in continuous_cols:
                    # print(np.abs(T0[col].data.cpu().numpy().reshape(-1) - T2sample[col].data.cpu().numpy().reshape(-1)))
                    print(f'% {col} mae: {np.mean(correct[col])}')
                for col in onehot_cols:
                    print(f'% {col} correct: {np.mean(correct[col])} {np.mean(actual[col]==0)}')

                loss = 0.0
                loss0 = 0.0
                loss1 = 0.0
                loss2 = 0.0
                loss3 = 0.0

            # print(T2.data.cpu()[0, 0:30].numpy())

        logger_df.to_csv('logger_' + str(fold) + '.csv', index=False)
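

# Standalone sketch (names here are illustrative, not from this repo) of the distance-to-probability
# rule used above for target prediction: each candidate value of the target column is embedded and
# the row re-encoded; the RMS distance between that encoding and the encoding of the row with the
# target masked is mapped to a pseudo-probability p_i ~ 1 - |erf(d_i / 2)| and normalised over the
# candidates, mirroring the training- and validation-monitoring loops of do_train().
def _candidate_probs_sketch(mu_masked, mu_candidates):
    # mu_masked: (batch, latent_dim); mu_candidates: (batch, n_targetvals, latent_dim)
    d = torch.sqrt(torch.mean(torch.pow(mu_candidates - mu_masked.unsqueeze(1), 2), dim=2))
    p = 1.0 - torch.abs(torch.erf(d / 2.0))
    return p / torch.clamp(torch.sum(p, dim=1, keepdim=True), min=1e-8)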