import _data
import torch
import torch.utils.data


def data_loader(dat, bsz):
    """Wrap a feature matrix in a shuffling DataLoader.

    The label column comes from the shared train/test split; the row count
    of `dat` decides which split it belongs to.
    """
    train_data, test_data = _data.data()
    x = torch.from_numpy(dat).double()
    # Heuristic: anything larger than 10,000 rows is assumed to be the
    # training split. (DataFrame.as_matrix was removed in pandas 1.0, so
    # the label column is pulled out with .to_numpy() instead.)
    if len(dat) > 10000:
        y = torch.LongTensor(train_data["diabetes"].to_numpy().reshape(-1).tolist())
    else:
        y = torch.LongTensor(test_data["diabetes"].to_numpy().reshape(-1).tolist())
    data_set = torch.utils.data.TensorDataset(x, y)
    return torch.utils.data.DataLoader(data_set, batch_size=bsz, shuffle=True)
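# A minimal, hedged alternative to the row-count heuristic above: pass the
# labels explicitly so the loader does not have to guess which split `dat`
# came from. `make_loader` is a hypothetical helper, not part of this repo.
import numpy as np


def make_loader(features, labels, bsz, shuffle=True):
    # features: 2-D float array; labels: 1-D integer array of equal length.
    x = torch.from_numpy(np.asarray(features)).double()
    y = torch.from_numpy(np.asarray(labels)).long()
    data_set = torch.utils.data.TensorDataset(x, y)
    return torch.utils.data.DataLoader(data_set, batch_size=bsz, shuffle=shuffle)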
import _data
import cat
import catboost
from sklearn.metrics import roc_auc_score

train_data, test_data = _data.data()

# Keep the 18 highest-scoring features from the CatBoost importance ranking.
feature_score = cat.feature_score()
new_cols = feature_score.iloc[0:18, :]["Feature"]

# Columns with fewer than 5 unique values are treated as categorical.
cat_feature_inds = []
for i, c in enumerate(train_data[new_cols].columns.values):
    num_uniques = len(train_data[new_cols][c].unique())
    if num_uniques < 5:
        cat_feature_inds.append(i)

cat_model = catboost.CatBoostClassifier(iterations=400,
                                        learning_rate=0.03,
                                        depth=6,
                                        l2_leaf_reg=1,
                                        eval_metric='F1',
                                        random_seed=4 * 100 + 6)
cat_model.fit(train_data[new_cols], train_data.diabetes,
              cat_features=cat_feature_inds)
print("The test auc is %.4f" % roc_auc_score(
    test_data.diabetes, cat_model.predict_proba(test_data[new_cols])[:, 1]))
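# Hedged illustration of the "< 5 unique values => categorical" rule used
# above, on a toy frame. The column names here are made up; they are not the
# actual columns of this dataset.
import pandas as pd

toy = pd.DataFrame({
    "gender": [0, 1, 1, 0, 1, 0],                    # 2 uniques -> flagged
    "age":    [34, 51, 29, 62, 45, 38],              # 6 uniques -> kept
    "bmi":    [22.1, 30.4, 27.8, 25.0, 31.2, 24.3],  # 6 uniques -> kept
})
cat_inds = [i for i, c in enumerate(toy.columns)
            if toy[c].nunique() < 5]
print(cat_inds)  # [0] -> only "gender" is treated as categorical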
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable

import _data
# `Net`, `train_loader`, and `test_loader` are defined elsewhere in this repo.


def _affine(x, layer):
    # Affine part of a linear layer: x @ W.T + b (no activation applied).
    return np.matmul(x, layer.weight.detach().cpu().numpy().T) + \
        layer.bias.detach().cpu().numpy()


def features():
    train_data, test_data = _data.data()

    # columns used to train; remove the label
    cols = train_data.columns.values.tolist()
    cols.remove("diabetes")

    # Hyper Parameters
    num_epochs = 10
    batch_size = 16
    learning_rate = 1e-3
    USE_CUDA = True

    net = Net(input_size=39, hs1=748, hs2=256, hs3=64, hs4=16, num_classes=2)
    if USE_CUDA:
        try:
            net = net.cuda()
        except Exception as e:
            print(e)
            USE_CUDA = False

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # Train the Model
    for epoch in range(num_epochs):
        total_loss = 0
        for i, (x, labels) in enumerate(train_loader(train_data, batch_size)):
            x = Variable(x).float()
            labels = Variable(labels)
            if USE_CUDA:
                # Inputs must live on the same device as the model.
                x, labels = x.cuda(), labels.cuda()

            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = net(x)
            loss = criterion(outputs, labels)
            # .item() replaces loss.data[0], which fails on PyTorch >= 0.5.
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            if (i + 1) % 100 == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.6f'
                      % (epoch + 1, num_epochs, i + 1,
                         len(train_data) // batch_size, total_loss / (i + 1)))

    # Test the Model
    correct = 0
    total = 0
    net.eval()
    for x, labels in test_loader(test_data, len(test_data)):
        x = Variable(x).float()
        if USE_CUDA:
            x, labels = x.cuda(), labels.cuda()
        outputs = net(x)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the %d test set: %d %%'
          % (len(test_data), 100 * correct / total))

    # Feature maps: propagate the data through the affine part of each
    # hidden layer. Note that activations are not applied here; only the
    # linear transform of each layer is reproduced.
    train_mat = train_data[cols].to_numpy()
    test_mat = test_data[cols].to_numpy()
    fc1 = _affine(train_mat, net.fc1)
    fc2 = _affine(fc1, net.fc2)
    fc3 = _affine(fc2, net.fc3)  # 64 new features
    fc4 = _affine(fc3, net.fc4)
    fc1_test = _affine(test_mat, net.fc1)
    fc2_test = _affine(fc1_test, net.fc2)
    fc3_test = _affine(fc2_test, net.fc3)
    fc4_test = _affine(fc3_test, net.fc4)
    return fc1, fc1_test, fc2, fc2_test, fc3, fc3_test, fc4, fc4_test
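# The `Net` module is not defined in this file. A minimal sketch of what the
# constructor call above implies -- four hidden linear layers fc1..fc4 feeding
# a 2-class output -- is given below. The ReLU activations and the final
# `out` layer name are assumptions, not confirmed by this repo.
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self, input_size, hs1, hs2, hs3, hs4, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hs1)
        self.fc2 = nn.Linear(hs1, hs2)
        self.fc3 = nn.Linear(hs2, hs3)
        self.fc4 = nn.Linear(hs3, hs4)
        self.out = nn.Linear(hs4, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        return self.out(x)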
import time

import catboost
import numpy as np
import pandas as pd
from catboost import Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

import _data


def feature_score():
    train_data, test_data = _data.data()

    # =================== train ============================ #
    # columns used to train; remove the label
    cols = train_data.columns.values.tolist()
    cols.remove("diabetes")

    # Columns with fewer than 5 unique values are treated as categorical.
    cat_feature_inds = []
    for i, c in enumerate(train_data[cols].columns.values):
        num_uniques = len(train_data[cols][c].unique())
        if num_uniques < 5:
            cat_feature_inds.append(i)

    print("CV 5-fold train begin...")
    t0 = time.time()
    kf = KFold(n_splits=5, shuffle=True, random_state=2018)
    scores = []
    for i, (train_idx, val_idx) in enumerate(kf.split(train_data)):
        print("The {0} round train...".format(i + 1))
        cat_model = catboost.CatBoostClassifier(
            iterations=400,
            learning_rate=0.03,
            depth=6,
            l2_leaf_reg=1,
            eval_metric='F1',
            random_seed=i * 100 + 6,
            logging_level="Silent")
        train_feat1 = train_data[cols].iloc[train_idx, :]
        train_feat2 = train_data[cols].iloc[val_idx, :]
        train_target1 = train_data.diabetes.iloc[train_idx]
        train_target2 = train_data.diabetes.iloc[val_idx]
        cat_model.fit(train_feat1, train_target1,
                      cat_features=cat_feature_inds)
        print('Train auc', roc_auc_score(
            train_target1, cat_model.predict_proba(train_feat1)[:, 1]))
        # The fold's holdout is a validation set, not the final test set.
        print('Validation auc', roc_auc_score(
            train_target2, cat_model.predict_proba(train_feat2)[:, 1]))
        scores.append(roc_auc_score(
            train_target2, cat_model.predict_proba(train_feat2)[:, 1]))
    print("The average validation auc is {0}".format(np.mean(scores)))

    # Refit on the full training set and score the held-out test set.
    cat_model = catboost.CatBoostClassifier(
        iterations=400,
        learning_rate=0.03,
        depth=6,
        l2_leaf_reg=1,
        eval_metric='F1',
        random_seed=4 * 100 + 6)
    cat_model.fit(train_data[cols], train_data.diabetes,
                  cat_features=cat_feature_inds)
    print("The test auc is %.4f" % roc_auc_score(
        test_data.diabetes, cat_model.predict_proba(test_data[cols])[:, 1]))

    # feature importances; pass the DataFrame directly to Pool
    # (DataFrame.as_matrix was removed in pandas 1.0).
    feature_score = pd.DataFrame(
        list(zip(train_data[cols].dtypes.index,
                 cat_model.get_feature_importance(
                     Pool(train_data[cols],
                          label=train_data["diabetes"],
                          cat_features=cat_feature_inds)))),
        columns=['Feature', 'Score'])
    feature_score = feature_score.sort_values(
        by='Score', ascending=False, inplace=False,
        kind='quicksort', na_position='last')
    return feature_score
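# Hedged aside: once fitted, a CatBoost model can report its default
# (PredictionValuesChange) importances without rebuilding a Pool, since they
# are derived from the trained trees. `feature_score_simple` is a hypothetical
# helper, not part of this repo.
def feature_score_simple(cat_model, feature_names):
    # cat_model: a fitted catboost.CatBoostClassifier;
    # feature_names: iterable matching the training columns, in order.
    return pd.DataFrame(
        {"Feature": list(feature_names),
         "Score": cat_model.get_feature_importance()}
    ).sort_values(by="Score", ascending=False)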
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable

import _data
import cat
import dnn
# `CNN` and `data_loader` are defined elsewhere in this repo.


def features():
    train_data, test_data = _data.data()
    feature_score = cat.feature_score()
    fc1, fc1_test, _, _, _, _, _, _ = dnn.features()

    # Duplicate the top-18 raw features (2 * 18 = 36 columns) and append the
    # 748 fc1 activations, giving 784 = 28 * 28 values per row, which are
    # then reshaped into single-channel 28x28 "images".
    new_cols = feature_score.iloc[0:18, :]["Feature"]
    trainD = pd.concat([train_data[new_cols], train_data[new_cols]],
                       axis=1).to_numpy()
    trainD = np.concatenate((trainD, fc1), axis=1)
    trainD = trainD.reshape((-1, 1, 28, 28))
    testD = pd.concat([test_data[new_cols], test_data[new_cols]],
                      axis=1).to_numpy()
    testD = np.concatenate((testD, fc1_test), axis=1)
    testD = testD.reshape((-1, 1, 28, 28))

    # ====================== CNN ======================== #
    # Hyper Parameters
    num_epochs = 3
    batch_size = 16
    learning_rate = 0.0001

    cnn = CNN()
    cnn.double()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(cnn.parameters(), lr=learning_rate,
                                momentum=0.9)

    # Train the Model
    for epoch in range(num_epochs):
        total_loss = 0
        for i, (images, labels) in enumerate(data_loader(trainD, batch_size)):
            images = Variable(images.double())
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = cnn(images)
            loss = criterion(outputs, labels)
            # Accumulate a float, not the tensor, so the graph is not retained.
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            if (i + 1) % 100 == 0:
                print('Epoch [%d/%d], Iter [%d/%d] Loss: %.6f'
                      % (epoch + 1, num_epochs, i + 1,
                         len(train_data) // batch_size, total_loss / (i + 1)))

    # Test the Model
    cnn.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
    correct = 0
    total = 0
    for images, labels in data_loader(testD, len(testD)):
        images = Variable(images)
        outputs = cnn(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Test Accuracy of the model on the %d test images: %d %%'
          % (len(testD), 100 * correct / total))

    # conv1 / conv2 feature maps. data_loader shuffles, which would misalign
    # the extracted rows with the original data order, so feed the full
    # tensors directly instead of going through the loader.
    train_x = Variable(torch.from_numpy(trainD).double())
    test_x = Variable(torch.from_numpy(testD).double())

    conv1 = cnn.conv1(train_x).detach().numpy()
    conv1 = conv1.reshape((-1, 10, 24 * 24))
    conv1 = np.mean(conv1, axis=1)
    conv1_test = cnn.conv1(test_x).detach().numpy()
    conv1_test = conv1_test.reshape((-1, 10, 24 * 24))
    conv1_test = np.mean(conv1_test, axis=1)

    conv2 = cnn.conv2(cnn.conv1(train_x)).detach().numpy()
    conv2_test = cnn.conv2(cnn.conv1(test_x)).detach().numpy()
    # calculate mean of all maps
    conv2 = np.mean(conv2, axis=2).reshape(len(trainD), -1)
    conv2_test = np.mean(conv2_test, axis=2).reshape(len(testD), -1)
    return conv1, conv1_test, conv2, conv2_test
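# The `CNN` module is not defined in this file. From the shapes used above --
# conv1 maps a 1x28x28 input to 10 channels of 24x24 (so a 5x5 kernel, no
# padding) and conv2 is applied directly to conv1's output -- a plausible
# sketch follows. The 20 conv2 channels, the pooling, and the classifier head
# are assumptions; the eval-mode comment above also hints at BatchNorm layers
# that are omitted here.
import torch.nn.functional as F


class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)   # 1x28x28 -> 10x24x24
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)  # 10x24x24 -> 20x20x20
        self.fc = nn.Linear(20 * 10 * 10, 2)           # after 2x2 max-pooling

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)     # 20x20x20 -> 20x10x10
        x = x.view(x.size(0), -1)
        return self.fc(x)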