Example #1
0
def data_loader(dat, bsz):
    """Wrap the feature matrix `dat` and its diabetes labels in a DataLoader.

    Labels are taken from the shared `_data.data()` split; rows are paired by
    a size heuristic: matrices longer than 10000 rows get the training labels,
    shorter ones get the test labels.
    NOTE(review): the 10000-row threshold is fragile — confirm it still
    separates the two splits.

    Args:
        dat: numpy array of features (one row per sample).
        bsz: batch size.

    Returns:
        torch.utils.data.DataLoader yielding shuffled (x, y) batches.
    """
    train_data, test_data = _data.data()
    x = torch.from_numpy(dat).double()
    # `DataFrame.as_matrix` was removed in pandas 0.25 — use `.values`.
    if len(dat) > 10000:
        labels = train_data[["diabetes"]].values.reshape(-1).tolist()
    else:
        labels = test_data[["diabetes"]].values.reshape(-1).tolist()
    y = torch.LongTensor(labels)
    data_set = torch.utils.data.TensorDataset(x, y)

    return torch.utils.data.DataLoader(data_set, batch_size=bsz, shuffle=True)
Example #2
0
import _data
import cat
from sklearn.metrics import roc_auc_score
import catboost

# Load the shared train/test split and the CatBoost feature ranking,
# then fit a classifier on the top-18 ranked features.
train_data, test_data = _data.data()
feature_score = cat.feature_score()
new_cols = feature_score.iloc[0:18, :]["Feature"]

# Columns with fewer than 5 distinct values are treated as categorical.
cat_feature_inds = [
    idx
    for idx, col in enumerate(train_data[new_cols].columns.values)
    if len(train_data[new_cols][col].unique()) < 5
]

cat_model = catboost.CatBoostClassifier(iterations=400,
                                        learning_rate=0.03,
                                        depth=6,
                                        l2_leaf_reg=1,
                                        eval_metric='F1',
                                        random_seed=4 * 100 + 6)

cat_model.fit(train_data[new_cols],
              train_data.diabetes,
              cat_features=cat_feature_inds)

# Report held-out AUC using the positive-class probabilities.
test_proba = cat_model.predict_proba(test_data[new_cols])[:, 1]
print("The test auc is %.4f" % roc_auc_score(test_data.diabetes, test_proba))
Example #3
0
def features():
    """Train the feed-forward `Net` on the diabetes data and return the
    affine activations of each fully connected layer as new feature matrices.

    Returns:
        (fc1, fc1_test, fc2, fc2_test, fc3, fc3_test, fc4, fc4_test) —
        train/test numpy matrices, one pair per linear layer.

    NOTE(review): the extracted "feature maps" apply only each layer's affine
    part (W·x + b) with no activation between layers — confirm this matches
    the intended feature definition.
    """
    train_data, test_data = _data.data()
    cols = train_data.columns.values.tolist()
    cols.remove("diabetes")  # the label is not a feature
    # Hyper Parameters
    num_epochs = 10
    batch_size = 16
    learning_rate = 1e-3
    USE_CUDA = True
    net = Net(input_size=39, hs1=748, hs2=256, hs3=64, hs4=16, num_classes=2)

    if USE_CUDA:
        try:
            net = net.cuda()
        except Exception as e:
            print(e)
            USE_CUDA = False
    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # Train the Model
    for epoch in range(num_epochs):
        total_loss = 0
        for i, (x, labels) in enumerate(train_loader(train_data, batch_size)):
            x = Variable(x).float()
            labels = Variable(labels)
            if USE_CUDA:
                # Inputs must live on the same device as the model,
                # otherwise the forward pass raises a device mismatch.
                x, labels = x.cuda(), labels.cuda()

            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = net(x)
            loss = criterion(outputs, labels)
            # `.item()` replaces the removed `loss.data[0]` indexing and
            # accumulates a plain float instead of an autograd tensor.
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            if (i + 1) % 100 == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.6f' %
                      (epoch + 1, num_epochs, i + 1,
                       len(train_data) // batch_size, total_loss / (i + 1)))

    # Test the Model
    correct = 0
    total = 0
    net.eval()
    for x, labels in test_loader(test_data, len(test_data)):
        x = Variable(x).float()
        if USE_CUDA:
            x = x.cuda()
        outputs = net(x)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        # Compare on CPU so GPU predictions match the CPU labels.
        correct += (predicted.cpu() == labels).sum()

    print('Accuracy of the network on the %d test set: %d %%' %
          (len(test_data), 100 * correct / total))

    # Feature maps are computed on CPU — `.numpy()` requires CPU tensors.
    net = net.cpu()

    def _affine(inp, layer):
        # One manual linear layer: inp @ W.T + b, using the layer's public
        # `weight`/`bias` attributes instead of the private `_parameters` dict.
        w = layer.weight.detach().numpy()
        b = layer.bias.detach().numpy()
        return np.matmul(inp, w.T) + b

    # `DataFrame.as_matrix` was removed in pandas 0.25; `.values` replaces it.
    fc1 = _affine(train_data[cols].values, net.fc1)
    fc2 = _affine(fc1, net.fc2)
    # 64 new features
    fc3 = _affine(fc2, net.fc3)
    fc4 = _affine(fc3, net.fc4)

    fc1_test = _affine(test_data[cols].values, net.fc1)
    fc2_test = _affine(fc1_test, net.fc2)
    fc3_test = _affine(fc2_test, net.fc3)
    fc4_test = _affine(fc3_test, net.fc4)

    return fc1, fc1_test, fc2, fc2_test, fc3, fc3_test, fc4, fc4_test
Example #4
0
def feature_score():
    """Rank the training features with CatBoost.

    Runs 5-fold CV on the training split (reporting per-fold AUC), refits on
    the full training set, prints the held-out test AUC, and returns the
    feature importances.

    Returns:
        pandas.DataFrame with columns ['Feature', 'Score'], sorted by Score
        descending.
    """
    train_data, test_data = _data.data()

    # =================== train ============================ #
    # columns used to train
    cols = train_data.columns.values.tolist()
    # remove label
    cols.remove("diabetes")

    # Columns with fewer than 5 distinct values are treated as categorical.
    cat_feature_inds = []
    for i, c in enumerate(train_data[cols].columns.values):
        num_uniques = len(train_data[cols][c].unique())
        if num_uniques < 5:
            cat_feature_inds.append(i)

    print("CV 5-fold train begin...")
    kf = KFold(n_splits=5, shuffle=True, random_state=2018)
    scores = []
    for i, (train_idx, val_idx) in enumerate(kf.split(train_data)):
        print("The {0} round train...".format(i + 1))
        cat_model = catboost.CatBoostClassifier(
            iterations=400,
            learning_rate=0.03,
            depth=6,
            l2_leaf_reg=1,
            eval_metric='F1',
            random_seed=i * 100 + 6,  # distinct seed per fold
            logging_level="Silent"
        )
        train_feat1 = train_data[cols].iloc[train_idx, :]
        train_feat2 = train_data[cols].iloc[val_idx, :]
        train_target1 = train_data.diabetes.iloc[train_idx]
        train_target2 = train_data.diabetes.iloc[val_idx]
        cat_model.fit(train_feat1, train_target1, cat_features=cat_feature_inds)
        # Score the validation fold once instead of running predict_proba twice.
        val_auc = roc_auc_score(train_target2,
                                cat_model.predict_proba(train_feat2)[:, 1])
        print('Train auc', roc_auc_score(
            train_target1, cat_model.predict_proba(train_feat1)[:, 1]))
        print('Test auc', val_auc)
        scores.append(val_auc)

    print("The average test auc is {0}".format(np.mean(scores)))

    # Refit on the full training set for the final model and importances.
    cat_model = catboost.CatBoostClassifier(
            iterations=400,
            learning_rate=0.03,
            depth=6,
            l2_leaf_reg=1,
            eval_metric='F1',
            random_seed=4 * 100 + 6)

    cat_model.fit(train_data[cols], train_data.diabetes,
                  cat_features=cat_feature_inds)

    print("The test auc is %.4f" % roc_auc_score(
        test_data.diabetes, cat_model.predict_proba(test_data[cols])[:, 1]))

    # feature importances
    # `DataFrame.as_matrix` was removed in pandas 0.25; `.values` replaces it.
    feature_score = pd.DataFrame(
        list(zip(train_data[cols].dtypes.index,
                 cat_model.get_feature_importance(Pool(train_data[cols].values,
                                                       label=train_data["diabetes"],
                                                       cat_features=cat_feature_inds)))),
        columns=['Feature', 'Score'])
    feature_score = feature_score.sort_values(
        by='Score', ascending=False, inplace=False, kind='quicksort',
        na_position='last')

    return feature_score
Example #5
0
def features():
    """Train the `CNN` on CatBoost-selected + DNN-derived features reshaped
    into 1x28x28 "images" and return conv-layer feature maps.

    Returns:
        (conv1, conv1_test, conv2, conv2_test) — per-sample mean activation
        maps of the first and second conv layers for train and test sets.
    """
    train_data, test_data = _data.data()
    feature_score = cat.feature_score()
    fc1, fc1_test, _, _, _, _, _, _ = dnn.features()
    new_cols = feature_score.iloc[0:18, :]["Feature"]
    # 18 + 18 duplicated CatBoost columns + 748 DNN features = 784 = 28 * 28.
    # `DataFrame.as_matrix` was removed in pandas 0.25; `.values` replaces it.
    trainD = pd.concat([train_data[new_cols], train_data[new_cols]],
                       axis=1).values
    trainD = np.concatenate((trainD, fc1), axis=1)
    trainD = trainD.reshape((-1, 1, 28, 28))

    testD = pd.concat([test_data[new_cols], test_data[new_cols]],
                      axis=1).values
    testD = np.concatenate((testD, fc1_test), axis=1)
    testD = testD.reshape((-1, 1, 28, 28))
    # ====================== CNN ======================== #
    # Hyper Parameters
    num_epochs = 3
    batch_size = 16
    learning_rate = 0.0001

    cnn = CNN()
    cnn.double()
    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(cnn.parameters(),
                                lr=learning_rate,
                                momentum=0.9)
    # Train the Model
    for epoch in range(num_epochs):
        total_loss = 0
        for i, (images, labels) in enumerate(data_loader(trainD, batch_size)):
            images = Variable(images.double())
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = cnn(images)
            loss = criterion(outputs, labels)
            # Accumulate a float, not the loss tensor: keeping the tensor
            # would retain the autograd graph of every step.
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            if (i + 1) % 100 == 0:
                print('Epoch [%d/%d], Iter [%d/%d] Loss: %.6f' %
                      (epoch + 1, num_epochs, i + 1,
                       len(train_data) // batch_size, total_loss / (i + 1)))

    # Test the Model
    cnn.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
    correct = 0
    total = 0

    for images, labels in data_loader(testD, len(testD)):
        images = Variable(images)
        outputs = cnn(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Test Accuracy of the model on the %d test images: %d %%' %
          (len(testD), 100 * correct / total))

    # Feed the full matrices directly instead of going through data_loader:
    # the loader shuffles, which would misalign the extracted feature rows
    # with the original sample order.
    train_x = Variable(torch.from_numpy(trainD).double())
    test_x = Variable(torch.from_numpy(testD).double())

    # conv1 feature map: mean over the 10 output channels per sample.
    conv1 = cnn.conv1.forward(train_x).detach().numpy()
    conv1 = conv1.reshape((-1, 10, 24 * 24))
    conv1 = np.mean(conv1, axis=1)

    conv1_test = cnn.conv1.forward(test_x).detach().numpy()
    conv1_test = conv1_test.reshape((-1, 10, 24 * 24))
    conv1_test = np.mean(conv1_test, axis=1)

    # conv2 feature map
    conv2 = cnn.conv2.forward(cnn.conv1.forward(train_x)).detach().numpy()
    conv2_test = cnn.conv2.forward(cnn.conv1.forward(test_x)).detach().numpy()

    # calculate mean of all maps
    conv2 = np.mean(conv2, axis=2).reshape(len(trainD), -1)
    conv2_test = np.mean(conv2_test, axis=2).reshape(len(testD), -1)

    return conv1, conv1_test, conv2, conv2_test