shuffle=False)

    deep_model_params = {
        'deep_columns_idx': deep_columns_idx,
        'embedding_columns_dict': embedding_columns_dict,
        'hidden_size_list': args.hidden_size_list,
        'dropouts': args.dropouts,
        'deep_output_dim': args.deep_out_dim
    }
    wide_model_params = {
        'wide_input_dim': data_wide.shape[1],
        'wide_output_dim': args.wide_out_dim
    }
    widedeep = WideDeep(wide_model_params, deep_model_params)
    model = widedeep.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    for epoch in range(args.epochs):
        model.train()
        for idx, (data_wide, data_deep, target) in enumerate(trainloader):
            data_wide, data_deep, target = data_wide.to(device), data_deep.to(
                device), target.to(device)
            x = (data_wide, data_deep)
            optimizer.zero_grad()
            out = model(x)
            loss = F.binary_cross_entropy(out, target.float())
            print('epoch:{}, step:{}, loss:{:.10f}'.format(epoch, idx, loss.item()))
            loss.backward()
            optimizer.step()
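For context, here is a minimal sketch of how the trainloader consumed above might be built. The names data_wide, data_deep, target and args.batch_size are assumptions carried over from the snippet, not part of the original example:

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

# hypothetical loader setup: wide inputs as floats, deep inputs as embedding
# indices, and binary targets
train_dataset = TensorDataset(
    torch.from_numpy(data_wide.astype(np.float32)),
    torch.from_numpy(data_deep.astype(np.int64)),
    torch.from_numpy(target.astype(np.int64)))
trainloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)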
Example #2
import datetime

import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# RATING_FILE_PATH_TRAIN, WideDeep, SparseFeat, DenseFeat, SeqSparseFeat,
# generate_train_test_dataset, generate_feature and auc are assumed to be
# provided by the surrounding project.


def main():
    data = pd.read_csv(RATING_FILE_PATH_TRAIN)
    batch_size = 128
    max_seq_len = 50
    sparse_features = ['user_id', 'movie_id', 'gender', 'occupation', 'zip']
    dense_features = ['age']
    print(data.head(10))

    feature_max_id = {}
    for feature in sparse_features:
        lbe = LabelEncoder()
        # shift label-encoded ids by 1 so that index 0 stays free for padding
        data[feature] = lbe.fit_transform(data[feature]) + 1
        # vocabulary size for the embedding table, including the padding slot
        feature_max_id[feature] = data[feature].max() + 1

    # MinMaxScaler with feature_range=(0, 1) rescales each dense column
    # linearly onto [0, 1]: x' = (x - x_min) / (x_max - x_min)
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # define features
    user_sparse_features = ["user_id", "gender", "occupation", "zip"]
    user_dense_features = ["age"]

    item_sparse_features = ["movie_id"]

    user_profile = data[user_sparse_features +
                        user_dense_features].drop_duplicates('user_id')
    item_profile = data[item_sparse_features].drop_duplicates('movie_id')
    user_profile.set_index("user_id", drop=False, inplace=True)

    print("Generate train and test dataset...")
    train_set, test_set = generate_train_test_dataset(data)

    print("Generate train and test features...")
    train_dataloader = generate_feature(train_set, user_profile, item_profile,
                                        batch_size, max_seq_len)
    test_dataloader = generate_feature(test_set, user_profile, item_profile,
                                       batch_size, max_seq_len)

    print("Generate feature columns...")
    embedding_dim = 8
    user_feature_columns = [SparseFeat(feat, feature_max_id[feat], embedding_dim) for feat in user_sparse_features] \
        + [DenseFeat(feat, 1) for feat in user_dense_features] \
        + [SeqSparseFeat(SparseFeat('user_hist', feature_max_id['movie_id'], embedding_dim, embedding_name='movie_id'),
                         maxlen=max_seq_len, combiner='mean', length_name=None)]

    item_feature_columns = [
        SparseFeat(feat, feature_max_id[feat], embedding_dim)
        for feat in item_sparse_features
    ]

    # define model
    model = WideDeep(feature_columns=user_feature_columns +
                     item_feature_columns)

    loss_func = nn.BCELoss()
    optimizer = torch.optim.Adagrad(params=model.parameters(), lr=0.01)
    metric_func = auc
    metric_name = 'auc'
    epochs = 3
    log_step_freq = 1000

    print('start training.........')
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('========' * 8 + '%s' % nowtime)

    for epoch in range(1, epochs + 1):
        model.train()
        loss_sum = 0.0
        metric_sum = 0.0
        step = 1

        for step, (features, labels) in enumerate(train_dataloader, 1):
            optimizer.zero_grad()

            predictions = model(features)
            loss = loss_func(predictions, labels)
            metric = metric_func(predictions, labels)

            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            metric_sum += metric.item()
            if step % log_step_freq == 0:
                print(("[step=%d] loss: %.3f, " + metric_name + ": %.3f") %
                      (step, loss_sum / step, metric_sum / step))

        model.eval()
        val_loss_sum = 0.0
        val_metric_sum = 0.0
        val_step = 1

        for val_step, (features, labels) in enumerate(test_dataloader, 1):
            with torch.no_grad():
                predictions = model(features)
                val_loss = loss_func(predictions, labels)
                val_metric = metric_func(predictions, labels)
            val_loss_sum += val_loss.item()
            val_metric_sum += val_metric.item()

        info = (epoch, val_loss_sum / val_step, val_metric_sum / val_step)
        print("\nEPOCH=%d, val_loss=%.3f, val_auc=%.3f" % info)
        nowtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print('\n' + '==========' * 8 + '%s' % nowtime)
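The auc passed as metric_func above is referenced but not defined in this snippet. Since the loop calls metric.item(), it presumably returns a scalar tensor; the sketch below wraps sklearn's roc_auc_score under that assumption and is not necessarily the project's own implementation:

import torch
from sklearn.metrics import roc_auc_score

def auc(predictions, labels):
    # detach, move to CPU and flatten before handing off to sklearn
    y_score = predictions.detach().cpu().numpy().ravel()
    y_true = labels.detach().cpu().numpy().ravel()
    # return a scalar tensor so the caller's metric.item() works
    return torch.tensor(roc_auc_score(y_true, y_score))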