Example #1
def train(train_data,
          val_data,
          user_list_train_filtered,
          user_list_val_filtered,
          user_beta_train,
          user_beta_val,
          k,
          dataset,
          eta=0.1,
          lamb=0.1,
          tolerance=1e-4,
          num_iter_val=5,
          num_total_iter_training=6,
          random_seed=786,
          kU=None,
          cv_flag=True,
          verbose=False):

    np.random.seed(random_seed)

    user_feat = val_data.drop(['user', 'label'], axis=1).values
    user_feat_train = train_data.drop(['user', 'label'], axis=1).values
    w = np.random.normal(0, 1, user_feat.shape[1])

    metrics = Metrics()
    metrics.eta_lr = eta
    metrics.lamb_reg = lamb
    print("running for eta", eta, "and lambda", lamb)

    for i in range(num_total_iter_training):
        grad, loss = subgradient(w, train_data, user_list_train_filtered,
                                 user_beta_train, k)
        grad += lamb * w
        w = w - (eta / np.sqrt(i + 1)) * grad
        metrics.w_list.append(w)
        metrics.loss_opt_list_train.append(loss)

        y_scores = user_feat_train.dot(w)
        data_true = deepcopy(train_data)
        data_true['scores'] = y_scores
        data_true = data_true.sort_values(by='scores', ascending=False)
        data_true = data_true.reset_index(drop=True)
        metrics.micro_auc_rel_k_list_train.append(
            compute_micro(data_true, user_list_train_filtered, user_beta_train,
                          w, k))

        if verbose:
            print('Epoch', i + 1, 'completed out of', num_total_iter_training,
                  'for prec@k loss train:', metrics.loss_opt_list_train[-1])
            print('Epoch', i + 1, 'completed out of', num_total_iter_training,
                  'for prec@k grad train:', np.linalg.norm(grad))

        # evaluate on the validation set every num_iter_val iterations
        if cv_flag:
            if i % num_iter_val == 0:
                y_scores = user_feat.dot(w)
                data_true = deepcopy(val_data)
                data_true['scores'] = y_scores
                data_true = data_true.sort_values(by='scores', ascending=False)
                data_true = data_true.reset_index(drop=True)
                metrics.micro_auc_rel_k_list_val.append(
                    compute_micro(data_true, user_list_val_filtered,
                                  user_beta_val, w, k))

                if verbose:
                    print("\n")
                    print('Epoch', i + 1, 'completed out of',
                          num_total_iter_training, 'for prec@k loss val:',
                          metrics.micro_auc_rel_k_list_val[-1])
                    print("\n")

    return metrics, None
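
This example assumes `numpy` (as `np`), `copy.deepcopy`, and the repository's own `Metrics`, `subgradient`, and `compute_micro` helpers are already in scope. The optimizer itself is a regularized subgradient method with a 1/sqrt(t) step-size decay; below is a minimal, self-contained sketch of that update rule on a stand-in least-squares objective (the data and loss are illustrative, not the prec@k surrogate used above).

import numpy as np

# Stand-in data and loss; only the update rule mirrors the example above.
rng = np.random.default_rng(786)
X, y = rng.normal(size=(50, 3)), rng.normal(size=50)
w, eta, lamb = rng.normal(size=3), 0.1, 0.01

for t in range(20):
    grad = X.T.dot(X.dot(w) - y) / len(y)     # (sub)gradient of the stand-in loss
    grad += lamb * w                          # L2 regularization, as in train()
    w = w - (eta / np.sqrt(t + 1)) * grad     # decaying step size eta / sqrt(t + 1)
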
Example #2
def train(train_data,
          val_data,
          user_list_train_filtered,
          user_list_val_filtered,
          user_beta_train,
          user_beta_val,
          k,
          eta=0.1,
          lamb=0.1,
          num_iter_val=5,
          num_total_iter_training=6,
          n_classifiers=5,
          random_seed=786,
          verbose=False):

    np.random.seed(random_seed)

    user_list_val_filtered = user_list_train_filtered[
        0:int(0.2 * len(user_list_train_filtered))]
    user_list_train_filtered = list(
        set(user_list_train_filtered) - set(user_list_val_filtered))
    val_data = train_data[train_data['user'].isin(user_list_val_filtered)]
    train_data = train_data[train_data['user'].isin(user_list_train_filtered)]

    metrics = Metrics()
    metrics.eta_lr = eta
    metrics.lamb_reg = lamb

    classifier_list = []

    kf = KFold(n_splits=n_classifiers, shuffle=True)
    features = train_data.drop(['user', 'label'], axis=1)
    labels = train_data['label']
    for _, split_indices in kf.split(features):
        split_features = features.iloc[split_indices].values
        split_labels = labels.iloc[split_indices].values
        num_examples = split_features.shape[0]

        w = np.random.normal(0, 1, (split_features.shape[1], ))
        w = w / np.linalg.norm(w)
        for num_iter in np.arange(num_total_iter_training):
            scores = sigmoid(np.dot(split_features, w))
            loss = -1 / num_examples * np.sum(split_labels * np.log(scores) +
                                              (1 - split_labels) *
                                              np.log(1 - scores))
            print("loss is ", loss)
            dLdwx = (scores - split_labels) * scores * (1 - scores)
            # sum the per-example contributions feature-wise (axis=0) so that
            # grad has the same shape as w
            grad = 1 / num_examples * np.sum(
                dLdwx.reshape(-1, 1) * split_features, axis=0)
            grad += lamb * w
            print("grad is ", np.linalg.norm(grad))
            print("\n")
            w = w - (eta / np.sqrt(num_iter + 1)) * grad
        accuracy = np.sum(split_labels * (scores > 0.5) + (1 - split_labels) *
                          (scores < 0.5))
        print('accuracy: {}'.format(accuracy / num_examples))
        classifier_list.append(w)
    print('eta is ', eta, 'and lambda is ', lamb)
    print('\n')

    classifiers_with_metrics = []
    for w in classifier_list:
        user_feat = val_data.drop(['user', 'label'], axis=1).values
        y_scores = user_feat.dot(w)
        data_true = deepcopy(val_data)
        data_true['scores'] = y_scores
        data_true = data_true.sort_values(by='scores', ascending=False)
        data_true = data_true.reset_index(drop=True)
        metric = compute_micro(data_true, user_list_val_filtered,
                               user_beta_train, w, k)
        classifiers_with_metrics.append((metric, w))
    classifiers_with_metrics.sort(reverse=True, key=lambda x: x[0])
    combined_w = classifiers_with_metrics[0][1]
    for _, w in classifiers_with_metrics[1:]:
        combined_w = merge_micro(val_data, combined_w, w,
                                 user_list_val_filtered, user_beta_train, k)

    # create dummy metrics
    # need weights and one validation loss for the "best iter" logic
    metrics = Metrics()
    metrics.w_list.append(combined_w)
    metrics.micro_auc_rel_k_list_val.append(0)
    metrics.micro_auc_rel_k_list_train.append(0)
    metrics.loss_opt_list_train.append(0)
    return metrics, None
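
This variant additionally assumes scikit-learn's `KFold` and a `sigmoid` helper that are not shown in the snippet. A minimal, numerically stable `sigmoid` consistent with how it is called above (the implementation here is an assumption, not code from the repository):

import numpy as np

def sigmoid(z):
    """Elementwise logistic function, assumed by the example above."""
    z = np.asarray(z, dtype=float)
    out = np.empty_like(z)
    pos = z >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
    exp_z = np.exp(z[~pos])
    out[~pos] = exp_z / (1.0 + exp_z)
    return out
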
Example #3
def train(
    train_data,
    val_data,
    user_list_train_filtered,
    user_list_val_filtered,
    surr,
    eta=0.1,
    momentum=0.0,
    lamb=0.1,
    num_iter_val=5,
    tolerance=1e-6,
    num_total_iter_training=151,
    draw=False,
    verbose=True,
    random_seed=786,
    w=None):
    
    np.random.seed(random_seed)
    metrics = utils.Metrics()

    metrics.eta_lr = eta
    metrics.lamb_reg = lamb

    if w is None:
        w = np.random.normal(0, 1, (train_data.shape[1] - 2, ))
    
    prev_grad_w = np.zeros((train_data.shape[1] - 2, ))

    for num_iter in np.arange(num_total_iter_training):

        tic = time.time()

        metrics.w_list.append(w)

        loss_opt = 0
        k_minus_w_opt = 0
        grad_w = np.zeros((train_data.shape[1] - 2, ))
        
        sorting_time = 0
        surrogate_time = 0
        for user_id in user_list_train_filtered:

            user_df_train = train_data[train_data['user'] == user_id]

            if(len(user_df_train.label.unique()) == 1):
                if(user_df_train.label.iloc[0] == 1.0):
                    loss_opt += 0.0
                else:
                    loss_opt += 1.0
            else:

                sorting_start_time = time.time()
                beta = int(user_beta_train[user_id])

                user_df_pos = user_df_train[user_df_train['label'] == 1]
                user_df_neg = user_df_train[user_df_train['label'] == 0]

                user_feat_pos = user_df_pos.drop(['user', 'label'], axis = 1).values
                user_feat_neg = user_df_neg.drop(['user', 'label'], axis = 1).values

                indices_pos, scores_pos = sort_order(user_feat_pos, w)

                indices_neg, scores_neg = sort_order(user_feat_neg, w)

                sorted_user_feat_pos = user_feat_pos[indices_pos, :] 
                sorted_user_feat_neg = user_feat_neg[indices_neg, :] 
                sorted_scores_pos = scores_pos[indices_pos]
                sorted_scores_neg = scores_neg[indices_neg]
    
                sorting_time += time.time() - sorting_start_time

                surrogate_start_time = time.time()
                pi_opt, score_mat = surr.compute_pi(
                    sorted_scores_pos, sorted_scores_neg,
                    w, k, beta)
                
                loss_opt_user, _, _, _ = surr.loss(
                    pi_opt, sorted_scores_pos, sorted_scores_neg, k, beta)
                
                if draw and user_id == 0:
                    plt.subplot(1,2,1)
                    plt.imshow(score_mat)
                    plt.subplot(1,2,2)
                    plt.imshow(pi_opt)
                    plt.show()

                grad_w_user = surr.gradient(
                    sorted_user_feat_pos, sorted_user_feat_neg, pi_opt, k, beta)
                
                surrogate_time += time.time() - surrogate_start_time

                grad_w_user += lamb*w
                loss_opt += loss_opt_user

                grad_w += grad_w_user

        grad_w = grad_w/len(user_list_train_filtered)

        metrics_start_time = time.time()
                
        # sort data once for both micro
        user_feat = train_data.drop(['user', 'label'], axis = 1).values
        y_scores = user_feat.dot(w)
        data_true = deepcopy(train_data)
        data_true['scores'] = y_scores
        data_true = data_true.sort_values(by='scores', ascending=False)
        data_true = data_true.reset_index(drop=True)

        metrics.grad_w_list.append(np.linalg.norm(grad_w))
        metrics.loss_opt_list_train.append(loss_opt/len(user_list_train_filtered))
        metrics.micro_auc_rel_k_list_train.append(utils.compute_micro(data_true, user_list_train_filtered, user_beta_train, w, k))
        
        if verbose:
            print('    sorting elapsed time:   ', sorting_time)
            print('    surrogate elapsed time: ', surrogate_time)
            print('    metrics elapsed time:   ', time.time() - metrics_start_time)
            print('Epoch', num_iter+1, 'completed out of',num_total_iter_training, 'for', surr.name, 'loss train:',metrics.loss_opt_list_train[-1])
            print('Epoch', num_iter+1, 'completed out of',num_total_iter_training, 'for', surr.name, 'grad_w:',metrics.grad_w_list[-1])
            print('Epoch', num_iter+1, 'completed out of',num_total_iter_training, 'for', surr.name, 'microaucrelk train:',metrics.micro_auc_rel_k_list_train[-1])
        else:
            print('epoch', num_iter+1, 'completed. micro:{}'.format(metrics.micro_auc_rel_k_list_train[-1]))
    
        if(num_iter%num_iter_val == 0):

            loss_opt_val = 0
            k_minus_w_opt_val = 0

            for user_id in user_list_val_filtered:

                user_df_val = val_data[val_data['user'] == user_id]

                if(len(user_df_val.label.unique()) == 1):
                    if(user_df_val.label.iloc[0] == 1.0):
                        loss_opt_val += 0.0
                    else:
                        loss_opt_val += 1.0
                else:
                    beta = int(user_beta_val[user_id])

                    user_df_pos = user_df_val[user_df_val['label'] == 1]
                    user_df_neg = user_df_val[user_df_val['label'] == 0]

                    user_feat_pos = user_df_pos.drop(['user', 'label'], axis = 1).values
                    user_feat_neg = user_df_neg.drop(['user', 'label'], axis = 1).values

                    indices_pos, scores_pos = sort_order(user_feat_pos, w)
                    indices_neg, scores_neg = sort_order(user_feat_neg, w)

                    sorted_user_feat_pos = user_feat_pos[indices_pos, :] 
                    sorted_user_feat_neg = user_feat_neg[indices_neg, :] 

                    sorted_scores_pos = scores_pos[indices_pos]
                    sorted_scores_neg = scores_neg[indices_neg]

                    pi_opt_val, score_mat_val = surr.compute_pi(sorted_scores_pos, sorted_scores_neg, w, k, beta)
                    loss_opt_user_val, _, _, _ = surr.loss(
                        pi_opt_val, sorted_scores_pos, sorted_scores_neg, k, beta)

                    if draw and user_id == 0:
                        plt.subplot(1,2,1)
                        plt.imshow(score_mat_val)
                        plt.subplot(1,2,2)
                        plt.imshow(pi_opt_val)
                        plt.show()

                    loss_opt_val += loss_opt_user_val

            # sort data once for both micro
            user_feat = val_data.drop(['user', 'label'], axis = 1).values
            y_scores = user_feat.dot(w)
            data_true = deepcopy(val_data)
            data_true['scores'] = y_scores
            data_true = data_true.sort_values(by='scores', ascending=False)
            data_true = data_true.reset_index(drop=True)
        
            metrics.loss_opt_list_val.append(loss_opt_val/len(user_list_val_filtered))
            metrics.micro_auc_rel_k_list_val.append(utils.compute_micro(data_true, user_list_val_filtered, user_beta_val, w, k))

            if verbose:
                print('Epoch', num_iter+1, 'completed out of',num_total_iter_training, 'for', surr.name, 'loss val:',metrics.loss_opt_list_val[-1])
                print('Epoch', num_iter+1, 'completed out of',num_total_iter_training, 'for', surr.name, 'microaucrelk val:',metrics.micro_auc_rel_k_list_val[-1])
            else:
                print('    val micro:{}'.format(metrics.micro_auc_rel_k_list_val[-1]))

        prev_grad_w = momentum * prev_grad_w + (1-momentum) * grad_w
        
        w = w - (eta/np.sqrt(num_iter+1))*(prev_grad_w)

        if verbose:
            print('Epoch', num_iter+1, ' time taken is: ', time.time() - tic)
            print("\n")

        # also break if reached tolerance condition
        if num_iter >= 10 and max(metrics.loss_opt_list_train[-10:])-min(metrics.loss_opt_list_train[-10:]) <= tolerance:
            break

    best_iter = (np.where(np.asarray(metrics.loss_opt_list_train)==np.min(metrics.loss_opt_list_train))[0][0]//num_iter_val)*num_iter_val
    best_microaucrelk = metrics.micro_auc_rel_k_list_val[best_iter//num_iter_val]
    print('Best micro aucrelk at iter: %d (metric: %f)' % (best_iter, best_microaucrelk))
    
    return metrics, w
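
The training and validation loops above call a `sort_order` helper that is not included in the snippet, along with module-level `k`, `user_beta_train`, and `user_beta_val`. From the way its return values are used (`scores_pos[indices_pos]`, `user_feat_pos[indices_pos, :]`), `sort_order` plausibly scores each row with `w` and returns the indices that sort those scores from highest to lowest together with the raw scores; a sketch under that assumption:

import numpy as np

def sort_order(features, w):
    """Assumed helper: score rows with w, return (descending sort order, raw scores)."""
    scores = features.dot(w)
    indices = np.argsort(-scores)   # indices that put scores in high-to-low order
    return indices, scores
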
Example #4
def train(
    train_data,
    val_data,
    user_list_train_filtered,
    user_list_val_filtered,
    surr,
    eta=0.1,
    momentum=0.0,
    lamb=0.1,
    num_iter_val=5,
    tolerance=1e-4,
    num_total_iter_training=21,
    draw=False,
    verbose=True,
    random_seed=786,
    w=None):
    
    np.random.seed(random_seed)
    metrics = utils.Metrics()

    metrics.eta_lr = eta
    metrics.lamb_reg = lamb

    if w is None:
        w = np.random.normal(0, 1, (train_data.shape[1] - 2, ))
    
    prev_grad_w = np.zeros((train_data.shape[1] - 2, ))

    for num_iter in np.arange(num_total_iter_training):

        tic = time.time()

        metrics.w_list.append(w)

        loss_opt = 0
        k_minus_w_opt = 0
        grad_w = np.zeros((train_data.shape[1] - 2, ))
        
        sorting_time = 0
        surrogate_time = 0
        
        pi_opt_avg_user_b = 0
        pi_opt_avg_user_n = 0
        
        for user_id in user_list_train_filtered:

            user_df_train = train_data[train_data['user'] == user_id]

            if(len(user_df_train.label.unique()) == 1):
                if(user_df_train.label.iloc[0] == 1.0):
                    loss_opt += 0.0
                else:
                    loss_opt += 1.0
            else:

                sorting_start_time = time.time()
                beta = int(user_beta_train[user_id])

                user_df_pos = user_df_train[user_df_train['label'] == 1]
                user_df_neg = user_df_train[user_df_train['label'] == 0]

                user_feat_pos = user_df_pos.drop(['user', 'label'], axis = 1).values
                user_feat_neg = user_df_neg.drop(['user', 'label'], axis = 1).values

                indices_pos, scores_pos = sort_order(user_feat_pos, w)

                indices_neg, scores_neg = sort_order(user_feat_neg, w)

                sorted_user_feat_pos = user_feat_pos[indices_pos, :] 
                sorted_user_feat_neg = user_feat_neg[indices_neg, :] 
                sorted_scores_pos = scores_pos[indices_pos]
                sorted_scores_neg = scores_neg[indices_neg]
    
                sorting_time += time.time() - sorting_start_time

                surrogate_start_time = time.time()
                pi_opt, score_mat = surr.compute_pi(
                    sorted_scores_pos, sorted_scores_neg,
                    w, k, beta)
                
                pi_opt_avg_user_b += np.sum(pi_opt)/(beta*k)
                pi_opt_avg_user_n += np.sum(pi_opt)/(len(indices_pos)*k)
                
                loss_opt_user, _, _, _ = surr.loss(
                        pi_opt, sorted_scores_pos, sorted_scores_neg, k, beta)
                
                if draw and user_id == 0:
                    plt.subplot(1,2,1)
                    plt.imshow(score_mat)
                    plt.subplot(1,2,2)
                    plt.imshow(pi_opt)
                    plt.show()

                grad_w_user = surr.gradient(
                    sorted_user_feat_pos, sorted_user_feat_neg, pi_opt, k, beta)
                
                surrogate_time += time.time() - surrogate_start_time

                grad_w_user += lamb*w
                loss_opt += loss_opt_user

                grad_w += grad_w_user

        grad_w = grad_w/len(user_list_train_filtered)
        
        pi_opt_avg_user_b = pi_opt_avg_user_b/len(user_list_train_filtered)
        pi_opt_avg_user_n = pi_opt_avg_user_n/len(user_list_train_filtered)

        metrics_start_time = time.time()
                
        # sort data once for both micro
        user_feat = train_data.drop(['user', 'label'], axis = 1).values
        y_scores = user_feat.dot(w)
        data_true = deepcopy(train_data)
        data_true['scores'] = y_scores
        data_true = data_true.sort_values(by='scores', ascending=False)
        data_true = data_true.reset_index(drop=True)

        metrics.grad_w_list.append(np.linalg.norm(grad_w))
        metrics.loss_opt_list_train.append(loss_opt/len(user_list_train_filtered))
        metrics.micro_auc_rel_k_list_train.append(utils.compute_micro(data_true, user_list_train_filtered, user_beta_train, w, k))
        
        if verbose:
            print('k=', k,'Epoch', num_iter+1, 'done out of',num_total_iter_training, 'for', surr.name, 'loss train:',metrics.loss_opt_list_train[-1])
            print('k=', k,'Epoch', num_iter+1, 'done out of',num_total_iter_training, 'for', surr.name, 'grad_w:',metrics.grad_w_list[-1])
            print('k=', k,'Epoch', num_iter+1, 'done out of',num_total_iter_training, 'for', surr.name, 'microaucrelk train:',metrics.micro_auc_rel_k_list_train[-1])
            print('k=', k,'Epoch', num_iter+1, 'done out of',num_total_iter_training, 'for', surr.name, 'pi_opt_avg_user_b:',pi_opt_avg_user_b)
            print('k=', k,'Epoch', num_iter+1, 'done out of',num_total_iter_training, 'for', surr.name, 'pi_opt_avg_user_n:',pi_opt_avg_user_n)
        else:
            print('k=', k,'Epoch', num_iter+1, 'done. micro:{}'.format(metrics.micro_auc_rel_k_list_train[-1]))
    
        prev_grad_w = momentum * prev_grad_w + (1-momentum) * grad_w
        
        w = w - (eta/np.sqrt(num_iter+1))*(prev_grad_w)

        if verbose:
            print('Epoch', num_iter+1, ' time taken is: ', time.time() - tic)
            print("\n")

        # also break if reached tolerance condition
        if num_iter >= 10 and max(metrics.loss_opt_list_train[-10:])-min(metrics.loss_opt_list_train[-10:]) <= tolerance:
            break

    # save output to file
    best_iter = np.where(np.asarray(metrics.loss_opt_list_train)==np.min(metrics.loss_opt_list_train))[0][0]
    with open('../results/' + dataset + '/test_results/result-testcomp-{}-{}-{}-{}-{}.json'.format(eta, lamb, k, surr.name, random_seed), 'w') as fp:
        json.dump(metrics.to_dict(best_iter), fp)
    
    return metrics, w
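
Like Example #3, this variant relies on `k`, `user_beta_train`, `dataset`, `json`, `time`, and `matplotlib.pyplot` (as `plt`) from the enclosing module, and it smooths the gradient with an exponential moving average before applying the decaying step size. A self-contained sketch of that momentum update on a stand-in quadratic loss (data and loss are illustrative only):

import numpy as np

rng = np.random.default_rng(786)
X, y = rng.normal(size=(64, 4)), rng.normal(size=64)
w = rng.normal(size=4)
eta, momentum, lamb = 0.1, 0.9, 0.01
prev_grad_w = np.zeros_like(w)

for num_iter in range(50):
    grad_w = X.T.dot(X.dot(w) - y) / len(y) + lamb * w     # stand-in gradient
    # exponential moving average of past gradients, as in the train() variants
    prev_grad_w = momentum * prev_grad_w + (1 - momentum) * grad_w
    w = w - (eta / np.sqrt(num_iter + 1)) * prev_grad_w
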
Example #5
print('../results/' + dataset + '/test_results/FinalTestResult-testcomp-{}.txt'.format(k))
orig_stdout = sys.stdout
f = open('../results/' + dataset + '/test_results/FinalTestResult-testcomp-{}.txt'.format(k), 'w')
sys.stdout = f

# compute test auc-rel-k at best epoch (from validation)
for name, metric in all_metrics:
    best_iter = np.where(np.asarray(metric.loss_opt_list_train)==np.min(metric.loss_opt_list_train))[0][0]
    w = metric.w_list[best_iter]
                 
    loss_opt_test = 0
    k_minus_w_opt_test = 0

    # sort data once for both micro 
    user_feat = test_data.drop(['user', 'label'], axis = 1).values
    y_scores = user_feat.dot(w)
    data_true = deepcopy(test_data)
    data_true['scores'] = y_scores
    data_true = data_true.sort_values(by='scores', ascending=False)
    data_true = data_true.reset_index(drop=True)

    print('k:', k)
    print('Dataset:', dataset)
    print('Name:', name)
    print('    ', loss_opt_test/len(user_list_test_filtered))
    print('    ', utils.compute_micro(data_true, user_list_test_filtered, user_beta_test, w, k))

sys.stdout = orig_stdout
f.close()

print("Total time taken is ", time.time() - tic_all)
Example #6
# compute val auc-rel-k at best epoch
for name, metric in all_metrics:
    best_iter = np.where(
        np.asarray(metric.loss_opt_list_train) == np.min(
            metric.loss_opt_list_train))[0][0]
    w = metric.w_list[best_iter]

    loss_opt_val = 0
    k_minus_w_opt_val = 0

    # sort data once for both micro
    user_feat = val_data.drop(['user', 'label'], axis=1).values
    y_scores = user_feat.dot(w)
    data_true = deepcopy(val_data)
    data_true['scores'] = y_scores
    data_true = data_true.sort_values(by='scores', ascending=False)
    data_true = data_true.reset_index(drop=True)

    print('k:', k)
    print('Dataset:', dataset)
    print('Name:', name)
    print('    ', loss_opt_val / len(user_list_val_filtered))
    print(
        '    ',
        utils.compute_micro(data_true, user_list_val_filtered, user_beta_val,
                            w, k))

sys.stdout = orig_stdout
f.close()
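
The evaluation scripts above write their reports by reassigning `sys.stdout` to a file and restoring it by hand. A sketch of the same reporting pattern using `contextlib.redirect_stdout`, which restores stdout even if printing raises (the path and values are placeholders, not the repository's):

import contextlib

report_path = 'FinalTestResult-example.txt'   # placeholder path
with open(report_path, 'w') as f, contextlib.redirect_stdout(f):
    print('k:', 5)
    print('Dataset:', 'example')
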