def train_model():
    """Run one pass over ``train_loader`` and print train/validation metrics.

    Every batch is fed to ``model``; the returned loss is back-propagated and
    ``optimizer`` takes a step.  Predictions and targets are then split
    element-wise using the batch masks:

    * training elements:   ``test_mask * valid_mask == 1``
    * validation elements: ``(1 - valid_mask) * test_mask == 1``

    Task ``'1'`` reads targets from ``batch['labels']`` and reports AUC and
    accuracy; task ``'2'`` reads targets from ``batch['ans']`` and reports the
    fraction of exact matches.  The module-level ``best_acc`` and
    ``best_epoch`` are overwritten whenever the validation accuracy improves.

    Reads the module-level names ``model``, ``optimizer``, ``train_loader``,
    ``params`` and ``epoch``.

    Raises:
        ValueError: if ``params.task`` is neither ``'1'`` nor ``'2'``.
    """
    global best_acc, best_epoch

    # The two tasks differ only in which batch entry holds the targets, so
    # resolve the key once and fail early on an unsupported task instead of
    # hitting an undefined ``target`` after a forward pass.
    target_keys = {'1': 'labels', '2': 'ans'}
    if params.task not in target_keys:
        raise ValueError(
            "unsupported params.task {!r}; expected '1' or '2'".format(
                params.task))
    target_key = target_keys[params.task]

    batch_idx = 0
    model.train()
    train_loss = 0.
    all_preds, all_targets = [], []
    val_preds, val_targets = [], []
    for batch in train_loader:
        optimizer.zero_grad()
        # NOTE(review): ``output`` is indexed with NumPy masks below, so the
        # model presumably returns predictions as an array -- confirm.
        loss, output = model(batch)

        target = batch[target_key].numpy()
        valid_mask = batch['valid_mask'].numpy()
        test_mask = batch['test_mask'].numpy()
        validation_flag = (1 - valid_mask) * test_mask
        training_flag = test_mask * valid_mask

        loss.backward()
        optimizer.step()

        all_preds.append(output[training_flag == 1])
        all_targets.append(target[training_flag == 1])
        val_preds.append(output[validation_flag == 1])
        val_targets.append(target[validation_flag == 1])
        train_loss += float(loss.detach().cpu().numpy())
        batch_idx += 1

    all_pred = np.concatenate(all_preds, axis=0)
    all_target = np.concatenate(all_targets, axis=0)
    val_pred = np.concatenate(val_preds, axis=0)
    val_target = np.concatenate(val_targets, axis=0)

    if params.task == '1':
        train_auc = compute_auc(all_target, all_pred)
        val_auc = compute_auc(val_target, val_pred)
        train_accuracy = compute_accuracy(all_target, all_pred)
        val_accuracy = compute_accuracy(val_target, val_pred)
        print(
            'Train Epoch {} Loss: {} train auc: {} train acc: {} val auc: {} val accuracy: {} n_validation : {}'
            .format(epoch, train_loss / batch_idx, train_auc, train_accuracy,
                    val_auc, val_accuracy, val_target.shape))
    else:
        # Task '2' (guaranteed by the check at the top of the function).
        train_accuracy = np.mean(all_target == all_pred)
        val_accuracy = np.mean(val_target == val_pred)
        print('Train Epoch {} Loss: {} train acc: {} val accuracy: {}'.format(
            epoch, train_loss / batch_idx, train_accuracy, val_accuracy))

    # Track the best validation accuracy seen so far across calls.
    if best_acc is None or val_accuracy > best_acc:
        best_acc = val_accuracy
        best_epoch = epoch
    print('Train Epoch {} \nbest val accuracy: {} best epoch: {}'.format(
        epoch, best_acc, best_epoch))
def compute_bpsn_auc(df, subgroup, label, model_name):
    """Compute the AUC of the within-subgroup negative examples and the
    background positive examples.

    Args:
        df: DataFrame holding one row per example.
        subgroup: name of the column flagging subgroup membership.
        label: name of the column holding the ground-truth label.
        model_name: name of the column holding the model's scores.

    Both ``df[subgroup]`` and ``df[label]`` are combined with ``&`` / ``~``
    and used as row masks, so they are expected to be boolean columns.

    Returns:
        Whatever ``compute_auc`` returns for the selected rows' labels and
        scores.
    """
    # Function-scope import so the block does not rely on a module-level
    # pandas alias being present.
    import pandas as pd

    in_subgroup = df[subgroup]
    is_positive = df[label]
    subgroup_negative_examples = df[in_subgroup & ~is_positive]
    non_subgroup_positive_examples = df[~in_subgroup & is_positive]
    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` performs
    # the same row-wise stacking and keeps the original index.
    examples = pd.concat(
        [subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[model_name])
def test_model(id_):
    """Score ``model`` on ``valid_dataset`` after seeding it with ``id_``.

    A fresh, unshuffled ``DataLoader`` (batch size 64) is built over the
    module-level ``valid_dataset``.  For each batch the predictions come from
    ``model.test`` under ``torch.no_grad()``, and only the positions where
    ``batch['output_mask'] == 1`` are kept.

    Returns:
        A ``(loss, auc, accuracy)`` tuple.  Nothing is ever added to the loss
        accumulator, so the first element is always ``0.0``.
    """
    valid_dataset.seed = id_
    loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=64,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )

    loss_sum = 0.
    pred_chunks = []
    target_chunks = []
    batches_seen = 0
    for batches_seen, batch in enumerate(loader, start=1):
        with torch.no_grad():
            scores = model.test(batch)
            labels = batch['output_labels'].float().numpy()
            keep = batch['output_mask'].numpy() == 1
            pred_chunks.append(scores[keep])
            target_chunks.append(labels[keep])

    merged_preds = np.concatenate(pred_chunks, axis=0)
    merged_targets = np.concatenate(target_chunks, axis=0)

    auc = compute_auc(merged_targets, merged_preds)
    accuracy = compute_accuracy(merged_targets, merged_preds)
    mean_loss = loss_sum / batches_seen
    return mean_loss, auc, accuracy
def compute_subset_auc(indices, pred_set, y):
    """Score the element-wise average of a subset of prediction vectors.

    Args:
        indices: positions (into ``pred_set``) of the vectors to average.
        pred_set: sequence of prediction vectors.
        y: ground truth passed to ``compute_auc`` together with the average.

    Returns:
        ``(mean_auc, indices)`` -- the value returned by ``compute_auc`` for
        the averaged predictions, and the ``indices`` argument unchanged.
    """
    # Function-scope import so the block does not rely on a module-level
    # NumPy alias being present.
    import numpy as np

    # Set lookup avoids rescanning ``indices`` for every prediction vector.
    wanted = set(indices)
    subset = [vect for i, vect in enumerate(pred_set) if i in wanted]
    # ``scipy.mean`` was only a deprecated alias of ``numpy.mean``.
    mean_preds = np.mean(subset, axis=0)
    mean_auc = compute_auc(y, mean_preds)
    return mean_auc, indices
def fit_predict(self, y, train=None, predict=None, show_steps=True):
    """
    Fit each model on the appropriate dataset, then return the combined
    predictions.

    If train is specified, use a subset of the training set to train the
    models, then predict the outcome of either the remaining samples or
    (if given) those specified in predict.
    If train is omitted, train the models on the full training set, then
    predict the outcome of the full test set.

    Options:
    ------------------------------
    - y: numpy array. The full vector of the ground truths.
    - train: list. The indices of the elements to be used for training.
        If None, take the entire training set.
    - predict: list. The indices of the elements to be predicted.
    - show_steps: boolean. Whether to compute metrics after each stage
        of the computation.

    Returns:
    ------------------------------
    The mean predictions when self.stack is false; otherwise the stacked
    predictions, or the FWLS predictions when self.fwls is also set.
    """
    y_train = y[train] if train is not None else y
    if train is not None and predict is None:
        # Build the hold-out set as the complement of train.  A set makes
        # each membership test O(1) instead of rescanning train per index.
        train_set = set(train)
        predict = [i for i in range(len(y)) if i not in train_set]

    stage0_train = []
    stage0_predict = []
    for model, feature_set in self.models:
        X_train, X_predict = get_dataset(feature_set, train, predict)

        # The first training index is appended to the cache name; -1 is
        # used when training on the full set.
        identifier = train[0] if train is not None else -1
        cache_file = stringify(model, feature_set) + str(identifier)

        model_preds = self._get_model_preds(
            model, X_train, X_predict, y_train, cache_file)
        stage0_predict.append(model_preds)

        # if stacking, compute cross-validated predictions on the train set
        if self.stack:
            model_cv_preds = self._get_model_cv_preds(
                model, X_train, y_train, cache_file)
            stage0_train.append(model_cv_preds)

        # verbose mode: compute metrics after every model computation
        if show_steps:
            if train is not None:
                mean_preds, stack_preds, fwls_preds = self._combine_preds(
                    np.array(stage0_train).T, np.array(stage0_predict).T,
                    y_train, train, predict,
                    stack=self.stack, fwls=self.fwls)

                model_auc = compute_auc(y[predict], stage0_predict[-1])
                mean_auc = compute_auc(y[predict], mean_preds)
                stack_auc = compute_auc(y[predict], stack_preds) \
                    if self.stack else 0
                fwls_auc = compute_auc(y[predict], fwls_preds) \
                    if self.fwls else 0

                logger.info(
                    "> AUC: %.4f (%.4f, %.4f, %.4f) [%s]", model_auc,
                    mean_auc, stack_auc, fwls_auc,
                    stringify(model, feature_set))
            else:
                logger.info("> used model %s:\n%s", stringify(
                    model, feature_set), model.get_params())

    # Optionally keep only the models chosen by the subset search.
    if self.model_selection and predict is not None:
        best_subset = self._find_best_subset(y[predict], stage0_predict)
        stage0_train = [pred for i, pred in enumerate(stage0_train)
                        if i in best_subset]
        stage0_predict = [pred for i, pred in enumerate(stage0_predict)
                          if i in best_subset]

    mean_preds, stack_preds, fwls_preds = self._combine_preds(
        np.array(stage0_train).T, np.array(stage0_predict).T,
        y_train, stack=self.stack, fwls=self.fwls)

    if self.stack:
        selected_preds = stack_preds if not self.fwls else fwls_preds
    else:
        selected_preds = mean_preds

    return selected_preds
def fit_predict(self, y, train=None, predict=None, show_steps=True):
    """
    Fit each model on the appropriate dataset, then return the combined
    predictions.

    If train is specified, use a subset of the training set to train the
    models, then predict the outcome of either the remaining samples or
    (if given) those specified in predict.
    If train is omitted, train the models on the full training set, then
    predict the outcome of the full test set.

    Options:
    ------------------------------
    - y: numpy array. The full vector of the ground truths.
    - train: list. The indices of the elements to be used for training.
        If None, take the entire training set.
    - predict: list. The indices of the elements to be predicted.
    - show_steps: boolean. Whether to compute metrics after each stage
        of the computation.

    Returns:
    ------------------------------
    The mean predictions unless self.stack is set; with stacking, the
    FWLS predictions if self.fwls is set and the stacked ones otherwise.
    """
    has_train = train is not None
    y_train = y[train] if has_train else y

    if has_train and predict is None:
        # Everything not used for training is held out.  Hashing the
        # training indices once keeps this linear in len(y).
        held_in = frozenset(train)
        predict = [idx for idx in range(len(y)) if idx not in held_in]

    stage0_train = []
    stage0_predict = []
    for model, feature_set in self.models:
        X_train, X_predict = get_dataset(feature_set, train, predict)

        # Cache name = model/feature description + first training index
        # (-1 when the full training set is used).
        identifier = train[0] if has_train else -1
        cache_file = stringify(model, feature_set) + str(identifier)

        stage0_predict.append(self._get_model_preds(
            model, X_train, X_predict, y_train, cache_file))

        # if stacking, compute cross-validated predictions on the train set
        if self.stack:
            stage0_train.append(self._get_model_cv_preds(
                model, X_train, y_train, cache_file))

        # verbose mode: compute metrics after every model computation
        if not show_steps:
            continue
        if not has_train:
            logger.info("> used model %s:\n%s",
                        stringify(model, feature_set), model.get_params())
            continue

        mean_preds, stack_preds, fwls_preds = self._combine_preds(
            np.array(stage0_train).T,
            np.array(stage0_predict).T,
            y_train,
            train,
            predict,
            stack=self.stack,
            fwls=self.fwls)

        y_holdout = y[predict]
        model_auc = compute_auc(y_holdout, stage0_predict[-1])
        mean_auc = compute_auc(y_holdout, mean_preds)
        # Metrics for disabled combiners are reported as 0.
        stack_auc = compute_auc(y_holdout, stack_preds) if self.stack else 0
        fwls_auc = compute_auc(y_holdout, fwls_preds) if self.fwls else 0

        logger.info("> AUC: %.4f (%.4f, %.4f, %.4f) [%s]", model_auc,
                    mean_auc, stack_auc, fwls_auc,
                    stringify(model, feature_set))

    # Optionally restrict both stages to the models picked by the subset
    # search.
    if self.model_selection and predict is not None:
        best_subset = self._find_best_subset(y[predict], stage0_predict)
        stage0_train = [
            pred for i, pred in enumerate(stage0_train) if i in best_subset
        ]
        stage0_predict = [
            pred for i, pred in enumerate(stage0_predict) if i in best_subset
        ]

    mean_preds, stack_preds, fwls_preds = self._combine_preds(
        np.array(stage0_train).T,
        np.array(stage0_predict).T,
        y_train,
        stack=self.stack,
        fwls=self.fwls)

    if not self.stack:
        return mean_preds
    return fwls_preds if self.fwls else stack_preds
random.shuffle(trainset1) random.shuffle(testset1) random.shuffle(trainset2) random.shuffle(testset2) trainset1 = trainset1[:len(trainset1)//batch_size*batch_size] testset1 = testset1[:len(testset1)//batch_size*batch_size] trainset2 = trainset2[:len(trainset2)//batch_size*batch_size] testset2 = testset2[:len(testset2)//batch_size*batch_size] gpu_options = tf.GPUOptions(allow_growth=True) with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: model = Model(user_count, item_count, batch_size) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) print('Domain_A_Initialized_AUC: %.4f\tDomain_B_Initialized_AUC: %.4f' % compute_auc(sess, model, testset1, testset2)) sys.stdout.flush() start_time = time.time() last_auc = 0.0 for _ in range(1000): loss_sum = 0.0 for uij in DataInput(trainset1, batch_size): loss = model.train_1(sess, uij, lr) loss_sum += loss for uij in DataInput(trainset2, batch_size): loss = model.train_2(sess, uij, lr) loss_sum += loss model.train_orth(sess, uij[0], lr) test_auc_1, test_auc_2 = compute_auc(sess, model, testset1, testset2) train_auc_1, train_auc_2 = compute_auc(sess, model, trainset1, trainset2)