Example #1
    def fit_and_predict(self, X_train, X_test, y_train):
        # only the "mcs" strategy is implemented in this snippet; any other
        # value of self.cv would leave `folds` undefined
        if self.cv == "mcs":
            folds = MCSKFold(n_splits=5, shuffle_mc=True, max_iter=100)
        oof = np.zeros((len(X_train), NUM_CLASS))
        predictions = np.zeros((len(X_test), NUM_CLASS))
        feature_importance_df = pd.DataFrame()
        fold_scores = []

        for fold, (train_idx, val_idx) in enumerate(
                folds.split(df=y_train, target_cols=["target"])):
            self.logger.debug("-" * 100)
            self.logger.debug(f"Fold {fold+1}")
            train_data = lgb.Dataset(X_train.iloc[train_idx],
                                     label=y_train.iloc[train_idx])
            val_data = lgb.Dataset(X_train.iloc[val_idx],
                                   label=y_train.iloc[val_idx])
            callbacks = [log_evaluation(self.logger, period=100)]
            # verbose_eval / early_stopping_rounds are the pre-4.0 LightGBM
            # API; in 4.x both moved into callbacks
            clf = lgb.train(self.params,
                            train_data,
                            valid_sets=[train_data, val_data],
                            verbose_eval=100,
                            early_stopping_rounds=100,
                            callbacks=callbacks,
                            feval=eval_func)
            oof[val_idx, :] = clf.predict(X_train.iloc[val_idx].values,
                                          num_iteration=clf.best_iteration)
            fold_score = top2accuracy(oof[val_idx, :],
                                      y_train.iloc[val_idx].values)
            fold_scores.append(fold_score)

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = X_train.columns.values
            fold_importance_df["importance"] = clf.feature_importance(
                importance_type="gain")
            fold_importance_df["fold"] = fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)

            predictions += clf.predict(
                X_test, num_iteration=clf.best_iteration) / folds.n_splits

        # indices of the two highest-probability classes per row
        # (argsort is ascending, so the last two columns are the top 2)
        pred_labels = np.argsort(predictions, axis=1)[:, -2:]

        feature_importance_df = feature_importance_df[[
            "feature", "importance"
        ]].groupby("feature").mean().sort_values(by="importance",
                                                 ascending=False).head(50)
        self.logger.debug("##### feature importance #####")
        self.logger.debug(feature_importance_df)
        cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
        self.logger.debug(f"cv_score_fold_mean: {cv_score_fold_mean}")
        return pred_labels, cv_score_fold_mean
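
Examples #1, #2 and #4 pass a custom log_evaluation(logger, period=...) callback to lgb.train (Example #3 reuses the name for a different logging helper); none of the definitions appear on this page. A minimal sketch of the LightGBM-callback variant, assuming only the documented callback protocol (the body and formatting are this sketch's own):

def log_evaluation(logger, period=100):
    """Sketch: forward LightGBM evaluation results to a logging.Logger."""
    def _callback(env):
        # env.evaluation_result_list holds (dataset, metric, value, is_higher_better)
        if period > 0 and env.evaluation_result_list and (env.iteration + 1) % period == 0:
            result = "\t".join(f"{data}'s {name}: {value:g}"
                               for data, name, value, _ in env.evaluation_result_list)
            logger.debug(f"[{env.iteration + 1}]\t{result}")

    _callback.order = 10  # run after LightGBM's built-in callbacks
    return _callback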
Example #2

####################
## Train model
####################
# assumes train_use / test_use (DataFrames), target_log (log1p-transformed
# target), params, N_ROUNDS, categorical_cols and logger are defined earlier
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(train_use))
predictions = np.zeros(len(test_use))
feature_importance_df = pd.DataFrame()

for fold, (train_idx, val_idx) in enumerate(folds.split(train_use, train_use["district"])):
    print(f"Fold {fold+1}")
    train_data = lgb.Dataset(train_use.iloc[train_idx], label=target_log[train_idx], categorical_feature=categorical_cols)
    val_data = lgb.Dataset(train_use.iloc[val_idx], label=target_log[val_idx], categorical_feature=categorical_cols)
    num_round = N_ROUNDS
    callbacks = [log_evaluation(logger, period=100)]
    clf = lgb.train(params, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=False, early_stopping_rounds=100, callbacks=callbacks)
    oof[val_idx] = clf.predict(train_use.values[val_idx], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = train_use.columns.values
    fold_importance_df["importance"] = clf.feature_importance(importance_type="gain")
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions += clf.predict(test_use, num_iteration=clf.best_iteration) / folds.n_splits

# aggregate the gain importances once, after the loop; doing the groupby
# inside the loop would overwrite the raw per-fold rows on each iteration
feature_importance_df = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).head(50)
logger.debug("##### feature importance #####")
logger.debug(feature_importance_df)

# inverse log transformation
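
The example is truncated at this point. Assuming target_log = np.log1p(target), as the label name suggests, the missing inverse step is a single expm1 on each prediction array (a sketch, not the original code):

oof_pred = np.expm1(oof)                   # OOF predictions on the original scale
final_predictions = np.expm1(predictions)  # averaged test predictions, original scale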
Example #3
def train(args, train_loader, train_val_loader, val_loader, test_loader):
    seed(args.seed)
    job_id = os.environ.get('SLURM_JOB_ID', 'local')

    print('Starting run {} with:\n{}'.format(job_id, args))

    writer = SummaryWriter(args.logdir)

    columns = ['epoch', 'eval_loss', 'eval_acc', 'eval_prec', 'eval_recall',
               'train_loss', 'train_acc', 'train_prec', 'train_recall',
               'test_loss', 'test_acc', 'test_prec', 'test_recall']
    stats_csv = pd.DataFrame(columns=columns)

    model = Network(
        k=args.network_k, att_type=args.network_att_type, kernel3=args.kernel3,
        width=args.network_width, dropout=args.network_dropout, compensate=True,
        norm=args.norm, inp_channels=args.input_channels)

    print(model)

    epochs = args.num_epochs * args.shrinkage
    milestones = np.array([80, 120, 160])
    milestones *= args.shrinkage
    milestones = list(milestones)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    raw_model = model
    if torch.cuda.device_count() > 1:
        print('using multiple gpus')
        model = torch.nn.DataParallel(model)
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    print(criterion)
    if args.opt == 'rmsprop':
        optimizer = torch.optim.RMSprop(raw_model.parameters(), lr=args.lr, eps=1e-5, weight_decay=args.l2)
    elif args.opt == 'momentum':
        optimizer = torch.optim.SGD(raw_model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.l2)
    elif args.opt == 'adam':
        optimizer = torch.optim.Adam(raw_model.parameters(), lr=args.lr, eps=1e-5, weight_decay=args.l2)
    lr_schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones)

    state = {
        'epoch': 0,
        'step': 0,
        'state_dict': copy.deepcopy(raw_model.state_dict()),
        'optimizer': copy.deepcopy(optimizer.state_dict()),
        'lr_schedule': copy.deepcopy(lr_schedule.state_dict()),
        'best_acc': None,
        'best_epoch': 0,
        'is_best': False,
        'stats_csv': stats_csv,
        'config': vars(args)
    }

    if load_checkpoint(args.logdir, state):
        raw_model.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])
        lr_schedule.load_state_dict(state['lr_schedule'])
        stats_csv = state['stats_csv']

    save_checkpoint(args.logdir, state)

    writer.add_text('args/str', str(args), state['epoch'])
    writer.add_text('job_id/str', job_id, state['epoch'])
    writer.add_text('model/str', str(model), state['epoch'])

    # Train the model
    for epoch in range(state['epoch'], epochs):
        model.train()

        losses = []
        tps = []
        tns = []
        fps = []
        fns = []
        batch_labels = []
        delayed = 0
        writer.add_scalar('stats/lr', optimizer.param_groups[0]['lr'], epoch + 1)
        with tqdm(train_loader, desc="Epoch [{}/{}]".format(epoch+1, epochs)) as pbar:
            for images, labels in pbar:
                batch_labels += list(labels)
                if torch.cuda.is_available():
                    if torch.cuda.device_count() == 1:
                        images = images.cuda()
                    labels = labels.cuda()
                # Forward pass
                outputs, att = model(images)
                loss = criterion(outputs, labels)
                predicted = torch.argmax(outputs.data, 1)

                TP, TN, FP, FN = pred_stats(predicted, labels)
                cpu_loss = loss.mean().cpu().item()

                losses += [cpu_loss]
                tps += [TP]
                tns += [TN]
                fps += [FP]
                fns += [FN]
                # Backward and optimize
                delayed += 1
                if args.delayed_step > 0:
                    (loss / args.delayed_step).backward()
                else:
                    loss.backward()

                # step only once a full accumulated batch has been seen
                # (delayed was already incremented above, so compare it
                # directly); clip gradients after backward() and before step()
                if args.delayed_step == 0 or delayed % args.delayed_step == 0:
                    nn.utils.clip_grad_value_(raw_model.parameters(), 5.)
                    optimizer.step()
                    optimizer.zero_grad()

                    precision, recall, accuracy = precision_recall_accuracy(
                        np.sum(tps), np.sum(tns), np.sum(fps), np.sum(fns))

                    writer.add_scalar('train/loss', np.mean(losses), state['step'])
                    writer.add_scalar('train/precision', precision, state['step'])
                    writer.add_scalar('train/recall', recall, state['step'])
                    writer.add_scalar('train/accuracy', accuracy, state['step'])
                    writer.add_scalar('train/labels', np.mean(batch_labels), state['step'])
                    state['step'] += 1

                    delayed = 0
                    losses = []
                    tps = []
                    tns = []
                    fps = []
                    fns = []
                    batch_labels = []

                pbar.set_postfix(loss=cpu_loss)

        # flush a final optimizer step if an incomplete delayed / accumulated
        # batch is still pending at the end of the epoch
        if delayed > 0:
            nn.utils.clip_grad_value_(raw_model.parameters(), 5.)
            optimizer.step()
            optimizer.zero_grad()

            precision, recall, accuracy = precision_recall_accuracy(
                np.sum(tps), np.sum(tns), np.sum(fps), np.sum(fns))

            writer.add_scalar('train/loss', np.mean(losses), state['step'])
            writer.add_scalar('train/precision', precision, state['step'])
            writer.add_scalar('train/recall', recall, state['step'])
            writer.add_scalar('train/accuracy', accuracy, state['step'])
            writer.add_scalar('train/labels', np.mean(batch_labels), state['step'])
            state['step'] += 1

        # advance the LR schedule once per epoch, after the optimizer updates
        # (the ordering PyTorch >= 1.1 expects) and before it is checkpointed
        lr_schedule.step()

        state['epoch'] = epoch + 1
        state['state_dict'] = copy.deepcopy(raw_model.state_dict())
        state['optimizer'] = copy.deepcopy(optimizer.state_dict())
        state['lr_schedule'] = copy.deepcopy(lr_schedule.state_dict())

        if args.opt == 'rmsprop':
            rms_m2 = get_rmsprop_m2(model, optimizer)
            writer.add_scalar('train/rmsprop_m2_min', rms_m2.min(), state['epoch'])
            writer.add_scalar('train/rmsprop_m2_mean', rms_m2.mean(), state['epoch'])
            writer.add_scalar('train/rmsprop_m2_max', rms_m2.max(), state['epoch'])
            writer.add_histogram('train/rmsprop_m2', rms_m2, state['epoch'])

        val_stats = evaluate(model, criterion, val_loader)
        log_evaluation(state['epoch'], val_stats, writer, 'eval')

        if state['best_acc'] is None or state['best_acc'] < val_stats['accuracy']:
            state['is_best'] = True
            state['best_acc'] = val_stats['accuracy']
            state['best_epoch'] = state['epoch']
        else:
            state['is_best'] = False

        if (state['is_best'] or state['epoch'] >= epochs or args.test_all):
            train_stats = evaluate(model, criterion, train_val_loader)
            log_evaluation(state['epoch'], train_stats, writer, 'train_eval')

            test_stats = evaluate(model, criterion, test_loader)
            log_evaluation(state['epoch'], test_stats, writer, 'test')

            stats_csv.loc[len(stats_csv)] = [
                state['epoch'], val_stats['loss'], val_stats['accuracy'],
                val_stats['precision'], val_stats['recall'],
                train_stats['loss'], train_stats['accuracy'],
                train_stats['precision'], train_stats['recall'],
                test_stats['loss'], test_stats['accuracy'],
                test_stats['precision'], test_stats['recall']]
        else:
            stats_csv.loc[len(stats_csv)] = [
                state['epoch'], val_stats['loss'], val_stats['accuracy'],
                val_stats['precision'], val_stats['recall'],
                np.nan, np.nan, np.nan, np.nan,
                np.nan, np.nan, np.nan, np.nan]

        save_checkpoint(args.logdir, state)

    writer.add_text('done/str', 'true', state['epoch'])

    print('done - stopping now')

    writer.close()
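
Stripped of the logging and checkpointing, the delayed_step logic above is plain gradient accumulation: several mini-batches contribute gradients to param.grad before one optimizer step. A minimal sketch of the same pattern (accum_steps, loader, model, criterion and optimizer are placeholder names):

for i, (x, y) in enumerate(loader):
    loss = criterion(model(x), y) / accum_steps  # scale so summed gradients average out
    loss.backward()                              # .grad buffers accumulate between steps
    if (i + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()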
Example #4
    def fit_and_predict(self, X_train, X_test, y_train, groups):
        if self.cv == "mcs":
            folds = MCSKFold(n_splits=5, shuffle_mc=True, max_iter=100)
        elif self.cv == "group":
            folds = GroupKFold(n_splits=10)
        elif self.cv == "stratified":
            folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
            y_to_stratify = pd.cut(y_train["Global_Sales_log1p"],
                                   bins=7,
                                   labels=False)

        oof = np.zeros(len(X_train))
        predictions = np.zeros(len(X_test))
        feature_importance_df = pd.DataFrame()
        fold_scores = []

        # the split below assumes self.cv == "stratified" (y_to_stratify is
        # only defined in that branch); the commented call is the GroupKFold
        # variant
        # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, groups=groups)):
        for fold, (train_idx,
                   val_idx) in enumerate(folds.split(X_train, y_to_stratify)):
            self.logger.debug("-" * 100)
            self.logger.debug(f"Fold {fold+1}")
            train_data = lgb.Dataset(X_train.iloc[train_idx],
                                     label=y_train.iloc[train_idx])
            val_data = lgb.Dataset(X_train.iloc[val_idx],
                                   label=y_train.iloc[val_idx])
            callbacks = [log_evaluation(self.logger, period=100)]
            clf = lgb.train(self.params,
                            train_data,
                            valid_sets=[train_data, val_data],
                            verbose_eval=100,
                            early_stopping_rounds=100,
                            callbacks=callbacks)  #, feval=eval_func)
            oof[val_idx] = clf.predict(X_train.iloc[val_idx].values,
                                       num_iteration=clf.best_iteration)
            fold_score = mean_squared_log_error(
                np.expm1(y_train.iloc[val_idx].values),
                np.expm1(oof[val_idx]))**.5
            fold_scores.append(fold_score)

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = X_train.columns.values
            fold_importance_df["importance"] = clf.feature_importance(
                importance_type="gain")
            fold_importance_df["fold"] = fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)

            predictions += np.expm1(
                clf.predict(X_test,
                            num_iteration=clf.best_iteration)) / folds.n_splits

        _feature_importance_df = feature_importance_df[[
            "feature", "importance"
        ]].groupby("feature").mean().sort_values(by="importance",
                                                 ascending=False)  # .head(50)
        self.logger.debug("##### feature importance #####")
        self.logger.debug(_feature_importance_df.head(50))
        cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
        self.logger.debug(f"cv_score_fold_mean: {cv_score_fold_mean}")

        # # RETRAIN
        # # exp057
        # # RETRAIN

        # k = 500
        # topk_features = _feature_importance_df.index[:k]
        # self.logger.debug(f"selected {len(topk_features)} features: {topk_features}")

        # oof = np.zeros(len(X_train))
        # predictions = np.zeros(len(X_test))
        # feature_importance_df = pd.DataFrame()
        # fold_scores = []

        # # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, groups=groups)):
        # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, y_to_stratify)):
        #     self.logger.debug("-" * 100)
        #     self.logger.debug(f"Fold {fold+1}")
        #     train_data = lgb.Dataset(X_train.loc[train_idx, topk_features], label=y_train.iloc[train_idx])
        #     val_data = lgb.Dataset(X_train.loc[val_idx, topk_features], label=y_train.iloc[val_idx])
        #     callbacks = [log_evaluation(self.logger, period=100)]
        #     clf = lgb.train(self.params, train_data, valid_sets=[train_data, val_data], verbose_eval=100, early_stopping_rounds=100, callbacks=callbacks)  #, feval=eval_func)
        #     oof[val_idx] = clf.predict(X_train.loc[val_idx, topk_features].values, num_iteration=clf.best_iteration)
        #     fold_score = mean_squared_log_error(np.expm1(y_train.iloc[val_idx].values), np.expm1(oof[val_idx])) ** .5
        #     fold_scores.append(fold_score)

        #     fold_importance_df = pd.DataFrame()
        #     fold_importance_df["feature"] = topk_features
        #     fold_importance_df["importance"] = clf.feature_importance(importance_type="gain")
        #     fold_importance_df["fold"] = fold + 1
        #     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        #     predictions += np.expm1(clf.predict(X_test[topk_features], num_iteration=clf.best_iteration)) / folds.n_splits

        # feature_importance_df = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).head(50)
        # self.logger.debug("##### feature importance #####")
        # self.logger.debug(feature_importance_df)
        # cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
        # self.logger.debug(f"cv_score_fold_mean: {cv_score_fold_mean}")

        return predictions, cv_score_fold_mean
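
Example #4 trains on a log1p-transformed target and scores each fold by inverting with np.expm1 before calling mean_squared_log_error. That works because RMSE in log1p space equals RMSLE on the original scale; a small self-contained check (the numbers are made up):

import numpy as np
from sklearn.metrics import mean_squared_log_error

y_true = np.array([3.0, 10.0, 150.0])
y_pred_log = np.log1p([2.5, 12.0, 140.0])  # model output in log1p space
rmse_log = np.sqrt(np.mean((np.log1p(y_true) - y_pred_log) ** 2))
rmsle = mean_squared_log_error(y_true, np.expm1(y_pred_log)) ** 0.5
assert np.isclose(rmse_log, rmsle)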