Example #1
0
    def test_submitAllUnsubmitted_unsubmittedSubmission_isSubmitted(self):
        """Saving a submission built with ``submitted=None`` leads to a reddit submit call."""
        submitter = self.submitter
        submitter.dataAccess.save(create_submission(submitted=None))

        submitter.submit_all_unsubmitted()

        assert submitter.reddit.submit.called
Example #2
0
    def test_submitAllUnsubmitted_submittedSubmission_isNotSubmitted(self):
        """Saving a submission built with default arguments leads to no reddit submit call."""
        submitter = self.submitter
        submitter.dataAccess.save(create_submission())

        submitter.submit_all_unsubmitted()

        assert not submitter.reddit.submit.called
def _predict(pipeline_name, dev_mode):
    """Run an inference pipeline on the test file and build a verified submission.

    Args:
        pipeline_name: Key into ``PIPELINES``; its ``'inference'`` factory is
            called with ``cfg.SOLUTION_CONFIG``.
        dev_mode: When truthy, only the first ``cfg.DEV_SAMPLE_SIZE`` rows of
            the test file are read and the submission is not written to disk.
    """
    logger.info('reading data in')
    # Dev mode differs only in how many rows are read.
    read_kwargs = {'nrows': cfg.DEV_SAMPLE_SIZE} if dev_mode else {}
    meta_test = pd.read_csv(params.test_filepath, **read_kwargs)

    pipeline_input = {'input': {'X': meta_test, 'y': None}}

    inference_pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    # The cache is cleared both before and after the transform.
    inference_pipeline.clean_cache()
    output = inference_pipeline.transform(pipeline_input)
    inference_pipeline.clean_cache()
    predictions = output['clipped_prediction']

    logger.info('creating submission...')
    submission = create_submission(meta_test, predictions)

    logger.info('verifying submittion')
    reference = pd.read_csv(params.sample_submission_filepath)
    verify_submission(submission, reference)

    if dev_mode:
        logger.info('submittion can\'t be saved in dev mode')
        return

    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
Example #4
0
    def infer_fold_TTA(self, fold_index, mode='max_map', Cycle=None):
        """Validate one fold with TTA and write that fold's test predictions to CSV.

        Args:
            fold_index: Fold to process. For a negative index only the
                validation pass runs and nothing is written.
            mode: Forwarded to ``val_TTA`` and embedded in the output file name.
            Cycle: Optional value forwarded to ``val_TTA``; when it is not
                ``None`` it is also embedded in the output file name.

        Returns:
            None. The submission CSV is saved under
            ``<model_save_path>/fold_<fold_index>/``.
        """
        print(mode)
        # A batch size must be an integer: `/` yields a float on Python 3, so use
        # floor division and never drop below one sample per batch.
        val_batch_size = max(1, int(self.batch_size) // 2)
        val_loader = get_foldloader(self.image_size, val_batch_size, fold_index, mode='val')
        _, max_map, thres = self.val_TTA(fold_index, val_loader, is_load=True, mode=mode, Cycle=Cycle)
        if fold_index < 0:
            return

        infer = self.get_infer_TTA(fold_index, thres)
        # The file name records the validation score and the threshold it was reached at.
        if Cycle is None:
            name_tmp = 'fold_{}_TTA_{}{:.3f}at{:.3f}.csv'.format(fold_index, mode, max_map, thres)
        else:
            name_tmp = 'fold_{}_Cycle_{}_TTA_{}{:.3f}at{:.3f}.csv'.format(fold_index, Cycle, mode, max_map, thres)
        output_name = os.path.join(self.model_save_path, 'fold_' + str(fold_index), name_tmp)
        submission = create_submission(infer)
        submission.to_csv(output_name, index=None)
def _predict(pipeline_name, dev_mode):
    """Predict on the test file plus supplementary time chunks and save two submissions.

    The full (supplement + test) prediction is written to
    ``full_submission.csv``; the rows whose ids occur in the original test
    file are written to ``submission.csv``. Both go to ``params.experiment_dir``.

    Args:
        pipeline_name: Key into ``PIPELINES``; its ``'inference'`` factory is
            called with ``cfg.SOLUTION_CONFIG``.
        dev_mode: When truthy, ``cfg.DEV_TEST_DAYS_HOURS`` selects the chunks
            and the combined frame is sampled down to
            ``cfg.DEV_SAMPLE_TEST_SIZE`` rows before inference.
    """
    logger.info('reading data in')
    # NOTE(review): eval() executes whatever expression the config value holds;
    # confirm params.test_days_hours can only come from a trusted config.
    days_hours = cfg.DEV_TEST_DAYS_HOURS if dev_mode else eval(params.test_days_hours)

    supplement = read_csv_time_chunks(
        params.test_chunks_dir,
        prefix='test',
        days_hours=days_hours,
        usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
        dtype=cfg.COLUMN_TYPES['inference'],
        logger=logger,
    )
    meta_test = pd.read_csv(
        params.test_filepath,
        usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
        dtype=cfg.COLUMN_TYPES['inference'],
    )

    # Test rows are concatenated last, so keep='last' prefers them over
    # supplement rows that share the same id.
    combined = pd.concat([supplement, meta_test], axis=0)
    meta_test_full = combined.reset_index(drop=True)
    meta_test_full.drop_duplicates(subset=cfg.ID_COLUMN, keep='last', inplace=True)
    meta_test_full['click_time'] = pd.to_datetime(
        meta_test_full['click_time'], format='%Y-%m-%d %H:%M:%S')

    # The hash is sent for the complete frame, before any dev-mode sampling.
    data_hash_channel_send(ctx, 'Test Data Hash', meta_test_full)

    if dev_mode:
        meta_test_full = meta_test_full.sample(cfg.DEV_SAMPLE_TEST_SIZE, replace=False)

    pipeline_input = {'input': {'X': meta_test_full[cfg.FEATURE_COLUMNS], 'y': None}}

    inference_pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    inference_pipeline.clean_cache()
    output = inference_pipeline.transform(pipeline_input)
    inference_pipeline.clean_cache()
    predictions = output['y_pred']

    logger.info('creating submission full test')
    full_submission = create_submission(meta_test_full, predictions)
    full_submission_filepath = os.path.join(params.experiment_dir, 'full_submission.csv')
    full_submission.to_csv(full_submission_filepath, index=None, encoding='utf-8')

    logger.info('subsetting submission')
    test_ids = meta_test[cfg.ID_COLUMN]
    submission = pd.merge(full_submission, test_ids, on=cfg.ID_COLUMN, how='inner')

    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
Example #6
0
def ensemble_np(args, np_files, save_np=None):
    """Average saved prediction arrays and write the ensembled submission.

    Args:
        args: Namespace providing ``pad_mode``, ``batch_size`` and ``sub_file``.
        np_files: Iterable of paths loadable with ``np.load``; the arrays are
            averaged along axis 0 of the stacked list.
        save_np: Optional path; when given, the averaged array is stored there
            with ``np.save``.

    Raises:
        ValueError: If ``np_files`` yields no files (the mean would be NaN).
    """
    preds = []
    for np_file in np_files:
        pred = np.load(np_file)
        print(np_file, pred.shape)
        preds.append(pred)

    if not preds:
        raise ValueError('np_files must contain at least one prediction file')

    # Compute the ensemble mean once and reuse it for saving and post-processing.
    mean_pred = np.mean(preds, 0)

    # Save before post-processing so the stored array is the untouched mean,
    # regardless of what generate_preds does with its argument.
    if save_np is not None:
        np.save(save_np, mean_pred)

    y_pred_test = generate_preds(mean_pred,
                                 (settings.ORIG_H, settings.ORIG_W),
                                 args.pad_mode)

    # Only the loader's metadata is needed to build the submission.
    meta = get_test_loader(args.batch_size,
                           index=0,
                           dev_mode=False,
                           pad_mode=args.pad_mode).meta

    submission = create_submission(meta, y_pred_test)
    submission.to_csv(args.sub_file, index=None, encoding='utf-8')
    def infer_5fold(self):
        """Average the predictions of the five fold models and write a submission CSV.

        Each fold's weights are loaded in turn, every test image is predicted
        and clamped to [0, 1], and the per-image outputs are summed. The sums
        are divided by the fold count, binarised at 0.5 and passed to
        ``create_submission``.
        """
        self.G.eval()
        test_dir = r'/data/shentao/Airbus/AirbusShipDetectionChallenge_384/test'
        test_loader = get_5foldloader(self.image_size, 1, 0, mode='test')

        n_folds = 5
        summed_masks = {}
        for fold_index in range(n_folds):
            self.load_pretrained_model(fold_index)
            for batch_index, batch in enumerate(test_loader):
                # The loader is created with a batch size of 1, so only the
                # first entry of each batch is used.
                image_name = batch[0]
                mask = self.infer_one_img_from_path_8(os.path.join(test_dir, image_name))
                mask = mask.reshape([self.image_size, self.image_size])

                # Clamp the raw output into [0, 1] before accumulating.
                mask[mask > 1.0] = 1.0
                mask[mask < 0.0] = 0.0

                if image_name in summed_masks:
                    summed_masks[image_name] += mask
                else:
                    summed_masks[image_name] = mask

                if batch_index > 0 and batch_index % 1000 == 0:
                    print(self.model_name + ' fold index: ' + str(fold_index) +
                          ' ' + str(batch_index))

        out = []
        for image_name, total in summed_masks.items():
            averaged = total / float(n_folds)
            # Binarise the fold average at 0.5.
            averaged[averaged > 0.5] = 1
            averaged[averaged <= 0.5] = 0
            out.append([image_name, averaged.astype(np.uint8)])

        # NOTE(review): 768, 768 is presumably the target mask size expected by
        # create_submission; confirm against its definition.
        submission = create_submission(out, 768, 768)
        csv_name = self.model_name + '_' + str(self.pretrained_model) + '_5fold.csv'
        submission.to_csv(csv_name, index=None)
Example #8
0
def _predict(pipeline_name, dev_mode):
    """Run inference on the test file and, outside dev mode, save and optionally upload it.

    Args:
        pipeline_name: Key into ``PIPELINES``; its ``'inference'`` factory is
            called with ``cfg.SOLUTION_CONFIG``.
        dev_mode: When truthy, only the first ``cfg.DEV_SAMPLE_SIZE`` rows are
            read and no submission is created, verified, saved or uploaded.
    """
    # Local import: only needed for the optional Kaggle upload below.
    import subprocess

    logger.info('PREDICTION')
    logger.info('reading data...')
    if dev_mode:
        logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE))
        application_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
    else:
        application_test = pd.read_csv(params.test_filepath)

    data = {'input': {'X': application_test,
                      'y': None,
                      },
            }

    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    logger.info('Start pipeline transform')
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['clipped_prediction']

    if dev_mode:
        return

    logger.info('creating submission file...')
    submission = create_submission(application_test, y_pred)

    logger.info('verifying submission...')
    sample_submission = pd.read_csv(params.sample_submission_filepath)
    verify_submission(submission, sample_submission)

    submission_filepath = os.path.join(params.experiment_directory, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')
    logger.info('submission persisted to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))

    if params.kaggle_api:
        logger.info('making Kaggle submit...')
        # Pass the arguments as a list (no shell) so a path or message that
        # contains spaces or shell metacharacters reaches the CLI as one
        # literal argument instead of being split or interpreted.
        command = ['kaggle', 'competitions', 'submit',
                   '-c', 'home-credit-default-risk',
                   '-f', str(submission_filepath),
                   '-m', str(params.kaggle_message)]
        try:
            subprocess.call(command)
        except OSError as error:
            # os.system never raised when the CLI was missing; keep the upload
            # best-effort so a saved submission is not followed by a crash.
            logger.info('Kaggle submit could not be started: {}'.format(error))
def _predict(pipeline_name, dev_mode):
    """Run an inference pipeline on the test file and save ``submission.csv``.

    Args:
        pipeline_name: Key into ``PIPELINES``; its ``'inference'`` factory is
            called with ``cfg.SOLUTION_CONFIG``.
        dev_mode: When truthy, only the first ``cfg.DEV_SAMPLE_TEST_SIZE`` rows
            of the test file are read.
    """
    logger.info('reading data in')
    # Both modes read the same columns and dtypes; dev mode just caps the rows.
    read_options = {
        'usecols': cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
        'dtype': cfg.COLUMN_TYPES['inference'],
    }
    if dev_mode:
        read_options['nrows'] = cfg.DEV_SAMPLE_TEST_SIZE
    meta_test = pd.read_csv(params.test_filepath, **read_options)

    parsed_click_time = pd.to_datetime(meta_test['click_time'],
                                       format='%Y-%m-%d %H:%M:%S')
    meta_test['click_time'] = parsed_click_time

    data_hash_channel_send(ctx, 'Test Data Hash', meta_test)

    pipeline_input = {'input': {'X': meta_test[cfg.FEATURE_COLUMNS], 'y': None}}

    inference_pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    inference_pipeline.clean_cache()
    output = inference_pipeline.transform(pipeline_input)
    inference_pipeline.clean_cache()
    predictions = output['y_pred']

    logger.info('creating submission')
    submission = create_submission(meta_test, predictions)

    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
def _predict(pipeline_name, dev_mode):
    """Run an inference pipeline on the test file and save ``submission.csv``.

    Args:
        pipeline_name: Key into ``PIPELINES``; its ``'inference'`` factory is
            called with ``cfg.SOLUTION_CONFIG``.
        dev_mode: When truthy, only the first ``cfg.DEV_SAMPLE_SIZE`` rows of
            the test file are read. The submission is written in both modes.
    """
    logger.info('reading data in')
    row_limit = {'nrows': cfg.DEV_SAMPLE_SIZE} if dev_mode else {}
    meta_test = pd.read_csv(params.test_filepath, **row_limit)

    pipeline_input = {'input': {'X': meta_test, 'y': None}}

    inference_pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    # The cache is cleared both before and after the transform.
    inference_pipeline.clean_cache()
    output = inference_pipeline.transform(pipeline_input)
    inference_pipeline.clean_cache()
    predictions = output['clipped_prediction']

    logger.info('creating submission test')
    submission = create_submission(meta_test, predictions)

    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
Example #11
0
def _predict_in_chunks(pipeline_name, submit_predictions, dev_mode, chunk_size):
    """Predict the test rows chunk by chunk and write the result as ``submission.json``.

    Args:
        pipeline_name: Key into ``PIPELINES``; its ``'inference'`` factory is
            called with ``SOLUTION_CONFIG`` once per chunk.
        submit_predictions: When truthy, ``_make_submission`` is called with
            the saved file path.
        dev_mode: When truthy, a fixed-seed sample of 9 test rows is used.
        chunk_size: Forwarded to ``generate_data_frame_chunks``.
    """
    metadata_filename = 'stage{}_metadata.csv'.format(params.competition_stage)
    meta = pd.read_csv(os.path.join(params.meta_dir, metadata_filename))
    meta_test = meta[meta['is_test'] == 1]

    if dev_mode:
        meta_test = meta_test.sample(9, random_state=1234)

    logger.info('processing metadata of shape {}'.format(meta_test.shape))

    submission = []
    for meta_chunk in generate_data_frame_chunks(meta_test, chunk_size):
        pipeline_input = {
            'input': {
                'meta': meta_chunk,
                'meta_valid': None,
                'train_mode': False,
                # One (300, 300) target size per row of the chunk.
                'target_sizes': [(300, 300)] * len(meta_chunk),
            },
        }

        # A new pipeline is built for every chunk, with the cache cleared
        # around the transform.
        chunk_pipeline = PIPELINES[pipeline_name]['inference'](SOLUTION_CONFIG)
        chunk_pipeline.clean_cache()
        output = chunk_pipeline.transform(pipeline_input)
        chunk_pipeline.clean_cache()

        chunk_entries = create_submission(meta_chunk, output['y_pred'], logger, CATEGORY_IDS)
        submission.extend(chunk_entries)

    submission_filepath = os.path.join(params.experiment_dir, 'submission.json')
    serialized = json.dumps(submission)
    with open(submission_filepath, "w") as fp:
        fp.write(serialized)
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission[0]))

    if submit_predictions:
        _make_submission(submission_filepath)
Example #12
0
def main():
    """Tune, train and evaluate a LightGBM classifier, then create a Kaggle submission.

    Reads ``./data/cs-training.csv``, applies the ``utils`` feature-engineering
    steps, tunes an ``LGBMClassifier`` with a randomized search, keeps only the
    features with positive importance, fits a final model on them, reports its
    performance and finally calls ``utils.create_submission``.
    """
    #%% Reading data
    train = pd.read_csv('./data/cs-training.csv', index_col=0)
    X = train.drop('SeriousDlqin2yrs', axis=1)
    y = train.SeriousDlqin2yrs

    #Feature Engineering
    print(f'Starting shape: {X.shape}')
    X = (
        X.pipe(utils.replace_w_sensible_values)
        .pipe(utils.replace_na)
        .pipe(utils.log_transform_df)
        .pipe(utils.add_AgeDecade)
        .pipe(utils.add_boolean_DebtRatio_33)
        .pipe(utils.add_boolean_DebtRatio_43)
        .pipe(utils.add_features_per_dependent)
        .pipe(utils.add_features_per_creditline)
        .pipe(utils.add_features_per_estate)
        .pipe(utils.add_features_distance_from_mean)
        .pipe(utils.add_features_distance_from_median)
        .pipe(utils.add_features_distance_from_std)
    )
    print(f'Post feature engineering shape:{X.shape}')

    # %% Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=RANDOM_STATE, stratify=y)

    # %% LightGBM Classifier
    lgb_model = lgb.LGBMClassifier(
        silent=False,
        random_state=RANDOM_STATE,
        objective='binary',
        metrics='auc',
        boosting='gbdt',
        scale_pos_weight=13.960106382978724  #T/P-1
    )

    # %% Stratified Kfold parameters
    # random_state only takes effect when shuffle=True; without it current
    # scikit-learn raises ValueError, so enable the seeded shuffle explicitly.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    # %% Tuning Parameters for RandomSearch
    tuning_params = {
        'num_leaves': [5, 10, 15, 31, 40, 50],
        'scale_pos_weight': [1, 10, 14, 16],  # T/P-1 = 13.96
        'n_estimators': [100, 250, 500, 750, 1000],
        'learning_rate': [0.025, 0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'min_child_weight': range(3, 6, 1),
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)],
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
    }

    gs = RandomizedSearchCV(estimator=lgb_model,
                            param_distributions=tuning_params,
                            n_iter=50,
                            scoring='roc_auc',
                            cv=skf,
                            refit=True,
                            verbose=True)

    gs.fit(X_train, y_train)

    # %% Extraction and selection of final features
    impt_features = utils.get_feature_importance(gs, X_train)
    impt_features.to_csv(f'output/{RANDOM_PREFIX}_impt_features.csv',
                         index=False)
    # Only features with a strictly positive importance are kept.
    final_features = impt_features[
        impt_features['importance'] > 0].feature.values
    X_train = X_train.loc[:, final_features]
    X_test = X_test.loc[:, final_features]

    # %% Final Model
    # NOTE(review): this starts from a default LGBMClassifier, so settings of
    # lgb_model that are not in tuning_params (e.g. random_state, objective)
    # are not carried over; confirm that is intended.
    best_lgb = lgb.LGBMClassifier().set_params(**gs.best_params_)
    best_lgb.fit(X_train, y_train)

    # %% Calculating model performance, plotting AUC and PRC curves
    utils.calculate_model_performance(best_lgb, X_train, X_test, y_train,
                                      y_test, RANDOM_PREFIX)

    # %% Create Submission for Kaggle
    utils.create_submission(best_lgb, final_features, RANDOM_PREFIX)
Example #13
0
def main(args):
    """Train a ResNeXt-50 landmark regressor, keep the best epoch and predict the test set.

    Args:
        args: Namespace providing ``data`` (dataset root containing ``train``
            and ``test``), ``batch_size``, ``learning_rate``, ``epochs`` and
            ``name`` (prefix for every file written).

    Side effects:
        Writes ``<name>_best.pth``, ``<name>_test_predictions.pkl`` and
        ``<name>_submit.csv`` to the current working directory.
    """
    # 1. Prepare data & models

    # separate transformations for validation and test data did not reduce the MSE loss,
    # so the same pipeline is used for every split
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((SCALE_SIZE, SCALE_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.Grayscale(3),
                        ("image", )),  # grayscale image for best score
        TransformByKeys(transforms.ColorJitter(brightness=[0.8, 1.2]),
                        ("image", )),  # random choose brightness in range
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ("image", )),
    ])

    print('----------------------------------------------------------')
    print('Script for Kaggle competition "Thousand Facial Landmarks"')
    print('----------------------------------------------------------')

    print('Reading data...')
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                             train_transforms,
                                             split="train")
    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       num_workers=0,
                                       pin_memory=True,
                                       shuffle=True,
                                       drop_last=True)
    # The validation split is read from the same 'train' directory.
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                           train_transforms,
                                           split="val")
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=args.batch_size,
                                     num_workers=0,
                                     pin_memory=True,
                                     shuffle=False,
                                     drop_last=False)

    print("Creating model...")
    # Default GPU device, because training this net on CPU takes forever.
    # The device string must not contain whitespace: "cuda: 0" is rejected as
    # an invalid device string by current PyTorch.
    device = torch.device("cuda:0")

    # this network was selected through experimentation from the list: resnet18, resnet34, resnext50, resnext101, alexnet, InceptionV3, InceptionV4 etc
    model = models.resnext50_32x4d(pretrained=True)

    # adding new layers with regularization (dropout or batchnorm) did not give effect
    # The head outputs two values per landmark point.
    model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)
    model.to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=args.learning_rate,
                           amsgrad=True)
    loss_fn = fnn.mse_loss

    # 2. Train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model,
                           train_dataloader,
                           loss_fn,
                           optimizer,
                           device=device)
        val_loss = validate(model, val_dataloader, loss_fn, device=device)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}".format(
            epoch, train_loss, val_loss))
        # Checkpoint only when the validation loss improves.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(f"{args.name}_best.pth", "wb") as fp:
                torch.save(model.state_dict(), fp)

    # 3. Predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'test'),
                                            train_transforms,
                                            split="test")
    test_dataloader = data.DataLoader(test_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=0,
                                      pin_memory=True,
                                      shuffle=False,
                                      drop_last=False)

    # Restore the best checkpoint before predicting.
    with open(f"{args.name}_best.pth", "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(f"{args.name}_test_predictions.pkl", "wb") as fp:
        pickle.dump(
            {
                "image_names": test_dataset.image_names,
                "landmarks": test_predictions
            }, fp)

    create_submission(args.data, test_predictions, f"{args.name}_submit.csv")
Example #14
0
    dtrain=x_train,
    num_boost_round=132,
    #early_stopping_rounds=4,
    verbose_eval=1)
by_test = bst.predict(x_test)

# C
# Train a booster on dataset 'c', predict its test split, then build the
# submission from the a/b/c predictions.
cx_train, cy_train, cx_test, c_idxs = get_data('c')
x_train = xgb.DMatrix(cx_train, label=cy_train)
x_test = xgb.DMatrix(cx_test)
# Ratio of negative to positive training labels. In the visible code it is
# only referenced by the commented-out 'scale_pos_weight' entry below.
neg_pos_rate = np.sum(cy_train == 0) / np.sum(cy_train == 1)
params = {
    'max_depth': 2,
    'eta': 0.1,  # learning rate
    # 'scale_pos_weight': neg_pos_rate,  # Balance classes ?
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': ['logloss']
}
# Fixed number of boosting rounds; early stopping is disabled (commented out).
bst = xgb.train(
    params=params,
    dtrain=x_train,
    num_boost_round=142,
    #early_stopping_rounds=10,
    verbose_eval=1)
cy_test = bst.predict(x_test)

# Create submission
# ay_test, a_idxs and b_idxs are defined outside this excerpt; by_test is the
# prediction made just before this section.
create_submission(ay_test, by_test, cy_test, a_idxs, b_idxs, c_idxs)
Example #15
0
import models, utils, datasets, predict
import logging, sys

# Module-level logging setup (INFO level).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Three numeric factors per named entry, passed to predict.apply_scales below.
# NOTE(review): the meaning of the three positions is not visible here;
# presumably one factor per predicted column. Confirm against
# predict.apply_scales.
scales = {'Chicago': (0.78, 0.88, 1),
          'Chicago RAC': (0.6, 0.8, 0.1),
          'New Haven': (0.89, 1, 0.68),
          'New Haven RAC': (1, 1, 1),
          'Oakland': (0.84, 1, 0.51),
          'Oakland RAC': (1, 0.94, 0.23),
          'Richmond': (0.64, 1, 1),
          'Richmond RAC': (1, 1, 1)}
          
# Script entry point (Python 2 syntax): expects the model name as the only
# command-line argument.
if __name__=='__main__':
    if len(sys.argv) == 1:
        # The trailing comma keeps the usage hint on the same output line.
        print "No model name was given! Run again using format: \n\t",
        print "python test.py modelname"
    else:
        modelname = sys.argv[1]
        pred = models.test_model(modelname)
        categories = datasets.load_dataset('Categories')
        n_pred = pred.shape[0]
        # Only the last n_pred category entries are paired with the predictions.
        pred = predict.apply_scales(pred, categories[-n_pred:], scales)
        name = modelname + ".csv"
        utils.create_submission(name, pred)
        print "Saved submission with name %s" %(name)
Example #16
0
def main(args):
    """Train only the new head of a pretrained ResNet-18 and predict the test set.

    Args:
        args: Namespace providing ``data`` (dataset root containing ``train``
            and ``test``), ``batch_size``, ``learning_rate``, ``epochs`` and
            ``name`` (prefix for every file written).

    Side effects:
        Creates ``runs/`` if needed and writes ``<name>_best.pth``,
        ``<name>_test_predictions.pkl`` and ``<name>_submit.csv`` into it.
    """
    os.makedirs("runs", exist_ok=True)

    def run_path(suffix):
        # Every artefact of this run is stored as runs/<name><suffix>.
        return os.path.join("runs", f"{args.name}{suffix}")

    def make_loader(dataset, training):
        # Training batches are shuffled and the incomplete last batch is
        # dropped; evaluation keeps order and every sample.
        return DataLoader(dataset,
                          batch_size=args.batch_size,
                          num_workers=4,
                          pin_memory=True,
                          shuffle=training,
                          drop_last=training)

    # 1. prepare data & models
    to_pil = TransformByKeys(transforms.ToPILImage(), ("image", ))
    to_tensor = TransformByKeys(transforms.ToTensor(), ("image", ))
    normalize = TransformByKeys(
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.25, 0.25, 0.25]),
        ("image", ))
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        to_pil,
        to_tensor,
        normalize,
    ])

    print("Reading data...")
    # The train and val splits are both read from the 'train' directory.
    train_root = os.path.join(args.data, "train")
    train_dataset = ThousandLandmarksDataset(train_root, train_transforms, split="train")
    train_dataloader = make_loader(train_dataset, training=True)
    val_dataset = ThousandLandmarksDataset(train_root, train_transforms, split="val")
    val_dataloader = make_loader(val_dataset, training=False)

    # The device is hard-coded to the first GPU.
    device = torch.device("cuda:0")
    print("Creating model...")
    model = models.resnet18(pretrained=True)
    # Freeze the backbone, then attach a fresh trainable head that outputs two
    # values per landmark point.
    model.requires_grad_(False)

    model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)
    model.fc.requires_grad_(True)

    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, amsgrad=True)
    loss_fn = fnn.mse_loss
    # NOTE(review): unexplained one-minute pause before training; confirm it
    # is still needed.
    time.sleep(60)

    # 2. train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model, train_dataloader, loss_fn, optimizer, device=device)
        val_loss = validate(model, val_dataloader, loss_fn, device=device)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}".format(
            epoch, train_loss, val_loss))
        # Checkpoint only when the validation loss improves.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(run_path("_best.pth"), "wb") as fp:
                torch.save(model.state_dict(), fp)

    # 3. predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, "test"),
                                            train_transforms,
                                            split="test")
    test_dataloader = make_loader(test_dataset, training=False)

    # Restore the best checkpoint before predicting.
    with open(run_path("_best.pth"), "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    prediction_dump = {
        "image_names": test_dataset.image_names,
        "landmarks": test_predictions
    }
    with open(run_path("_test_predictions.pkl"), "wb") as fp:
        pickle.dump(prediction_dump, fp)

    create_submission(args.data, test_predictions, run_path("_submit.csv"))
Example #17
0
    # oof_prediction[val_] = la.predict(val_X)
    oof_prediction[val_] = light_gbm.transform(val_X)['prediction']
    oof_prediction[oof_prediction < 0] = 0
    # _preds = la.predict(test_X)
    _preds = light_gbm.transform(test_X)['prediction']
    _preds[_preds < 0 ] = 0
    sub_prediction += np.expm1(_preds) / len(folds)
    oof_scores.append(mean_squared_error(TARGET[val_], oof_prediction[val_])**0.5)
    print('Fold %d RMSE : %.5f' % (fold_ + 1, oof_scores[-1]))
    gc.collect()

# Lasso
# Fit a Lasso model on every fold, store its out-of-fold predictions and add
# each fold's test prediction (divided by the fold count) to sub_prediction.
la = linear_model.Lasso()
# Rebind TARGET as a DataFrame; presumably so the positional .iloc indexing
# below works. Verify against how TARGET is built.
TARGET = pd.DataFrame(TARGET)
for fold_, (trn_, val_) in enumerate(folds):
    trn_X, trn_y = train_X.iloc[trn_], TARGET.iloc[trn_]
    val_X, val_y = train_X.iloc[val_], TARGET.iloc[val_]

    la.fit(trn_X, trn_y)
    oof_prediction[val_] = la.predict(val_X)
    # Negative predictions are clamped to zero (the mask covers the whole
    # out-of-fold array, not only this fold's rows).
    oof_prediction[oof_prediction < 0] = 0
    _preds = la.predict(test_X)
    _preds[_preds < 0 ] = 0
    # expm1 is applied to the test predictions only; presumably the target is
    # log1p-transformed. TODO confirm.
    # NOTE(review): sub_prediction and oof_scores are not reset after the
    # preceding loop, which also accumulates into them, so the Lasso
    # contribution is added on top of the earlier one. Confirm this is intended.
    sub_prediction += np.expm1(_preds) / len(folds)
    oof_scores.append(mean_squared_error(TARGET.iloc[val_], oof_prediction[val_])**0.5)
    print('Fold %d RMSE : %.5f' % (fold_ + 1, oof_scores[-1]))
    gc.collect()

# Build the submission from the accumulated test predictions and write it out.
submission = utils.create_submission(sub_prediction, test)
submission.to_csv("first_trial.csv", index=False)
Example #18
0

# Second prediction pass over the validation images, with min_conf=0.92 and
# augmentation disabled.
prediction_2 = predict2(image_fps_val, min_conf=0.92, augment=False)

def merge_predictions(prediction, prediction_2):
    """Return a deep copy of ``prediction`` with selected entries emptied.

    Every key whose value in ``prediction_2`` is non-empty is set to an empty
    list in the copy (keys missing from ``prediction`` are added). Neither
    input is modified.
    """
    merged = copy.deepcopy(prediction)

    # Collect the ids for which the second prediction set contains anything.
    flagged_ids = [patient_id
                   for patient_id, detections in prediction_2.items()
                   if len(detections) > 0]
    for patient_id in flagged_ids:
        merged[patient_id] = []

    return merged
    
# Combine both validation prediction sets and score the result against the
# ground truth.
prediction_3 = merge_predictions(prediction, prediction_2)

iou_all_mean,tp,fp,tn,fn = iou(truth, prediction_3)
# The trailing comment appears to record the output of an earlier run.
print(iou_all_mean,tp,fp,tn,fn)  # 0.21805178140096618 248 235 948 69

# Prepare prediction set on test data
# `if True:` always executes; presumably a manual switch for the test-set pass.
if True:
    image_fps_test = get_image_fps(TEST_DIR)
    image_fps_test.sort()

    # Same two-pass scheme as for validation: predict (min_conf=0.96, with
    # augmentation) and predict2 (min_conf=0.92, without), then merge.
    prediction_test = predict(image_fps_test, min_conf=0.96, augment=True)
    prediction_test_2 = predict2(image_fps_test, min_conf=0.92, augment=False)
    prediction_test_3 = merge_predictions(prediction_test, prediction_test_2)
    
    create_submission(prediction_test_3)
    
# NOTE(review): 'prediction.csv' is presumably the file written by
# create_submission above. Confirm.
submission = pd.read_csv('prediction.csv')
Example #19
0
def main(args):
    """Train a ResNet-50 landmark regressor in two stages and write a submission.

    Stage 1 freezes the pretrained backbone and trains only the replaced
    ``fc`` head with SGD under a one-cycle schedule. Stage 2 unfreezes every
    parameter and fine-tunes with AdamW using per-layer learning rates,
    saving the weights whenever the validation loss improves. Unless
    ``args.debug`` is set, the best checkpoint is then reloaded, the test
    split is predicted, and the predictions plus a submission CSV are written
    under ``submit/``.

    Args:
        args: Parsed options. Reads ``data`` (dataset root containing
            ``train`` and ``test``), ``debug``, ``batch_size``, ``gpu``,
            ``learning_rate``, ``epochs`` and ``name`` (prefix of the files
            this function writes).
    """
    # One path for both saving and reloading the best weights; the original
    # saved to this location but tried to reload from "submit/...".
    best_checkpoint_path = f"{args.name}_best.pth"
    output_dir = "submit"

    # 1. prepare data & models
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
            ("image", ),
        ),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(
        os.path.join(args.data, "train"),
        train_transforms,
        split="train",
        debug=args.debug,
    )
    train_dataloader = data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        num_workers=4,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )
    # The validation split is carved out of the same "train" directory.
    val_dataset = ThousandLandmarksDataset(
        os.path.join(args.data, "train"),
        train_transforms,
        split="val",
        debug=args.debug,
    )
    val_dataloader = data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        num_workers=4,
        pin_memory=True,
        shuffle=False,
        drop_last=False,
    )

    print("Creating model...")
    # The device string must not contain whitespace ("cuda: 0" is rejected).
    device = torch.device("cuda:0") if args.gpu else torch.device("cpu")
    model = models.resnet50(pretrained=True)
    # Replace the classifier with a regression head: (x, y) per landmark.
    model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)
    model.to(device)

    # Stage 1 trains the new head only; every other child module is frozen.
    for name, child in model.named_children():
        trainable = name in ["fc"]
        for param in child.parameters():
            param.requires_grad = trainable

    optimizer = optim.SGD(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=args.learning_rate,
        momentum=0.9,
        weight_decay=1e-04,
    )
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=0.1,
        steps_per_epoch=len(train_dataloader),
        epochs=args.epochs)
    loss = L.WingLoss(width=10, curvature=2, reduction="mean")

    # 2. train & validate
    print("Ready for training...")
    for epoch in range(args.epochs):
        train_loss = train(model,
                           train_dataloader,
                           loss,
                           optimizer,
                           device=device,
                           scheduler=scheduler)
        val_loss = validate(model, val_dataloader, loss, device=device)
        print("Epoch #{:2}:\ttrain loss: {:6.3}\tval loss: {:6.3}".format(
            epoch, train_loss, val_loss))

    # 2.1. train continued: unfreeze everything and fine-tune
    for p in model.parameters():
        p.requires_grad = True

    # Learning rate grows from the input layers towards the head. The
    # activation/pooling modules are listed as in the original configuration,
    # so the optimizer's parameter groups are unchanged.
    layer_lrs = [
        (model.conv1, 1e-6),
        (model.bn1, 1e-6),
        (model.relu, 1e-5),
        (model.maxpool, 1e-5),
        (model.layer1, 1e-4),
        (model.layer2, 1e-4),
        (model.layer3, 1e-3),
        (model.layer4, 1e-3),
        (model.avgpool, 1e-2),
        (model.fc, 1e-2),
    ]
    optimizer = optim.AdamW(
        [{"params": module.parameters(), "lr": lr}
         for module, lr in layer_lrs],
        lr=args.learning_rate,  # default only; every group sets its own lr
        weight_decay=1e-06,
        amsgrad=True,
    )

    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,
                                                               T_0=10,
                                                               T_mult=2)

    print("Ready for training again...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model,
                           train_dataloader,
                           loss,
                           optimizer,
                           device=device,
                           scheduler=scheduler)
        val_loss = validate(model, val_dataloader, loss, device=device)
        print("Epoch #{:2}:\ttrain loss: {:6.3}\tval loss: {:6.3}".format(
            epoch, train_loss, val_loss))
        # Keep only the weights with the lowest validation loss seen so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(best_checkpoint_path, "wb") as fp:
                torch.save(model.state_dict(), fp)

    # 3. predict
    if not args.debug:
        test_dataset = ThousandLandmarksDataset(
            os.path.join(args.data, "test"),
            train_transforms,
            split="test",
            debug=args.debug,
        )
        test_dataloader = data.DataLoader(
            test_dataset,
            batch_size=args.batch_size,
            num_workers=4,
            pin_memory=True,
            shuffle=False,
            drop_last=False,
        )

        # Reload the best weights from the path they were saved to above.
        with open(best_checkpoint_path, "rb") as fp:
            best_state_dict = torch.load(fp, map_location="cpu")
            model.load_state_dict(best_state_dict)

        test_predictions = predict(model, test_dataloader, device)

        # Make sure the output directory exists before writing into it.
        os.makedirs(output_dir, exist_ok=True)
        with open(f"{output_dir}/{args.name}_test_predictions.pkl", "wb") as fp:
            pickle.dump(
                {
                    "image_names": test_dataset.image_names,
                    "landmarks": test_predictions,
                },
                fp,
            )

        create_submission(args.data, test_predictions,
                          f"{output_dir}/{args.name}_submit.csv")
Example #20
0
	# Strip identifier columns before training; the test ids are kept aside
	# because they are passed to create_submission for each model below.
	print('Dropping Id columns in data...', end='')
	Xtrain = Xtrain.drop(['Id'], axis=1)
	# The first column of Ytrain is dropped by position rather than by name.
	Ytrain = Ytrain.drop(Ytrain.columns[0], axis=1)
	Xtest_id = Xtest['Id'] # To save in the submission file
	Xtest = Xtest.drop(['Id'], axis=1)
	print('Done')

	# print('Xtrain', Xtrain)
	# print('ytrain', Ytrain.values.ravel())
	# print('Xtest', Xtest)

	# Collects the log object returned by each model function, keyed by model.
	log = {}
	# Each model function takes (Xtrain, Ytrain, Xtest) and returns the test
	# predictions together with a log; a submission is created per model.
	print('Training with gradient_boosting_tree_model...', end='')
	Ytest, gbt_log = gradient_boosting_tree_model(Xtrain, Ytrain, Xtest)
	log['gbt_log'] = gbt_log
	create_submission(Xtest_id, Ytest, 'gradient_boosting_tree_model')
	print('Done')

	print('Training with random_forest_model...', end='')
	Ytest, rf_log = random_forest_model(Xtrain, Ytrain, Xtest)
	log['rf_log'] = rf_log
	create_submission(Xtest_id, Ytest, 'random_forest_model')
	print('Done')

	print('Training with extra_trees_model...', end='')
	Ytest, et_log = extra_trees_model(Xtrain, Ytrain, Xtest)
	log['et_log'] = et_log
	create_submission(Xtest_id, Ytest, 'extra_trees_model')
	print('Done')

	print('Training with xgboost_model...', end='')
Example #21
0
def main(args):
    """Train ``RESNEXT_steroid`` on the landmarks data and write a submission.

    The base network is frozen and only the ``fc``, ``linear7``,
    ``attention`` and ``linear1`` sub-modules are trained with Adam and an L1
    loss. The learning rate is reduced when the validation loss plateaus, and
    the weights with the lowest validation loss are saved under ``runs/``.
    Afterwards the best checkpoint is reloaded, the test split is predicted,
    and the raw predictions plus a submission CSV are written to ``runs/``.

    Args:
        args: Parsed options. Reads ``data`` (dataset root containing
            ``train`` and ``test``), ``batch_size``, ``gpu``,
            ``learning_rate``, ``epochs`` and ``name`` (prefix of the files
            this function writes).
    """
    os.makedirs("runs", exist_ok=True)
    best_checkpoint_path = os.path.join("runs", f"{args.name}_best.pth")

    # 1. prepare data & models
    # Training pipeline: geometric/photometric augmentation, then tensor
    # conversion and normalization.
    train_transforms = transforms.Compose([
        CropFrame(9),
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        FlipHorizontal(),
        Rotator(30),
        ChangeBrightnessContrast(alpha_std=0.05, beta_std=10),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]), ("image", )),
    ])

    # Evaluation pipeline: same cropping/scaling/normalization, no augmentation.
    valid_transforms = transforms.Compose([
        CropFrame(9),
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]), ("image", )),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, "train"),
                                             train_transforms,
                                             split="train")
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=4,
                                  pin_memory=True,
                                  shuffle=True,
                                  drop_last=True)
    # The validation split is carved out of the same "train" directory.
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, "train"),
                                           valid_transforms,
                                           split="val")
    val_dataloader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                num_workers=4,
                                pin_memory=True,
                                shuffle=False,
                                drop_last=False)

    # Fall back to the CPU when a GPU was requested but is not available.
    use_cuda = args.gpu and torch.cuda.is_available()
    device = torch.device("cuda:0") if use_cuda else torch.device("cpu")

    print("Creating model...")
    model = RESNEXT_steroid()
    model.to(device)

    # Freeze the backbone; train only the heads listed below.
    for p in model.base_net.parameters():
        p.requires_grad = False
    for head in (model.fc, model.linear7, model.attention, model.linear1):
        for p in head.parameters():
            p.requires_grad = True

    optimizer = optim.Adam(model.parameters(),
                           lr=args.learning_rate,
                           amsgrad=True)
    criterion = fnn.l1_loss
    # Multiply the learning rate by 1/sqrt(10) after 4 epochs without an
    # absolute improvement of at least 0.01 in validation loss.
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                        mode='min',
                                                        factor=1 / np.sqrt(10),
                                                        patience=4,
                                                        verbose=True,
                                                        threshold=0.01,
                                                        threshold_mode='abs',
                                                        cooldown=0,
                                                        min_lr=1e-6,
                                                        eps=1e-08)

    # 2. train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model,
                           train_dataloader,
                           criterion,
                           optimizer,
                           device=device)
        val_loss, mse_loss = validate(model,
                                      val_dataloader,
                                      criterion,
                                      device=device)
        lr_scheduler.step(val_loss)
        print(
            "Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}\tmse loss: {:5.2}"
            .format(epoch, train_loss, val_loss, mse_loss))
        # Keep only the weights with the lowest validation loss seen so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(best_checkpoint_path, "wb") as fp:
                torch.save(model.state_dict(), fp)

    # 3. predict
    # Inference uses the augmentation-free pipeline: the training transforms
    # add flip/rotation/brightness changes (presumably random), which would
    # perturb the test images and therefore the predicted landmarks.
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, "test"),
                                            valid_transforms,
                                            split="test")
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 num_workers=4,
                                 pin_memory=True,
                                 shuffle=False,
                                 drop_last=False)

    with open(best_checkpoint_path, "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(os.path.join("runs", f"{args.name}_test_predictions.pkl"),
              "wb") as fp:
        pickle.dump(
            {
                "image_names": test_dataset.image_names,
                "landmarks": test_predictions
            }, fp)

    create_submission(args.data, test_predictions,
                      os.path.join("runs", f"{args.name}_submit.csv"))
Example #22
0
import models, utils, datasets, predict
import logging, sys

# Emit INFO-level (and above) log records for this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Three scale factors per category name, handed to predict.apply_scales in the
# __main__ block below.
# NOTE(review): what each of the three positions represents is not visible
# here — confirm against predict.apply_scales.
scales = {
    'Chicago': (0.78, 0.88, 1),
    'Chicago RAC': (0.6, 0.8, 0.1),
    'New Haven': (0.89, 1, 0.68),
    'New Haven RAC': (1, 1, 1),
    'Oakland': (0.84, 1, 0.51),
    'Oakland RAC': (1, 0.94, 0.23),
    'Richmond': (0.64, 1, 1),
    'Richmond RAC': (1, 1, 1)
}

if __name__ == '__main__':
    # Usage: python test.py modelname
    #
    # The print calls below take a single string argument in parentheses, so
    # they produce the same output under Python 2 (print statement) and
    # Python 3 (print function).
    if len(sys.argv) == 1:
        print("No model name was given! Run again using format: \n\t"
              "python test.py modelname")
    else:
        modelname = sys.argv[1]
        pred = models.test_model(modelname)
        categories = datasets.load_dataset('Categories')
        # Scale the predictions using the last n_pred category entries, i.e.
        # as many categories as there are prediction rows.
        n_pred = pred.shape[0]
        pred = predict.apply_scales(pred, categories[-n_pred:], scales)
        name = modelname + ".csv"
        utils.create_submission(name, pred)
        print("Saved submission with name %s" % (name))
Example #23
0
                datetime.utcnow().strftime('%Y-%m-%d_%H%M%S') 

# Persist the fitted pipeline; a failed write is logged and does not stop the
# run, so predictions are still produced.
logr.info('writing fit {} pipeline to disk as {}'.format(job, model_name))
try:
    joblib.dump(pipeline, os.path.join('saved_models', model_name) + '.pkl', compress=3)
except OverflowError as e:  # "as" form is valid on Python 2.6+ and Python 3
    # this is annoying; look into it later
    logr.warning('joblib write failed with error={}'.format(e))
    logr.info('proceeding with predictions without writing model to disk')

# Use the fitted model: either write a leaderboard submission or estimate
# test performance.
if args.submission:
    # make predictions for a leaderboard submission
    # NOTE(review): `predictions` is not assigned in this branch; it must
    # already exist from code before this point — confirm.
    logr.info('writing predictions to formatted submission file')
    utils.create_submission(predictions, 
                            pipeline_detail['name'], 
                            comment=pipeline_detail['note'])
else:
    # if we already did CV through the gridsearch, then just take 
    #   the best score and make predictions  
    if hasattr(pipeline, 'best_params_'):
        logr.info('predicting test values with best-choice gridsearch params')
        predictions = pipeline.predict(X_test) 
        # fake an array of CV scores to play nice with plot formatting later 
        scores = np.array([pipeline.best_score_])
    else:
        # no grid search was run: cross-validate on the test data with a
        # fixed number of folds to obtain predictions
        cv = 3
        logr.info('cross validating model predictions with cv={}'.format(cv))
        predictions = cross_val_predict(pipeline, X_test, y_test, cv=cv)
def _predict(pipeline_name, dev_mode):
    """Run the named inference pipeline on the test data and save submissions.

    Reads the chunked supplementary test data and the main test file, stacks
    them, keeps the last row per id, and feeds the feature columns to the
    pipeline. Two CSVs are written to ``params.experiment_dir``:
    ``full_submission.csv`` (all rows that were scored) and
    ``submission.csv`` (only ids present in the main test file).

    Args:
        pipeline_name: Key into ``PIPELINES``; its ``'inference'`` factory
            is used.
        dev_mode: When true, use the dev day/hour selection from ``cfg`` and
            score only a random sample of ``cfg.DEV_SAMPLE_TEST_SIZE`` rows.
    """
    logger.info('reading data in')
    # NOTE(review): eval() executes whatever expression is stored in
    # params.test_days_hours. Safe only while that value comes from a trusted
    # config; consider ast.literal_eval if it is always a plain literal.
    days_hours = (cfg.DEV_TEST_DAYS_HOURS
                  if dev_mode else eval(params.test_days_hours))

    supplement_frame = read_csv_time_chunks(
        params.test_chunks_dir,
        prefix='test',
        days_hours=days_hours,
        usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
        dtype=cfg.COLUMN_TYPES['inference'],
        logger=logger,
    )
    meta_test = pd.read_csv(
        params.test_filepath,
        usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
        dtype=cfg.COLUMN_TYPES['inference'],
    )

    # Main test rows come last so that keep='last' prefers them on id clashes.
    stacked = pd.concat([supplement_frame, meta_test], axis=0)
    meta_test_full = stacked.reset_index(drop=True)
    meta_test_full.drop_duplicates(subset=cfg.ID_COLUMN,
                                   keep='last',
                                   inplace=True)
    parsed_click_time = pd.to_datetime(meta_test_full['click_time'],
                                       format='%Y-%m-%d %H:%M:%S')
    meta_test_full['click_time'] = parsed_click_time

    data_hash_channel_send(ctx, 'Test Data Hash', meta_test_full)

    if dev_mode:
        meta_test_full = meta_test_full.sample(cfg.DEV_SAMPLE_TEST_SIZE,
                                               replace=False)

    pipeline_input = {'X': meta_test_full[cfg.FEATURE_COLUMNS], 'y': None}
    data = {'input': pipeline_input}

    build_inference_pipeline = PIPELINES[pipeline_name]['inference']
    pipeline = build_inference_pipeline(cfg.SOLUTION_CONFIG)
    # Clear the pipeline cache both before and after the transform.
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['y_pred']

    logger.info('creating submission full test')
    full_submission = create_submission(meta_test_full, y_pred)
    full_submission_filepath = os.path.join(params.experiment_dir,
                                            'full_submission.csv')
    full_submission.to_csv(full_submission_filepath,
                           index=None,
                           encoding='utf-8')

    logger.info('subsetting submission')
    # An inner join on the id column keeps only ids from the main test file.
    main_test_ids = meta_test[cfg.ID_COLUMN]
    submission = pd.merge(full_submission,
                          main_test_ids,
                          on=cfg.ID_COLUMN,
                          how='inner')

    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
Example #25
0
def run(args):
    """Fit the selected experiment pipeline, then submit or evaluate it.

    In submission mode the pipeline is fit on all training data and its
    predictions for the test data are passed to ``utils.create_submission``.
    Otherwise the training data is split into train/validation parts, the
    pipeline is fit on the train part, grid-search results are reported when
    the pipeline exposes them, and validation metrics are printed.

    Args:
        args: Mapping of options. Always reads ``'model'`` (task name; the
            value ``'Visit'`` selects the ``visit`` pipelines, anything else
            the ``rating`` pipelines), ``'submission'`` and ``'expt'``. For
            non-submission runs it also reads ``'test_size'``,
            ``'random_state'``, ``'k-fold'`` and ``'cross_val_score'``.
    """
    task = args['model']
    submit = args['submission']

    # 1.) Load data for training model
    X_train_full, y_train_full = utils.load_train_data(task)

    if submit:
        # making a submission; train on all given data
        print('fitting models to entire training set')
        X_train, y_train = X_train_full, y_train_full
        X_test = utils.load_test_data(task)
    else:
        # running an experiment - cross validate with train/test split
        test_size = args['test_size']
        print('fitting models to cv train/test split with train% = {}'.format(1-test_size))
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_full, y_train_full,
            test_size=test_size, random_state=args['random_state'])

    # 2.) Get pipeline
    if task == 'Visit':
        pipeline_detail = visit[args['expt']]
        # The 'Visit' task resamples negatives; the validation split only
        # exists (and is only resampled) for non-submission runs.
        X_train, y_train = utils.sample_negatives(X_train, y_train, 2)
        if not submit:
            X_val, y_val = utils.sample_negatives(X_val, y_val, 1)
    else:
        pipeline_detail = rating[args['expt']]

    pipeline = pipeline_detail['pl']

    # Fit model to training data
    print('fitting model to array sizes (xtrain, ytrain)={}'.format([i.shape for i in [X_train, y_train]]))
    print('fitting experiment pipeline with signature={}'.format(pipeline))

    pipeline.fit(X_train, y_train)

    # 3.) For non-submission experiments, get the best parameters from grid search
    if submit:
        fname_spec = '_submission_'
    else:
        # log all results + call out the winner
        if hasattr(pipeline, 'best_params_'):
            print('best gridsearch score={}'.format(pipeline.best_score_))
            print('best set of pipeline params={}'.format(pipeline.best_params_))
            print('now displaying all pipeline param scores...')
            cv_results = pipeline.cv_results_
            # Each std_test_score entry is already the standard deviation of
            # one candidate's scores; calling .std() on that scalar would
            # always give 0, so use the value itself for the +/- spread.
            for params, mean_score, std_score in zip(cv_results['params'],
                                                     cv_results['mean_test_score'],
                                                     cv_results['std_test_score']):
                print("{:0.3f} (+/-{:0.03f}) for {}".format(mean_score, std_score * 2, params))
        fname_spec = '_expt_'

    # Not used further in this function; kept because utils.short_name is
    # defined outside this block.
    model_name = utils.short_name(pipeline) + fname_spec + datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')

    # 4.) Prepare submission
    if submit:
        print('writing predictions to formatted submission file')
        predictions = pipeline.predict(X_test)
        if hasattr(pipeline, 'best_params_'):
            print('predicting test values with best-choice gridsearch params')
        utils.create_submission(predictions, pipeline_detail['name'], X_test)
    else:
        cv = args['k-fold']
        print('cross validating model predictions with cv={}'.format(cv))
        predictions = cross_val_predict(pipeline, X_val, y_val, cv=cv)

        # print("cross val prediction", accuracy_score(y_val, predictions))
        print("cross val prediction", mean_squared_error(y_val, predictions))

        # Predictions of the pipeline fitted above, on the train and
        # validation parts respectively.
        predictions_train = pipeline.predict(X_train)
        predictions_test = pipeline.predict(X_val)

        if task == 'Visit':
            print('obtained train accuracy = {:.2f}, test accuracy = {:.2f}  pipeline={} '.format(
                accuracy_score(y_train, predictions_train),
                accuracy_score(y_val, predictions_test),
                pipeline))

            print('calculating confusion matrix')
            try:
                # Uses the cross-validated predictions, not predictions_test.
                cf = confusion_matrix(y_val, predictions)
                print("confusion matrix: ", cf)
                sb.heatmap(cf)
            except RuntimeError as e:
                print('plotting error. matplotlib backend may need to be changed (see readme). error={}'.format(e))
                print('plot may still have been saved, and model has already been saved to disk.')
        else:
            print('obtained train mse = {:.2f} test mse={}, pipeline={} '.format(
                mean_squared_error(y_train, predictions_train),
                mean_squared_error(y_val, predictions_test),
                pipeline))

        if args['cross_val_score']:
            # this gives a better idea of uncertainty, but it adds 'cv' more
            print('cross validating model accuracy with cv={}'.format(cv))
            scores = cross_val_score(pipeline, X_val, y_val, cv=cv)
            print('obtained accuracy={:0.2f}% +/- {:0.2f} with cv={}, \
                                        pipeline={} '.format(scores.mean() * 100,
                                                             scores.std() * 100 * 2,
                                                             cv,
                                                             pipeline))

    print('completed with pipeline {}'.format(pipeline_detail['name']))