def test_submitAllUnsubmitted_unsubmittedSubmission_isSubmitted(self):
    unsubmitted = create_submission(submitted=None)
    self.submitter.dataAccess.save(unsubmitted)

    self.submitter.submit_all_unsubmitted()

    assert self.submitter.reddit.submit.called

def test_submitAllUnsubmitted_submittedSubmission_isNotSubmitted(self):
    submitted = create_submission()
    self.submitter.dataAccess.save(submitted)

    self.submitter.submit_all_unsubmitted()

    assert not self.submitter.reddit.submit.called
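# The two tests above hinge on unittest.mock call-tracking. A self-contained
# sketch of that pattern -- this Submitter is an illustrative stand-in, not the
# project's actual class:
import unittest
from unittest import mock

class Submitter:
    def __init__(self, submissions, reddit):
        self.submissions = submissions
        self.reddit = reddit

    def submit_all_unsubmitted(self):
        # post only the records whose 'submitted' field is still unset
        for record in self.submissions:
            if record.get('submitted') is None:
                self.reddit.submit(record)

class SubmitterSketchTest(unittest.TestCase):
    def test_unsubmitted_record_is_submitted(self):
        reddit = mock.MagicMock()
        Submitter([{'submitted': None}], reddit).submit_all_unsubmitted()
        assert reddit.submit.called

    def test_submitted_record_is_not_submitted(self):
        reddit = mock.MagicMock()
        Submitter([{'submitted': '2020-01-01'}], reddit).submit_all_unsubmitted()
        assert not reddit.submit.called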
def _predict(pipeline_name, dev_mode):
    logger.info('reading data in')
    if dev_mode:
        meta_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
    else:
        meta_test = pd.read_csv(params.test_filepath)

    data = {'input': {'X': meta_test,
                      'y': None,
                      },
            }

    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['clipped_prediction']

    logger.info('creating submission...')
    submission = create_submission(meta_test, y_pred)

    logger.info('verifying submission')
    sample_submission = pd.read_csv(params.sample_submission_filepath)
    verify_submission(submission, sample_submission)

    if dev_mode:
        logger.info("submission can't be saved in dev mode")
    else:
        submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
        submission.to_csv(submission_filepath, index=None, encoding='utf-8')
        logger.info('submission saved to {}'.format(submission_filepath))
        logger.info('submission head \n\n{}'.format(submission.head()))
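# The tabular _predict functions in this section all assume a create_submission
# helper that pairs the test-set IDs with the predictions. A minimal sketch,
# assuming generic 'ID'/'prediction' column names (placeholders, not the actual
# competition schema):
import pandas as pd

def create_submission(meta_test, y_pred, id_column='ID', target_column='prediction'):
    return pd.DataFrame({id_column: meta_test[id_column].values,
                         target_column: y_pred})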
def infer_fold_TTA(self, fold_index, mode='max_map', Cycle=None):
    print(mode)
    # integer division keeps the batch size an int under Python 3
    val_loader = get_foldloader(self.image_size, self.batch_size // 2, fold_index, mode='val')
    _, max_map, thres = self.val_TTA(fold_index, val_loader, is_load=True, mode=mode, Cycle=Cycle)
    if fold_index < 0:
        return

    infer = self.get_infer_TTA(fold_index, thres)
    if Cycle is None:
        name_tmp = 'fold_{}_TTA_{}{:.3f}at{:.3f}.csv'.format(fold_index, mode, max_map, thres)
    else:
        name_tmp = 'fold_{}_Cycle_{}_TTA_{}{:.3f}at{:.3f}.csv'.format(fold_index, Cycle, mode, max_map, thres)

    output_name = os.path.join(self.model_save_path, 'fold_' + str(fold_index), name_tmp)
    submission = create_submission(infer)
    submission.to_csv(output_name, index=None)
def _predict(pipeline_name, dev_mode):
    logger.info('reading data in')
    if dev_mode:
        TEST_DAYS_HOURS = cfg.DEV_TEST_DAYS_HOURS
    else:
        TEST_DAYS_HOURS = eval(params.test_days_hours)

    meta_test_supplement = read_csv_time_chunks(params.test_chunks_dir,
                                                prefix='test',
                                                days_hours=TEST_DAYS_HOURS,
                                                usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
                                                dtype=cfg.COLUMN_TYPES['inference'],
                                                logger=logger)
    meta_test = pd.read_csv(params.test_filepath,
                            usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
                            dtype=cfg.COLUMN_TYPES['inference'])
    meta_test_full = pd.concat([meta_test_supplement, meta_test], axis=0).reset_index(drop=True)
    meta_test_full.drop_duplicates(subset=cfg.ID_COLUMN, keep='last', inplace=True)
    meta_test_full['click_time'] = pd.to_datetime(meta_test_full['click_time'],
                                                  format='%Y-%m-%d %H:%M:%S')

    data_hash_channel_send(ctx, 'Test Data Hash', meta_test_full)

    if dev_mode:
        meta_test_full = meta_test_full.sample(cfg.DEV_SAMPLE_TEST_SIZE, replace=False)

    data = {'input': {'X': meta_test_full[cfg.FEATURE_COLUMNS],
                      'y': None,
                      },
            }

    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['y_pred']

    logger.info('creating submission full test')
    full_submission = create_submission(meta_test_full, y_pred)
    full_submission_filepath = os.path.join(params.experiment_dir, 'full_submission.csv')
    full_submission.to_csv(full_submission_filepath, index=None, encoding='utf-8')

    logger.info('subsetting submission')
    submission = pd.merge(full_submission, meta_test[cfg.ID_COLUMN], on=cfg.ID_COLUMN, how='inner')
    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')

    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
def ensemble_np(args, np_files, save_np=None):
    preds = []
    for np_file in np_files:
        pred = np.load(np_file)
        print(np_file, pred.shape)
        preds.append(pred)

    mean_pred = np.mean(preds, 0)
    y_pred_test = generate_preds(mean_pred, (settings.ORIG_H, settings.ORIG_W), args.pad_mode)

    if save_np is not None:
        np.save(save_np, mean_pred)

    meta = get_test_loader(args.batch_size, index=0, dev_mode=False, pad_mode=args.pad_mode).meta
    submission = create_submission(meta, y_pred_test)
    submission.to_csv(args.sub_file, index=None, encoding='utf-8')
def infer_5fold(self):
    self.G.eval()
    test_dir = r'/data/shentao/Airbus/AirbusShipDetectionChallenge_384/test'
    test_loader = get_5foldloader(self.image_size, 1, 0, mode='test')

    # accumulate per-image probability maps over the five folds
    predict_dict = {}
    for fold_index in range(5):
        self.load_pretrained_model(fold_index)
        for i, (id) in enumerate(test_loader):
            image_path = os.path.join(test_dir, id[0])
            output_mat = self.infer_one_img_from_path_8(image_path)
            output_mat = output_mat.reshape([self.image_size, self.image_size])
            output_mat[output_mat > 1.0] = 1.0
            output_mat[output_mat < 0.0] = 0.0

            if id[0] not in predict_dict:
                predict_dict[id[0]] = output_mat
            else:
                predict_dict[id[0]] += output_mat

            if i % 1000 == 0 and i > 0:
                print(self.model_name + ' fold index: ' + str(fold_index) + ' ' + str(i))

    # average the folds and binarize at 0.5
    out = []
    for id in predict_dict:
        output_mat = predict_dict[id] / 5.0
        output_mat[output_mat > 0.5] = 1
        output_mat[output_mat <= 0.5] = 0
        output_mat = output_mat.astype(np.uint8)
        out.append([id, output_mat])

    submission = create_submission(out, 768, 768)
    submission.to_csv(self.model_name + '_' + str(self.pretrained_model) + '_5fold.csv', index=None)
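# infer_5fold hands create_submission a list of (image_id, binary mask) pairs.
# A plausible sketch of such a helper for a segmentation competition: run-length
# encode each mask in column-major order (the usual Kaggle convention). The
# internals and the 'ImageId'/'EncodedPixels' column names are assumptions:
import numpy as np
import pandas as pd

def rle_encode(mask):
    # flatten column-major, then record (start, length) for every run of ones
    pixels = mask.flatten(order='F')
    padded = np.concatenate([[0], pixels, [0]])
    runs = np.where(padded[1:] != padded[:-1])[0] + 1  # 1-based change points
    runs[1::2] -= runs[::2]  # convert run ends into run lengths
    return ' '.join(str(x) for x in runs)

def create_submission(predictions, height, width):
    # masks are assumed to already be height x width here
    rows = [(image_id, rle_encode(mask)) for image_id, mask in predictions]
    return pd.DataFrame(rows, columns=['ImageId', 'EncodedPixels'])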
def _predict(pipeline_name, dev_mode):
    logger.info('PREDICTION')
    logger.info('reading data...')
    if dev_mode:
        logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE))
        application_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
    else:
        application_test = pd.read_csv(params.test_filepath)

    data = {'input': {'X': application_test,
                      'y': None,
                      },
            }

    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    logger.info('Start pipeline transform')
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['clipped_prediction']

    if not dev_mode:
        logger.info('creating submission file...')
        submission = create_submission(application_test, y_pred)

        logger.info('verifying submission...')
        sample_submission = pd.read_csv(params.sample_submission_filepath)
        verify_submission(submission, sample_submission)

        submission_filepath = os.path.join(params.experiment_directory, 'submission.csv')
        submission.to_csv(submission_filepath, index=None, encoding='utf-8')
        logger.info('submission persisted to {}'.format(submission_filepath))
        logger.info('submission head \n\n{}'.format(submission.head()))

        if params.kaggle_api:
            logger.info('making Kaggle submit...')
            # quote the message so a multi-word kaggle_message survives the shell
            os.system('kaggle competitions submit -c home-credit-default-risk -f {} -m "{}"'
                      .format(submission_filepath, params.kaggle_message))
def _predict(pipeline_name, dev_mode):
    logger.info('reading data in')
    if dev_mode:
        meta_test = pd.read_csv(params.test_filepath,
                                usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
                                dtype=cfg.COLUMN_TYPES['inference'],
                                nrows=cfg.DEV_SAMPLE_TEST_SIZE)
    else:
        meta_test = pd.read_csv(params.test_filepath,
                                usecols=cfg.FEATURE_COLUMNS + cfg.ID_COLUMN,
                                dtype=cfg.COLUMN_TYPES['inference'])
    meta_test['click_time'] = pd.to_datetime(meta_test['click_time'],
                                             format='%Y-%m-%d %H:%M:%S')

    data_hash_channel_send(ctx, 'Test Data Hash', meta_test)

    data = {'input': {'X': meta_test[cfg.FEATURE_COLUMNS],
                      'y': None,
                      },
            }

    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['y_pred']

    logger.info('creating submission')
    submission = create_submission(meta_test, y_pred)
    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')

    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
def _predict(pipeline_name, dev_mode):
    logger.info('reading data in')
    if dev_mode:
        meta_test = pd.read_csv(params.test_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
    else:
        meta_test = pd.read_csv(params.test_filepath)

    data = {'input': {'X': meta_test,
                      'y': None,
                      },
            }

    pipeline = PIPELINES[pipeline_name]['inference'](cfg.SOLUTION_CONFIG)
    pipeline.clean_cache()
    output = pipeline.transform(data)
    pipeline.clean_cache()
    y_pred = output['clipped_prediction']

    logger.info('creating submission test')
    submission = create_submission(meta_test, y_pred)
    submission_filepath = os.path.join(params.experiment_dir, 'submission.csv')
    submission.to_csv(submission_filepath, index=None, encoding='utf-8')

    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission.head()))
def _predict_in_chunks(pipeline_name, submit_predictions, dev_mode, chunk_size):
    meta = pd.read_csv(os.path.join(params.meta_dir,
                                    'stage{}_metadata.csv'.format(params.competition_stage)))
    meta_test = meta[meta['is_test'] == 1]
    if dev_mode:
        meta_test = meta_test.sample(9, random_state=1234)
    logger.info('processing metadata of shape {}'.format(meta_test.shape))

    submission_chunks = []
    for meta_chunk in generate_data_frame_chunks(meta_test, chunk_size):
        data = {'input': {'meta': meta_chunk,
                          'meta_valid': None,
                          'train_mode': False,
                          'target_sizes': [(300, 300)] * len(meta_chunk)
                          },
                }

        pipeline = PIPELINES[pipeline_name]['inference'](SOLUTION_CONFIG)
        pipeline.clean_cache()
        output = pipeline.transform(data)
        pipeline.clean_cache()
        y_pred = output['y_pred']

        submission_chunk = create_submission(meta_chunk, y_pred, logger, CATEGORY_IDS)
        submission_chunks.extend(submission_chunk)

    submission_filepath = os.path.join(params.experiment_dir, 'submission.json')
    submission = submission_chunks
    with open(submission_filepath, "w") as fp:
        fp.write(json.dumps(submission))
    logger.info('submission saved to {}'.format(submission_filepath))
    logger.info('submission head \n\n{}'.format(submission[0]))

    if submit_predictions:
        _make_submission(submission_filepath)
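# _predict_in_chunks serializes whatever create_submission returns straight to
# JSON, so each chunk must yield a list of COCO-style result records. A hedged
# sketch of that shape -- the 'ImageId' column and the structure of y_pred are
# assumptions; the field names follow the COCO detection-results format:
def create_submission(meta_chunk, y_pred, logger, category_ids):
    records = []
    for image_id, detections in zip(meta_chunk['ImageId'].values, y_pred):
        for category_id, segmentation, score in detections:
            records.append({'image_id': int(image_id),
                            'category_id': int(category_id),
                            'segmentation': segmentation,
                            'score': float(score)})
    return records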
def main():
    # %% Reading data
    train = pd.read_csv('./data/cs-training.csv', index_col=0)
    X = train.drop('SeriousDlqin2yrs', axis=1)
    y = train.SeriousDlqin2yrs

    # %% Feature Engineering
    print(f'Starting shape: {X.shape}')
    X = (X.pipe(utils.replace_w_sensible_values)
          .pipe(utils.replace_na)
          .pipe(utils.log_transform_df)
          .pipe(utils.add_AgeDecade)
          .pipe(utils.add_boolean_DebtRatio_33)
          .pipe(utils.add_boolean_DebtRatio_43)
          .pipe(utils.add_features_per_dependent)
          .pipe(utils.add_features_per_creditline)
          .pipe(utils.add_features_per_estate)
          .pipe(utils.add_features_distance_from_mean)
          .pipe(utils.add_features_distance_from_median)
          .pipe(utils.add_features_distance_from_std))
    print(f'Post feature engineering shape: {X.shape}')

    # %% Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=RANDOM_STATE,
                                                        stratify=y)

    # %% LightGBM Classifier
    lgb_model = lgb.LGBMClassifier(
        silent=False,
        random_state=RANDOM_STATE,
        objective='binary',
        metrics='auc',
        boosting='gbdt',
        scale_pos_weight=13.960106382978724  # T/P - 1
    )

    # %% Stratified Kfold parameters
    # shuffle=True is required for random_state to take effect in StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    # %% Tuning Parameters for RandomSearch
    tuning_params = {
        'num_leaves': [5, 10, 15, 31, 40, 50],
        'scale_pos_weight': [1, 10, 14, 16],  # T/P - 1 = 13.96
        'n_estimators': [100, 250, 500, 750, 1000],
        'learning_rate': [0.025, 0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'min_child_weight': range(3, 6, 1),
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)],
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
    }
    gs = RandomizedSearchCV(estimator=lgb_model,
                            param_distributions=tuning_params,
                            n_iter=50,
                            scoring='roc_auc',
                            cv=skf,
                            refit=True,
                            verbose=True)
    gs.fit(X_train, y_train)

    # %% Extraction and selection of final features
    impt_features = utils.get_feature_importance(gs, X_train)
    impt_features.to_csv(f'output/{RANDOM_PREFIX}_impt_features.csv', index=False)
    final_features = impt_features[impt_features['importance'] > 0].feature.values
    X_train = X_train.loc[:, final_features]
    X_test = X_test.loc[:, final_features]

    # %% Final Model
    best_lgb = lgb.LGBMClassifier().set_params(**gs.best_params_)
    best_lgb.fit(X_train, y_train)

    # %% Calculating model performance, plotting AUC and PRC curves
    utils.calculate_model_performance(best_lgb, X_train, X_test, y_train, y_test,
                                      RANDOM_PREFIX)

    # %% Create Submission for Kaggle
    utils.create_submission(best_lgb, final_features, RANDOM_PREFIX)
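# main() ends by delegating to utils.create_submission(best_lgb, final_features,
# RANDOM_PREFIX). A sketch of what that helper plausibly does: score the Kaggle
# test file with the fitted model and write the submission CSV. The test path,
# the assumption that the test frame already carries the engineered features,
# and the output naming are all hypothetical:
import pandas as pd

def create_submission(model, features, prefix):
    test = pd.read_csv('./data/cs-test.csv', index_col=0)  # hypothetical path
    proba = model.predict_proba(test.loc[:, features])[:, 1]
    submission = pd.DataFrame({'Id': test.index, 'Probability': proba})
    submission.to_csv(f'output/{prefix}_submission.csv', index=False)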
def main(args):
    # 1. Prepare data & models
    # separate transformations for validation and test data did not reduce the MSE loss
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((SCALE_SIZE, SCALE_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.Grayscale(3), ("image", )),  # grayscale image for best score
        TransformByKeys(transforms.ColorJitter(brightness=[0.8, 1.2]), ("image", )),  # randomly chosen brightness in range
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
                        ("image", )),
    ])

    print('----------------------------------------------------------')
    print('Script for Kaggle competition "Thousand Facial Landmarks"')
    print('----------------------------------------------------------')
    print('Reading data...')
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                             train_transforms, split="train")
    train_dataloader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                       num_workers=0, pin_memory=True,
                                       shuffle=True, drop_last=True)
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                           train_transforms, split="val")
    val_dataloader = data.DataLoader(val_dataset, batch_size=args.batch_size,
                                     num_workers=0, pin_memory=True,
                                     shuffle=False, drop_last=False)

    print("Creating model...")
    # default GPU device, because training this net on a CPU would take forever :)
    device = torch.device("cuda:0")
    # this network was selected through experimentation from the list: resnet18,
    # resnet34, resnext50, resnext101, alexnet, InceptionV3, InceptionV4, etc.
    model = models.resnext50_32x4d(pretrained=True)
    # adding new layers with regularization (dropout or batchnorm) had no effect
    model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, amsgrad=True)
    loss_fn = fnn.mse_loss

    # 2. Train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model, train_dataloader, loss_fn, optimizer, device=device)
        val_loss = validate(model, val_dataloader, loss_fn, device=device)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}".format(
            epoch, train_loss, val_loss))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(f"{args.name}_best.pth", "wb") as fp:
                torch.save(model.state_dict(), fp)

    # 3. Predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'test'),
                                            train_transforms, split="test")
    test_dataloader = data.DataLoader(test_dataset, batch_size=args.batch_size,
                                      num_workers=0, pin_memory=True,
                                      shuffle=False, drop_last=False)

    with open(f"{args.name}_best.pth", "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(f"{args.name}_test_predictions.pkl", "wb") as fp:
        pickle.dump({"image_names": test_dataset.image_names,
                     "landmarks": test_predictions}, fp)

    create_submission(args.data, test_predictions, f"{args.name}_submit.csv")
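# The landmark scripts in this section all finish with create_submission(args.data,
# test_predictions, csv_path). A hedged sketch of such a helper, assuming the
# predictions arrive as an (N, 2 * NUM_PTS) array; the extra image_names argument
# and the Point_M{k}_X/Y column naming are illustrative, not taken from the
# repository:
import pandas as pd

def create_submission(data_dir, test_predictions, out_path, image_names=()):
    # data_dir kept for signature parity; names are assumed to be passed in
    preds = test_predictions.reshape(len(test_predictions), -1)
    columns = {'file_name': list(image_names)}
    for k in range(preds.shape[1] // 2):
        columns[f'Point_M{k}_X'] = preds[:, 2 * k]
        columns[f'Point_M{k}_Y'] = preds[:, 2 * k + 1]
    pd.DataFrame(columns).to_csv(out_path, index=False)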
bst = xgb.train(
    params=params,
    dtrain=x_train,
    num_boost_round=132,
    # early_stopping_rounds=4,
    verbose_eval=1)
by_test = bst.predict(x_test)

# C
cx_train, cy_train, cx_test, c_idxs = get_data('c')
x_train = xgb.DMatrix(cx_train, label=cy_train)
x_test = xgb.DMatrix(cx_test)
neg_pos_rate = np.sum(cy_train == 0) / np.sum(cy_train == 1)
params = {
    'max_depth': 2,
    'eta': 0.1,  # learning rate
    # 'scale_pos_weight': neg_pos_rate,  # Balance classes?
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': ['logloss']
}
bst = xgb.train(
    params=params,
    dtrain=x_train,
    num_boost_round=142,
    # early_stopping_rounds=10,
    verbose_eval=1)
cy_test = bst.predict(x_test)

# Create submission
create_submission(ay_test, by_test, cy_test, a_idxs, b_idxs, c_idxs)
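# Here create_submission receives separate predictions for the three question
# groups plus the row indices each group came from, so it presumably stitches
# them back into one frame. A hedged sketch of that stitching, with hypothetical
# 'id'/'pred' column names and the assumption that the index arrays partition
# one contiguous test set:
import numpy as np
import pandas as pd

def create_submission(ay, by, cy, a_idxs, b_idxs, c_idxs, out_file='submission.csv'):
    n = len(a_idxs) + len(b_idxs) + len(c_idxs)
    preds = np.zeros(n)
    for y, idxs in ((ay, a_idxs), (by, b_idxs), (cy, c_idxs)):
        preds[np.asarray(idxs)] = y
    pd.DataFrame({'id': np.arange(n), 'pred': preds}).to_csv(out_file, index=False)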
import models, utils, datasets, predict
import logging, sys

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

scales = {'Chicago': (0.78, 0.88, 1),
          'Chicago RAC': (0.6, 0.8, 0.1),
          'New Haven': (0.89, 1, 0.68),
          'New Haven RAC': (1, 1, 1),
          'Oakland': (0.84, 1, 0.51),
          'Oakland RAC': (1, 0.94, 0.23),
          'Richmond': (0.64, 1, 1),
          'Richmond RAC': (1, 1, 1)}

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print("No model name was given! Run again using format: \n\t"
              "python test.py modelname")
    else:
        modelname = sys.argv[1]
        pred = models.test_model(modelname)
        categories = datasets.load_dataset('Categories')
        n_pred = pred.shape[0]
        pred = predict.apply_scales(pred, categories[-n_pred:], scales)
        name = modelname + ".csv"
        utils.create_submission(name, pred)
        print("Saved submission with name %s" % name)
def main(args):
    os.makedirs("runs", exist_ok=True)

    # 1. prepare data & models
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.25, 0.25, 0.25]),
                        ("image", )),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, "train"),
                                             train_transforms, split="train")
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size,
                                  num_workers=4, pin_memory=True,
                                  shuffle=True, drop_last=True)
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, "train"),
                                           train_transforms, split="val")
    val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size,
                                num_workers=4, pin_memory=True,
                                shuffle=False, drop_last=False)

    device = torch.device("cuda:0")  # if args.gpu and torch.cuda.is_available() else torch.device("cpu")

    print("Creating model...")
    model = models.resnet18(pretrained=True)
    model.requires_grad_(False)
    model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)
    model.fc.requires_grad_(True)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, amsgrad=True)
    loss_fn = fnn.mse_loss

    time.sleep(60)

    # 2. train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model, train_dataloader, loss_fn, optimizer, device=device)
        val_loss = validate(model, val_dataloader, loss_fn, device=device)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}".format(
            epoch, train_loss, val_loss))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(os.path.join("runs", f"{args.name}_best.pth"), "wb") as fp:
                torch.save(model.state_dict(), fp)

    # 3. predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, "test"),
                                            train_transforms, split="test")
    test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size,
                                 num_workers=4, pin_memory=True,
                                 shuffle=False, drop_last=False)

    with open(os.path.join("runs", f"{args.name}_best.pth"), "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(os.path.join("runs", f"{args.name}_test_predictions.pkl"), "wb") as fp:
        pickle.dump({"image_names": test_dataset.image_names,
                     "landmarks": test_predictions}, fp)

    create_submission(args.data, test_predictions,
                      os.path.join("runs", f"{args.name}_submit.csv"))
    # tail of the LightGBM fold loop (its header is not part of this fragment)
    # oof_prediction[val_] = la.predict(val_X)
    oof_prediction[val_] = light_gbm.transform(val_X)['prediction']
    oof_prediction[oof_prediction < 0] = 0
    # _preds = la.predict(test_X)
    _preds = light_gbm.transform(test_X)['prediction']
    _preds[_preds < 0] = 0
    sub_prediction += np.expm1(_preds) / len(folds)
    oof_scores.append(mean_squared_error(TARGET[val_], oof_prediction[val_]) ** 0.5)
    print('Fold %d RMSE : %.5f' % (fold_ + 1, oof_scores[-1]))
    gc.collect()

# Lasso
la = linear_model.Lasso()
TARGET = pd.DataFrame(TARGET)
for fold_, (trn_, val_) in enumerate(folds):
    trn_X, trn_y = train_X.iloc[trn_], TARGET.iloc[trn_]
    val_X, val_y = train_X.iloc[val_], TARGET.iloc[val_]
    la.fit(trn_X, trn_y)

    oof_prediction[val_] = la.predict(val_X)
    oof_prediction[oof_prediction < 0] = 0
    _preds = la.predict(test_X)
    _preds[_preds < 0] = 0
    sub_prediction += np.expm1(_preds) / len(folds)

    oof_scores.append(mean_squared_error(TARGET.iloc[val_], oof_prediction[val_]) ** 0.5)
    print('Fold %d RMSE : %.5f' % (fold_ + 1, oof_scores[-1]))
    gc.collect()

submission = utils.create_submission(sub_prediction, test)
submission.to_csv("first_trial.csv", index=False)
prediction_2 = predict2(image_fps_val, min_conf=0.92, augment=False)

def merge_predictions(prediction, prediction_2):
    prediction_3 = copy.deepcopy(prediction)
    for patient_id in list(prediction_2.keys()):
        if len(prediction_2[patient_id]) > 0:
            prediction_3[patient_id] = []
    return prediction_3

prediction_3 = merge_predictions(prediction, prediction_2)
iou_all_mean, tp, fp, tn, fn = iou(truth, prediction_3)
print(iou_all_mean, tp, fp, tn, fn)
# 0.21805178140096618 248 235 948 69

# Prepare prediction set on test data
if True:
    image_fps_test = get_image_fps(TEST_DIR)
    image_fps_test.sort()

    prediction_test = predict(image_fps_test, min_conf=0.96, augment=True)
    prediction_test_2 = predict2(image_fps_test, min_conf=0.92, augment=False)
    prediction_test_3 = merge_predictions(prediction_test, prediction_test_2)

    create_submission(prediction_test_3)

submission = pd.read_csv('prediction.csv')
def main(args):
    # 1. prepare data & models
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ("image", ),
        ),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, "train"),
                                             train_transforms, split="train",
                                             debug=args.debug)
    train_dataloader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                       num_workers=4, pin_memory=True,
                                       shuffle=True, drop_last=True)
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, "train"),
                                           train_transforms, split="val",
                                           debug=args.debug)
    val_dataloader = data.DataLoader(val_dataset, batch_size=args.batch_size,
                                     num_workers=4, pin_memory=True,
                                     shuffle=False, drop_last=False)

    print("Creating model...")
    device = torch.device("cuda:0") if args.gpu else torch.device("cpu")
    model = models.resnet50(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)
    model.to(device)

    # stage 1: train only the new head, keep the backbone frozen
    for name, child in model.named_children():
        if name in ["fc"]:
            for param in child.parameters():
                param.requires_grad = True
        else:
            for param in child.parameters():
                param.requires_grad = False

    optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                          lr=args.learning_rate, momentum=0.9, weight_decay=1e-04)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.1,
                                              steps_per_epoch=len(train_dataloader),
                                              epochs=args.epochs)
    loss = L.WingLoss(width=10, curvature=2, reduction="mean")

    # 2. train & validate
    print("Ready for training...")
    for epoch in range(args.epochs):
        train_loss = train(model, train_dataloader, loss, optimizer,
                           device=device, scheduler=scheduler)
        val_loss = validate(model, val_dataloader, loss, device=device)
        print("Epoch #{:2}:\ttrain loss: {:6.3}\tval loss: {:6.3}".format(
            epoch, train_loss, val_loss))

    # 2.1. train continued: unfreeze everything with layerwise learning rates
    for p in model.parameters():
        p.requires_grad = True
    optimizer = optim.AdamW(
        [
            {"params": model.conv1.parameters(), "lr": 1e-6},
            {"params": model.bn1.parameters(), "lr": 1e-6},
            {"params": model.relu.parameters(), "lr": 1e-5},
            {"params": model.maxpool.parameters(), "lr": 1e-5},
            {"params": model.layer1.parameters(), "lr": 1e-4},
            {"params": model.layer2.parameters(), "lr": 1e-4},
            {"params": model.layer3.parameters(), "lr": 1e-3},
            {"params": model.layer4.parameters(), "lr": 1e-3},
            {"params": model.avgpool.parameters(), "lr": 1e-2},
            {"params": model.fc.parameters(), "lr": 1e-2},
        ],
        lr=args.learning_rate,
        weight_decay=1e-06,
        amsgrad=True,
    )
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)

    print("Ready for training again...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model, train_dataloader, loss, optimizer,
                           device=device, scheduler=scheduler)
        val_loss = validate(model, val_dataloader, loss, device=device)
        print("Epoch #{:2}:\ttrain loss: {:6.3}\tval loss: {:6.3}".format(
            epoch, train_loss, val_loss))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # save where the prediction step below expects to find the checkpoint
            os.makedirs("submit", exist_ok=True)
            with open(f"submit/{args.name}_best.pth", "wb") as fp:
                torch.save(model.state_dict(), fp)

    # 3. predict
    if not args.debug:
        test_dataset = ThousandLandmarksDataset(os.path.join(args.data, "test"),
                                                train_transforms, split="test",
                                                debug=args.debug)
        test_dataloader = data.DataLoader(test_dataset, batch_size=args.batch_size,
                                          num_workers=4, pin_memory=True,
                                          shuffle=False, drop_last=False)

        with open(f"submit/{args.name}_best.pth", "rb") as fp:
            best_state_dict = torch.load(fp, map_location="cpu")
            model.load_state_dict(best_state_dict)

        test_predictions = predict(model, test_dataloader, device)
        with open(f"submit/{args.name}_test_predictions.pkl", "wb") as fp:
            pickle.dump({"image_names": test_dataset.image_names,
                         "landmarks": test_predictions}, fp)

        create_submission(args.data, test_predictions, f"submit/{args.name}_submit.csv")
print('Dropping Id columns in data...', end='')
Xtrain = Xtrain.drop(['Id'], axis=1)
Ytrain = Ytrain.drop(Ytrain.columns[0], axis=1)
Xtest_id = Xtest['Id']  # to save in submission file
Xtest = Xtest.drop(['Id'], axis=1)
print('Done')

# print('Xtrain', Xtrain)
# print('ytrain', Ytrain.values.ravel())
# print('Xtest', Xtest)

log = {}

print('Training with gradient_boosting_tree_model...', end='')
Ytest, gbt_log = gradient_boosting_tree_model(Xtrain, Ytrain, Xtest)
log['gbt_log'] = gbt_log
create_submission(Xtest_id, Ytest, 'gradient_boosting_tree_model')
print('Done')

print('Training with random_forest_model...', end='')
Ytest, rf_log = random_forest_model(Xtrain, Ytrain, Xtest)
log['rf_log'] = rf_log
create_submission(Xtest_id, Ytest, 'random_forest_model')
print('Done')

print('Training with extra_trees_model...', end='')
Ytest, et_log = extra_trees_model(Xtrain, Ytrain, Xtest)
log['et_log'] = et_log
create_submission(Xtest_id, Ytest, 'extra_trees_model')
print('Done')

print('Training with xgboost_model...', end='')
def main(args):
    os.makedirs("runs", exist_ok=True)

    # 1. prepare data & models
    # train_transforms = transforms.Compose([
    #     ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
    #     CropCenter(CROP_SIZE),
    #     TransformByKeys(transforms.ToPILImage(), ("image",)),
    #     TransformByKeys(transforms.ToTensor(), ("image",)),
    #     TransformByKeys(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ("image",)),
    # ])
    crop_size = (224, 224)
    train_transforms = transforms.Compose([
        CropFrame(9),
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        FlipHorizontal(),
        Rotator(30),
        # CropRectangle(crop_size),
        ChangeBrightnessContrast(alpha_std=0.05, beta_std=10),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225]),
                        ("image", )),
    ])
    valid_transforms = transforms.Compose([
        CropFrame(9),
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        # CropRectangle(crop_size),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225]),
                        ("image", )),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, "train"),
                                             train_transforms, split="train")
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size,
                                  num_workers=4, pin_memory=True,
                                  shuffle=True, drop_last=True)
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, "train"),
                                           valid_transforms, split="val")
    val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size,
                                num_workers=4, pin_memory=True,
                                shuffle=False, drop_last=False)

    device = torch.device("cuda:0") if args.gpu and torch.cuda.is_available() else torch.device("cpu")

    print("Creating model...")
    # model = models.resnext50_32x4d(pretrained=True)
    # model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)
    # checkpoint = torch.load("./runs/baseline_full3_best.pth", map_location='cpu')
    # model.load_state_dict(checkpoint, strict=True)
    model = RESNEXT_steroid()
    model.to(device)

    for p in model.base_net.parameters():
        p.requires_grad = False
    # model.base_net[8].requires_grad = True
    for p in model.fc.parameters():
        p.requires_grad = True
    for p in model.linear7.parameters():
        p.requires_grad = True
    for p in model.attention.parameters():
        p.requires_grad = True
    for p in model.linear1.parameters():
        p.requires_grad = True
    # model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, amsgrad=True)
    # criterion = AdaptiveWingLoss()
    # criterion = torch.nn.MSELoss(size_average=True)
    # loss_fn = fnn.mse_loss
    criterion = fnn.l1_loss
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                        factor=1 / np.sqrt(10),
                                                        patience=4, verbose=True,
                                                        threshold=0.01,
                                                        threshold_mode='abs',
                                                        cooldown=0, min_lr=1e-6,
                                                        eps=1e-08)

    # 2. train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model, train_dataloader, criterion, optimizer, device=device)
        val_loss, mse_loss = validate(model, val_dataloader, criterion, device=device)
        lr_scheduler.step(val_loss)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}\tmse loss: {:5.2}".format(
            epoch, train_loss, val_loss, mse_loss))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(os.path.join("runs", f"{args.name}_best.pth"), "wb") as fp:
                torch.save(model.state_dict(), fp)

    # 3. predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, "test"),
                                            train_transforms, split="test")
    test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size,
                                 num_workers=4, pin_memory=True,
                                 shuffle=False, drop_last=False)

    with open(os.path.join("runs", f"{args.name}_best.pth"), "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(os.path.join("runs", f"{args.name}_test_predictions.pkl"), "wb") as fp:
        pickle.dump({"image_names": test_dataset.image_names,
                     "landmarks": test_predictions}, fp)

    create_submission(args.data, test_predictions,
                      os.path.join("runs", f"{args.name}_submit.csv"))
# (fragment: the start of the model_name assignment was cut off upstream)
datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')

logr.info('writing fit {} pipeline to disk as {}'.format(job, model_name))
try:
    joblib.dump(pipeline,
                os.path.join('saved_models', model_name) + '.pkl',
                compress=3)
except OverflowError as e:
    # this is annoying; look into it later
    logr.warning('joblib write failed with error={}'.format(e))
    logr.info('proceeding with predictions without writing model to disk')

# do something useful with the fit model
if args.submission:
    # make predictions for a leaderboard submission
    logr.info('writing predictions to formatted submission file')
    utils.create_submission(predictions,
                            pipeline_detail['name'],
                            comment=pipeline_detail['note'])
else:
    # if we already did CV through the gridsearch, just take
    # the best score and make predictions
    if hasattr(pipeline, 'best_params_'):
        logr.info('predicting test values with best-choice gridsearch params')
        predictions = pipeline.predict(X_test)
        # fake an array of CV scores to play nice with plot formatting later
        scores = np.array([pipeline.best_score_])
    else:
        # otherwise, run a cross-validation for test accuracy
        cv = 3
        logr.info('cross validating model predictions with cv={}'.format(cv))
        predictions = cross_val_predict(pipeline, X_test, y_test, cv=cv)
def run(args):
    task = args['model']
    submit = args['submission']

    # 1.) Load data for training model
    X_train_full, y_train_full = utils.load_train_data(task)

    if submit:
        # making a submission; train on all given data
        print('fitting models to entire training set')
        X_train, y_train = X_train_full, y_train_full
        X_test = utils.load_test_data(task)
    else:
        # running an experiment - cross validate with train/test split
        test_size = args['test_size']
        print('fitting models to cv train/test split with train% = {}'.format(1 - test_size))
        X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full,
                                                          test_size=test_size,
                                                          random_state=args['random_state'])

    # 2.) Get pipeline
    if task == 'Visit':
        pipeline_detail = visit[args['expt']]
        X_train, y_train = utils.sample_negatives(X_train, y_train, 2)
        if not submit:
            X_val, y_val = utils.sample_negatives(X_val, y_val, 1)
    else:
        pipeline_detail = rating[args['expt']]
    pipeline = pipeline_detail['pl']

    # Fit model to training data
    print('fitting model to array sizes (xtrain, ytrain)={}'.format(
        [i.shape for i in [X_train, y_train]]))
    print('fitting experiment pipeline with signature={}'.format(pipeline))
    pipeline.fit(X_train, y_train)

    # 3.) For non-submission experiments, get the best parameters from grid search
    if submit:
        fname_spec = '_submission_'
    else:
        # log all results + call out the winner
        if hasattr(pipeline, 'best_params_'):
            print('best gridsearch score={}'.format(pipeline.best_score_))
            print('best set of pipeline params={}'.format(pipeline.best_params_))
            print('now displaying all pipeline param scores...')
            cv_results = pipeline.cv_results_
            for params, mean_score, std_score in zip(cv_results['params'],
                                                     cv_results['mean_test_score'],
                                                     cv_results['std_test_score']):
                print("{:0.3f} (+/-{:0.03f}) for {}".format(mean_score, std_score * 2, params))
        fname_spec = '_expt_'
    model_name = utils.short_name(pipeline) + fname_spec + datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')

    # 4.) Prepare submission
    if submit:
        print('writing predictions to formatted submission file')
        predictions = pipeline.predict(X_test)
        if hasattr(pipeline, 'best_params_'):
            print('predicting test values with best-choice gridsearch params')
        utils.create_submission(predictions, pipeline_detail['name'], X_test)
    else:
        cv = args['k-fold']
        print('cross validating model predictions with cv={}'.format(cv))
        predictions = cross_val_predict(pipeline, X_val, y_val, cv=cv)
        # print("cross val prediction", accuracy_score(y_val, predictions))
        print("cross val prediction", mean_squared_error(y_val, predictions))

        predictions_train = pipeline.predict(X_train)
        predictions_test = pipeline.predict(X_val)
        if task == 'Visit':
            print('obtained train accuracy = {:.2f}, test accuracy = {:.2f} pipeline={} '.format(
                accuracy_score(y_train, predictions_train),
                accuracy_score(y_val, predictions_test),
                pipeline))
            print('calculating confusion matrix')
            try:
                cf = confusion_matrix(y_val, predictions)
                print("confusion matrix: ", cf)
                sb.heatmap(cf)
            except RuntimeError as e:
                print('plotting error. matplotlib backend may need to be changed '
                      '(see readme). error={}'.format(e))
                print('plot may still have been saved, and model has already been saved to disk.')
        else:
            print('obtained train mse = {:.2f} test mse={}, pipeline={} '.format(
                mean_squared_error(y_train, predictions_train),
                mean_squared_error(y_val, predictions_test),
                pipeline))

        if args['cross_val_score']:
            # this gives a better idea of uncertainty, but it costs 'cv' more fits
            print('cross validating model accuracy with cv={}'.format(cv))
            scores = cross_val_score(pipeline, X_val, y_val, cv=cv)
            print('obtained accuracy={:0.2f}% +/- {:0.2f} with cv={}, pipeline={} '.format(
                scores.mean() * 100, scores.std() * 100 * 2, cv, pipeline))

    print('completed with pipeline {}'.format(pipeline_detail['name']))