def run_training(fold_number):
    """Train and validate the network on a single cross-validation fold.

    Rows of the fold table whose ``fold`` column equals ``fold_number`` are
    used for validation; every other row is used for training.

    Args:
        fold_number: Value of the ``fold`` column selecting the held-out
            validation split.
    """
    seed_everything(config.seed)
    device = torch.device(config.device)

    # Load the annotation table and derive the per-image fold assignment.
    data_frame = read_csv(config.train_csv)
    df_folds = kfold(data_frame)

    # Training split: everything outside the requested fold.
    train_ids = df_folds[df_folds['fold'] != fold_number].index.values
    train_dataset = WheatDataset(
        image_ids=train_ids,
        data_frame=data_frame,
        transforms=get_train_transforms(),
        test=False,
    )

    # Validation split: only the requested fold.
    valid_ids = df_folds[df_folds['fold'] == fold_number].index.values
    validation_dataset = WheatDataset(
        image_ids=valid_ids,
        data_frame=data_frame,
        transforms=get_valid_transforms(),
        test=True,
    )

    # Settings common to both loaders.
    shared_loader_kwargs = {
        'batch_size': config.batch_size,
        'num_workers': config.num_workers,
        'pin_memory': False,
        'collate_fn': collate_fn,
    }
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        drop_last=True,
        **shared_loader_kwargs,
    )
    val_loader = torch.utils.data.DataLoader(
        validation_dataset,
        shuffle=False,
        sampler=SequentialSampler(validation_dataset),
        **shared_loader_kwargs,
    )

    # Build the network; wrap it in DataParallel when more than one GPU id
    # is configured.
    net = get_net()
    use_data_parallel = len(config.gpu_ids) > 1
    if use_data_parallel:
        net = nn.DataParallel(net)
    net.to(device)

    # Hand the loaders to the project trainer.
    fold_trainer = Trainner(model=net, config=config, fold_number=fold_number)
    fold_trainer.train(train_loader, val_loader)
def train():
    """Run k-fold LightGBM training and write its predictions to CSV.

    Features come from ``utils.load_features``; ``utils.kfold`` (called with
    ``use_smote=True``) returns the out-of-fold train predictions and the
    test predictions. Two files are written:

    * ``<predictions dir>/<version>-lightgbm.csv`` with columns
      ``SK_ID_CURR``, ``TARGET`` (the labels) and ``lightgbm``.
    * ``<submissions dir>/<version>-lightgbm.csv`` with columns
      ``SK_ID_CURR`` and ``TARGET`` (the test predictions).

    ``SK_ID_CURR`` is written as loaded; no dtype cast is applied here.
    """
    # Run configuration.
    version = '1.1'
    data_dir = '../../data/processed/'
    submissions_dir = '../../data/submissions/'
    predictions_dir = '../../data/predictions/'
    random_seed = 8675309
    # Presumably caps the number of rows loaded; confirm in
    # utils.load_features.
    sample_size = 50000
    n_folds = 5

    # Hyper-parameters forwarded to the classifier wrapper.
    params = {
        'nthread': 8,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1
    }

    loaded = utils.load_features(data_dir, version, sample_size)
    train_features, labels, test_features, train_ids, test_ids = loaded

    oof_train, oof_test = utils.kfold(
        classifier_builder=LightGBMWrapper,
        base_classifier=lightgbm.LGBMClassifier,
        classifier_params=params,
        train=train_features,
        labels=labels,
        test=test_features,
        n_folds=n_folds,
        random_seed=random_seed,
        use_smote=True,
    )

    train_columns = {
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'lightgbm': oof_train
    }
    test_columns = {'SK_ID_CURR': test_ids, 'TARGET': oof_test}
    train_frame = pd.DataFrame(train_columns)
    test_frame = pd.DataFrame(test_columns)

    # Both outputs share one file name and differ only in directory.
    file_name = version + '-lightgbm.csv'
    train_frame.to_csv(predictions_dir + file_name, index=False)
    test_frame.to_csv(submissions_dir + file_name, index=False)
def train():
    """Run k-fold XGBoost training and write its predictions to CSV.

    Features come from ``utils.load_features``; ``utils.kfold`` returns the
    out-of-fold train predictions and the test predictions. Two files are
    written:

    * ``<predictions dir>/<version>-xgboost.csv`` with columns
      ``SK_ID_CURR``, ``TARGET`` (the labels) and ``xgboost``; missing values
      in this frame are replaced by 0 before it is written.
    * ``<submissions dir>/<version>-xgboost.csv`` with columns
      ``SK_ID_CURR`` and ``TARGET`` (the test predictions).

    ``SK_ID_CURR`` is cast to ``int32`` in both files.
    """
    # Run configuration.
    version = '1.3'
    data_dir = '../../data/processed/'
    submissions_dir = '../../data/submissions/'
    predictions_dir = '../../data/predictions/'
    random_seed = 8675309
    sample_size = None
    n_folds = 5

    # Parameters forwarded to the classifier wrapper.
    xgb_params = {
        'learning_rate': 0.1,
        'n_estimators': 10000,
        'max_depth': 4,
        'min_child_weight': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'nthread': 8,
        'seed': random_seed,
        'scale_pos_weight': 2.5,
        'reg_alpha': 1.2,
        'early_stopping_rounds': 50,
        'verbose': 20,
        'eval_metric': 'auc'
    }

    loaded = utils.load_features(data_dir, version, sample_size)
    train_features, labels, test_features, train_ids, test_ids = loaded

    oof_train, oof_test = utils.kfold(
        classifier_builder=XgboostWrapper,
        base_classifier=XGBClassifier,
        classifier_params=xgb_params,
        train=train_features,
        labels=labels,
        test=test_features,
        n_folds=n_folds,
        random_seed=random_seed,
    )

    # Out-of-fold train predictions: zero-fill gaps, then cast the id.
    train_columns = {
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'xgboost': oof_train
    }
    train_frame = pd.DataFrame(train_columns).fillna(0)
    train_frame['SK_ID_CURR'] = train_frame['SK_ID_CURR'].astype('int32')

    # Test predictions: only the id cast is applied.
    test_columns = {'SK_ID_CURR': test_ids, 'TARGET': oof_test}
    test_frame = pd.DataFrame(test_columns)
    test_frame['SK_ID_CURR'] = test_frame['SK_ID_CURR'].astype('int32')

    # Both outputs share one file name and differ only in directory.
    file_name = version + '-xgboost.csv'
    train_frame.to_csv(predictions_dir + file_name, index=False)
    test_frame.to_csv(submissions_dir + file_name, index=False)
############################################################ # k-fold train/test ############################################################ training_start_time = time.time() timestamp = str(training_start_time) running_score = [] plateaued = False try: print('Performing {} iteration(s) of training.'.format(config.ITERATIONS)) for iteration in range(config.ITERATIONS): if plateaued is True: break print('Starting iteration #{}'.format(iteration)) for d, v_in, v_out, t_in, t_out in utils.kfold(input_data, output_data, config.K_FOLDS): print('Performing k-fold #{:02}'.format(d)) if config.TRAIN_MODEL is True: if config.RANDOMIZE_INPUT_DATA is True: tf_log_dir = os.path.join( config.TENSORBOARD_LOG_DATA, 'rand') elif config.CONTIGUOUS_INPUT_DATA is True: tf_log_dir = os.path.join( config.TENSORBOARD_LOG_DATA, 'cont') else: tf_log_dir = os.path.join( config.TENSORBOARD_LOG_DATA, 'step') tf_log_dir = os.path.join(
def _zero_out_missing_and_inf(features):
    """Return a copy of *features* with NaN, +inf and -inf replaced by 0."""
    return features.fillna(0).replace([np.inf, -np.inf], 0)


def _write_oof_predictions(model_name, oof_train, oof_test, labels,
                           train_ids, test_ids, train_csv, test_csv):
    """Write one model's out-of-fold train and test predictions to CSV.

    The train file has columns ``SK_ID_CURR``, ``TARGET`` (the labels) and
    ``model_name`` (the out-of-fold predictions); the test file has columns
    ``SK_ID_CURR`` and ``TARGET`` (the test predictions). ``SK_ID_CURR`` is
    cast to ``int32`` in both.
    """
    df_oof_train = pd.DataFrame({
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        model_name: oof_train
    })
    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')

    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(train_csv, index=False)
    df_oof_test.to_csv(test_csv, index=False)


def train():
    """Run k-fold training for four scikit-learn models and save predictions.

    The models are fitted in this order: random forest, extra trees,
    Gaussian naive Bayes, logistic regression. Each is fitted through
    ``utils.kfold`` on features in which NaN and +/-inf have been replaced
    by 0. For each model two files named ``<version>-<model-name>.csv`` are
    written, one under the predictions directory (out-of-fold train
    predictions) and one under the submissions directory (test predictions).
    """
    path_to_data = '../../data/processed/'
    path_to_output = '../../data/submissions/'
    path_to_preds = '../../data/predictions/'
    version = '1.3'
    random_seed = 8675309
    sample_size = None
    n_folds = 5

    rf_params = {'n_jobs': -1, 'n_estimators': 100}
    lr_params = {'C': 0.001}
    et_params = {}
    nb_params = {}

    train, labels, test, train_ids, test_ids = utils.load_features(
        path_to_data, version, sample_size)

    train_df = _zero_out_missing_and_inf(train)
    test_df = _zero_out_missing_and_inf(test)

    # (output name, estimator class, estimator parameters), in run order.
    # The output name is both the prediction column in the train CSV and
    # the file-name suffix.
    models = (
        ('random-forest', RandomForestClassifier, rf_params),
        ('extra-trees', ExtraTreesClassifier, et_params),
        ('naive-bayes', GaussianNB, nb_params),
        ('logistic-regression', LogisticRegression, lr_params),
    )

    for model_name, base_classifier, classifier_params in models:
        oof_train, oof_test = utils.kfold(classifier_builder=SklearnWrapper,
                                          base_classifier=base_classifier,
                                          classifier_params=classifier_params,
                                          train=train_df,
                                          labels=labels,
                                          test=test_df,
                                          n_folds=n_folds,
                                          random_seed=random_seed)

        file_name = version + '-' + model_name + '.csv'
        _write_oof_predictions(model_name, oof_train, oof_test, labels,
                               train_ids, test_ids,
                               path_to_preds + file_name,
                               path_to_output + file_name)

        # Release this model's predictions before the next model is fitted.
        del oof_train, oof_test