def __init__(self, settings):
    super(Trainer, self).__init__()
    self.settings = settings
    self.phase = settings.cmd
    self.batch_size = settings.batch_size
    self.data_dir = settings.data_dir
    self.list_dir = settings.list_dir
    self.checkpoint = settings.resume
    self.load_checkpoint = (len(self.checkpoint) > 0)
    self.num_epochs = settings.num_epochs
    self.lr = float(settings.lr)
    self.save = settings.save_on or settings.out_dir
    self.from_pause = self.settings.continu
    self.path_ctrl = settings.global_path
    self.path = self.path_ctrl.get_path

    log_dir = '' if settings.log_off else self.path_ctrl.get_dir('log')
    self.logger = Logger(scrn=True, log_dir=log_dir, phase=self.phase)

    for k, v in sorted(settings.__dict__.items()):
        self.logger.show("{}: {}".format(k, v))

    self.start_epoch = 0
    self._init_max_acc = 0.0
    self.model = None
    self.criterion = None
def set_gpc_and_logger(args):
    gpc = OutPathGetter(root=os.path.join(args.exp_dir, args.tag), suffix=args.suffix)

    log_dir = '' if args.log_off else gpc.get_dir('log')
    logger = Logger(scrn=True, log_dir=log_dir, phase=args.cmd)

    register('GPC', gpc)
    register('LOGGER', logger)

    return gpc, logger
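A minimal usage sketch for the helper above, assuming the project's OutPathGetter, Logger, and register are importable as in the snippet and that args carries the exp_dir, tag, suffix, log_off, and cmd fields it reads; the argument defaults and the 'train'/'eval' choices are illustrative, not from the original code.

# Hypothetical caller: wire up the global path getter and logger once at startup.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--exp-dir', default='./exp')    # assumed defaults, for illustration only
parser.add_argument('--tag', default='baseline')
parser.add_argument('--suffix', default='')
parser.add_argument('--log-off', action='store_true')
parser.add_argument('cmd', choices=['train', 'eval'])
args = parser.parse_args(['train'])  # e.g. launch the 'train' phase

gpc, logger = set_gpc_and_logger(args)
logger.show('logging to {}'.format(gpc.get_dir('log')))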
def __init__(self, data_dir, ckp_path, save_lr=False, list_dir='', out_dir='./', log_dir=''):
    super(Predictor, self).__init__()
    self.data_dir = data_dir
    self.list_dir = list_dir
    self.out_dir = out_dir
    self.checkpoint = ckp_path
    self.save_lr = save_lr
    self.logger = Logger(scrn=True, log_dir=log_dir, phase='test')
    self.model = None
def __init__(self, model=None, mode='folder', save_dir=None, scrn=True, log_dir=None, cuda_off=False):
    self.save_dir = save_dir
    self.output = None
    if not cuda_off and torch.cuda.is_available():
        self.device = torch.device('cuda')
    else:
        self.device = torch.device('cpu')
    assert model is not None, "The model must be assigned"
    self.model = self._model_init(model)
    if mode not in Predictor.modes:
        raise NotImplementedError
    self.logger = Logger(scrn=scrn, log_dir=log_dir, phase='predict')
    if mode == 'dataloader':
        self._predict = partial(self._predict_dataloader, dataloader=None, save_dir=save_dir)
    elif mode == 'folder':
        # self.suffix = ['.jpg', '.png', '.bmp', '.gif', '.npy']  # supported image formats
        self._predict = partial(self._predict_folder, save_dir=save_dir)
    elif mode == 'list':
        self._predict = partial(self._predict_list, save_dir=save_dir)
    elif mode == 'file':
        self._predict = partial(self._predict_file, save_dir=save_dir)
    elif mode == 'data':
        self._predict = partial(self._predict_data, save_dir=save_dir)
    else:
        raise NotImplementedError
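A hedged construction example for the dispatcher above; the model object and paths are placeholders, and the eventual _predict call is only mentioned in a comment because _predict_folder's full signature is not shown in this snippet.

# Hypothetical usage (construction only): pick 'folder' mode and bind save_dir.
# build_model() stands in for however the project instantiates its network.
model = build_model()
predictor = Predictor(model=model, mode='folder', save_dir='./outputs',
                      scrn=True, log_dir='./logs', cuda_off=True)
# predictor._predict() would then process the input folder; any remaining
# arguments depend on _predict_folder, which is not shown here.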
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
# @Time     : 2019/9/9 9:55
# @Author   : peng.wang
# @Email    : [email protected]
# @FileName : model.py
# @ProjectName : Facility_Location_FangTai
"""
from gurobipy import *
from utils.misc import Logger
import pandas as pd

# define the log file
log = Logger(log_path='../log').logger
# define the facility location problem
YEAR_DAY = 365


class FacilityLocation(object):
    """
    This class consists of the attributes needed for problem construction,
    some utility functions for post-processing, and one key function for
    building the detailed model.
    """

    def __init__(self, data, config):
        """
        :param data: data class providing all data used
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV

from utils.misc import Logger
from utils.util import generate_cutoffs
from utils.misc import save_model_to_file, mean_abs_percentage_error, xgb_mape

log = Logger(log_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), 'log')).logger
warnings.filterwarnings("ignore")

# xgboost and lightgbm are optional dependencies; probe for each one separately
# so a missing xgboost does not also mark lightgbm as unavailable.
xgb_installed = False
lgt_installed = False
try:
    import xgboost as xgb
    xgb_installed = True
except ImportError:
    pass
try:
    import lightgbm as lgt
    lgt_installed = True
except ImportError:
    pass


class TrainModel(BaseEstimator):
RESTORE_FROM = "/data/AutoPheno/green/200527/PatchNet/snapshots-fb/LEAF_UNET_B0064_S010700.pth"
SAVE_PRED_EVERY = 1000
SNAPSHOT_DIR = root_dir + 'PatchNet/snapshots' + postfix
IMGSHOT_DIR = root_dir + 'PatchNet/imgshots' + postfix
WEIGHT_DECAY = 0.0005
NUM_EXAMPLES_PER_EPOCH = 13862
NUM_STEPS_PER_EPOCH = math.ceil(NUM_EXAMPLES_PER_EPOCH / float(BATCH_SIZE))
MAX_ITER = max(NUM_EXAMPLES_PER_EPOCH * MAX_EPOCH + 1,
               NUM_STEPS_PER_EPOCH * BATCH_SIZE * MAX_EPOCH + 1)

if not os.path.exists(SNAPSHOT_DIR):
    os.makedirs(SNAPSHOT_DIR)
if not os.path.exists(IMGSHOT_DIR):
    os.makedirs(IMGSHOT_DIR)

LOG_PATH = SNAPSHOT_DIR + "/B" + format(BATCH_SIZE, "04d") + "E" + format(MAX_EPOCH, "04d") + ".log"
sys.stdout = Logger(LOG_PATH, sys.stdout)
print(DATA_LIST_PATH)
print("num of epoch:", MAX_EPOCH)
print("RESTORE_FROM:", RESTORE_FROM)
print(NUM_EXAMPLES_PER_EPOCH)


def get_arguments():
    """Parse all the arguments provided from the CLI.

    Returns:
      A list of parsed arguments.
    """
    parser = argparse.ArgumentParser(description="UNet Network")
    parser.add_argument("--set-start", default=False)
    parser.add_argument("--start-step", default=0, type=int)
def run(try_num, config):
    output_dir = f'./dae-out-{try_num}'
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    args = get_args()

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        train_features = train_features.loc[:500]
        config.update(dict(n_epochs=3, n_folds=2))

    all_features = pd.concat([train_features, test_features]).reset_index(drop=True)
    g_features_columns = [col for col in all_features.columns if col.startswith('g-')]
    c_features_columns = [col for col in all_features.columns if col.startswith('c-')]
    feature_columns = g_features_columns + c_features_columns
    n_features = len(feature_columns)

    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds, random_state=42, shuffle=True)
    logger = Logger()

    for fold_index, (train_idx, valid_idx) in enumerate(
            kfold.split(all_features.values, all_features.values)):
        print('Fold: ', fold_index + 1, flush=True)

        x_train = all_features.loc[train_idx]
        x_valid = all_features.loc[valid_idx]

        model = new_autoencoder(config.model_kind, n_features=n_features).to(DEVICE)
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)
        early_stopping = EarlyStopping(patience=10)

        best_score = np.inf
        for epoch in range(config.n_epochs):
            dataset = DaeDataset(x_train, feature_columns, noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
            train_loss = loop_train(model, criterion, dataloader, optimizer)

            dataset = DaeDataset(x_valid, feature_columns, noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset, batch_size=config.valid_batch_size, shuffle=False)
            valid_loss, _ = loop_valid(model, criterion, dataloader)

            scheduler.step(valid_loss)
            logger.update({
                'fold': fold_index,
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'val_loss': valid_loss
            })
            print(f'epoch {epoch + 1}/{config.n_epochs} - train_loss: {train_loss:.5f} - ' +
                  f'valid_loss: {valid_loss:.5f}', flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(), f'./{output_dir}/dae_fold_weight_{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

    logger.save(f'./{output_dir}/dae_log.csv')

    oof_preds = []
    for fold_index in range(config.n_folds):
        model = new_autoencoder(config.model_kind, n_features=n_features).to(DEVICE)
        model.load_state_dict(torch.load(f'./{output_dir}/dae_fold_weight_{fold_index}.pt'))
        model.eval()

        dataset = DaeDataset(all_features, feature_columns, noise_ratio=config.noise_ratio)
        dataloader = DataLoader(dataset, batch_size=config.valid_batch_size, shuffle=False)
        loss, preds = loop_valid(model, nn.MSELoss(), dataloader)

        logger.update({'fold': fold_index, 'val_loss': loss})
        print('Evaluation fold: {} - valid_loss: {:.5f}'.format(fold_index, loss), flush=True)
        oof_preds.append(preds)

    print('A Whole Evaluation Score: {:.5f}'.format(
        mean_squared_error(all_features.loc[:, feature_columns].values,
                           np.mean(oof_preds, axis=0))), flush=True)

    # for i, preds in enumerate(oof_preds):
    #     create_pred_feature_df(preds, all_features).to_csv(f'./{output_dir}/dae_features_{i}.csv', index=False)
    create_pred_feature_df(np.mean(oof_preds, axis=0), all_features).to_csv(
        f'./{output_dir}/dae_features_mean.csv', index=False)
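The DAE loop above relies on an EarlyStopping helper with a should_stop(metric) method whose implementation is not included in this snippet. Below is a minimal sketch consistent with how it is called here (patience counted in consecutive non-improving epochs); it is an assumption, not the project's actual class.

# Minimal EarlyStopping sketch (assumed behavior, not the original implementation):
# stop once the monitored loss has failed to improve for `patience` consecutive calls.
class EarlyStopping:
    def __init__(self, patience=10, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float('inf')
        self.counter = 0

    def should_stop(self, metric):
        if metric < self.best - self.min_delta:
            self.best = metric
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience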
References
----------
[1] Narula, S. & Weistroffer, H. A flexible method for nonlinear
    multicriteria decision-making problems. Systems, Man and Cybernetics,
    IEEE Transactions on, 1989, 19, 883-887.
'''
import sys, os

example_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(example_path, ".."))

from utils.misc import Logger
sys.stdout = Logger(os.path.splitext(os.path.basename(__file__))[0])

from utils import tui
from method.NAUTILUS import NAUTILUSv1, ENAUTILUS
from optimization.OptimizationMethod import PointSearch
from problem.Problem import PreGeneratedProblem

if __name__ == '__main__':
    # SciPy breaks box constraints
    method = ENAUTILUS(PreGeneratedProblem(filename=os.path.join(example_path, "AuxiliaryServices.csv")),
                       PointSearch)
    zh = tui.iter_enautilus(method)
    ci = method.current_iter
# @Author   : peng.wang
# @Email    : [email protected]
# @FileName : util.py
# @ProjectName : Facility_Location_FangTai
"""
import pandas as pd
import os
import xlwings as xw
from utils.misc import Logger
from pyecharts import options as opts
from pyecharts.charts import Geo
from pyecharts.globals import ChartType, SymbolType

# define the log file
RAW_DATA_PATH = os.path.dirname(os.path.dirname(__file__))
log = Logger(log_path=os.path.join(RAW_DATA_PATH, 'log')).logger


class DataHandler(object):
    """
    """

    def __init__(self, file, config):
        """
        :param file: file name
        :param config:
        """
        # path to read the data from
        self._PATH = os.path.join(os.path.join(RAW_DATA_PATH, 'data'), file)
        self._config = config
def run(try_num, config):
    args = get_args()
    print('args', args, flush=True)
    print('config:', config.to_dict(), flush=True)

    set_seed(config.rand_seed)

    pretrained_model = 'tf_efficientnet_b3_ns'
    model_dir = f'deepinsight-{try_num}'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    if config.dae_path:
        dae_features = pd.read_csv(config.dae_path)

    if args.debug:
        train_features = train_features.iloc[:500]
        train_targets = train_targets.iloc[:500]
        if config.dae_path:
            dae_features = pd.concat([dae_features.iloc[:500],
                                      dae_features.iloc[-3982:]]).reset_index(drop=True)
        config.update(dict(
            kfolds=3,
            n_epoch=3
        ))

    train_features = train_features.sort_values(by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)
    train_targets = train_targets.sort_values(by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)

    cat_features_columns = ["cp_dose", 'cp_time']
    num_feature_columns = [c for c in train_features.columns
                           if c != "sig_id" and c not in cat_features_columns + ['cp_type']]
    all_features_columns = cat_features_columns + num_feature_columns
    target_columns = [c for c in train_targets.columns if c != "sig_id"]
    g_feature_columns = [c for c in num_feature_columns if c.startswith("g-")]
    c_feature_columns = [c for c in num_feature_columns if c.startswith("c-")]

    if config.dae_path:
        if config.dae_strategy == 'replace':
            train_features, test_features = assign_dae_features(
                train_features, test_features, dae_features, len(num_feature_columns))
        else:
            train_features, test_features, dae_feature_columns = merge_dae_features(
                train_features, test_features, dae_features,
                len(g_feature_columns), len(c_feature_columns))
            all_features_columns += dae_feature_columns

    train_targets = train_targets.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)
    train_features = train_features.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)

    if config.normalizer == 'rank':
        train_features, test_features = normalize(train_features, test_features, num_feature_columns)

    for df in [train_features, test_features]:
        df['cp_type'] = df['cp_type'].map({'ctl_vehicle': 0, 'trt_cp': 1})
        df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
        df['cp_time'] = df['cp_time'].map({24: 0, 48: 0.5, 72: 1})

    if config.variance_target_type == 1:
        pickle_path = f'{model_dir}/variance_reduction.pkl'
        variance_target_features = num_feature_columns
        if config.dae_path and config.dae_strategy != 'replace':
            variance_target_features += dae_feature_columns
        if not os.path.exists(pickle_path):
            vt = variance_reduction_fit(train_features, variance_target_features, config.variance_threshold)
            save_pickle(vt, pickle_path)
        vt = load_pickle(pickle_path)
        train_features = variance_reduction_transform(vt, train_features, variance_target_features)
        test_features = variance_reduction_transform(vt, test_features, variance_target_features)
        print('(variance_reduction) Number of features after applying:', len(train_features.columns), flush=True)
        all_features_columns = list(train_features.columns[1:])

    skf = MultilabelStratifiedKFold(n_splits=config.kfolds, shuffle=True, random_state=config.rand_seed)
    y_labels = np.sum(train_targets.drop("sig_id", axis=1), axis=0).index.tolist()
    logger = Logger()

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_train = train_features.loc[train_index, all_features_columns].copy().values
        y_train = train_targets.iloc[train_index, 1:].copy().values
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values

        if config.normalizer == 'log':
            scaler = LogScaler()
            if config.norm_apply_all:
                scaler.fit(X_train)
                X_train = scaler.transform(X_train)
                X_valid = scaler.transform(X_valid)
            else:
                target_features = [i for i, c in enumerate(all_features_columns) if c in num_feature_columns]
                non_target_features = [i for i, c in enumerate(all_features_columns) if c not in num_feature_columns]
                scaler.fit(X_train[:, target_features])
                X_train_tr = scaler.transform(X_train[:, target_features])
                X_valid_tr = scaler.transform(X_valid[:, target_features])
                X_train = np.concatenate([X_train[:, non_target_features], X_train_tr], axis=1)
                X_valid = np.concatenate([X_valid[:, non_target_features], X_valid_tr], axis=1)
            save_pickle(scaler, f'{model_dir}/scaler-{fold_index}.pkl')

        transformer = DeepInsightTransformer(
            feature_extractor=config.extractor,
            pixels=config.resolution,
            perplexity=config.perplexity,
            random_state=config.rand_seed,
            n_jobs=-1
        ).fit(X_train)
        save_pickle(transformer, f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)

        if config.smoothing is not None:
            if config.weighted_loss_weights is not None:
                indices = get_minority_target_index(train_targets, threshold=config.weighted_loss_threshold)
                indices = [int(i not in indices) for i, c in enumerate(target_columns)]
                train_loss_function = SmoothBCEwLogits(
                    smoothing=config.smoothing,
                    weight=config.weighted_loss_weights,
                    weight_targets=indices,
                    n_labels=len(target_columns))
            else:
                train_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)
        else:
            train_loss_function = bce_loss
        eval_loss_function = bce_loss

        optimizer = optim.Adam(model.parameters(), weight_decay=config.weight_decay, lr=config.learning_rate)

        if config.scheduler_type == 'ca':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.t_max, eta_min=0, last_epoch=-1)
        elif config.scheduler_type == 'ms':
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=config.ms_scheduler_milestones, gamma=0.1)
        else:
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.1, patience=config.rp_patience, eps=1e-4, verbose=True)

        early_stopping = EarlyStopping(patience=7)
        best_score = np.inf
        start_time = time.time()

        for epoch in range(config.n_epoch):
            if config.swap_enable:
                dataset = MoAImageSwapDataset(
                    X_train, y_train, transformer,
                    image_size=config.image_size,
                    swap_prob=config.swap_prob,
                    swap_portion=config.swap_portion)
            else:
                dataset = MoAImageDataset(X_train, y_train, transformer, image_size=config.image_size)

            dataloader = DataLoader(
                dataset,
                batch_size=config.batch_size,
                shuffle=True,
                num_workers=8,
                pin_memory=True,
                drop_last=False)

            loss = loop_train(model, train_loss_function, dataloader, optimizer)

            if config.scheduler_type == 'rp':
                scheduler.step(loss)
            else:
                scheduler.step()
            for param_group in optimizer.param_groups:
                print('current learning rate:', param_group['lr'])

            del dataset, dataloader

            dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
            dataloader = DataLoader(
                dataset,
                batch_size=config.infer_batch_size,
                shuffle=False,
                num_workers=8,
                pin_memory=True,
                drop_last=False)

            valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)
            del dataset, dataloader

            logger.update({'fold': fold_index, 'epoch': epoch + 1, 'train_loss': loss, 'val_loss': valid_loss})
            print(f'epoch {epoch + 1}/{config.n_epoch} - train_loss: {loss:.5f} - ' +
                  f'valid_loss: {valid_loss:.5f} - elapsed: {time_format(time.time() - start_time)}', flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(), f'./{model_dir}/deepinsight-{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

        print(f'Done -> Fold {fold_index}/{config.kfolds} - best_valid_loss: {best_score:.5f} - ' +
              f'elapsed: {time_format(time.time() - start_time)}', flush=True)

        torch.cuda.empty_cache()
        gc.collect()

        if args.return_first_fold:
            logger.save(f'{model_dir}/log.csv')
            return

    test_preds = np.zeros((test_features.shape[0], len(target_columns)))
    start_time = time.time()
    print('Start inference', flush=True)

    oof_preds = np.zeros((len(train_features), len(target_columns)))
    eval_loss_function = bce_loss

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        print(f'Inference Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values
        X_test = test_features[all_features_columns].values

        if config.normalizer == 'log':
            scaler = load_pickle(f'{model_dir}/scaler-{fold_index}.pkl')
            X_valid = scaler.transform(X_valid)
            X_test = scaler.transform(X_test)

        transformer = load_pickle(f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)
        model.load_state_dict(torch.load(f'./{model_dir}/deepinsight-{fold_index}.pt'))

        dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)

        valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)
        print(f'Fold {fold_index}/{config.kfolds} - fold_valid_loss: {valid_loss:.5f}', flush=True)

        logger.update({'fold': fold_index, 'val_loss': valid_loss})
        oof_preds[val_index, :] = valid_preds

        dataset = TestDataset(X_test, None, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)
        preds = loop_preds(model, dataloader)
        test_preds += preds / config.kfolds

    oof_preds_df = train_targets.copy()
    oof_preds_df.loc[:, target_columns] = oof_preds.clip(0, 1)
    oof_preds_df.to_csv(f'{model_dir}/oof_preds.csv', index=False)

    oof_loss = mean_log_loss(train_targets.loc[:, target_columns].values, oof_preds)
    print(f'OOF Validation Loss: {oof_loss:.6f}', flush=True)
    print(f'Done inference. Elapsed {time_format(time.time() - start_time)}', flush=True)

    logger.update({'fold': 'oof', 'val_loss': oof_loss})
    logger.save(f'{model_dir}/log.csv')

    submission = pd.DataFrame(data=test_features['sig_id'].values, columns=['sig_id'])
    submission = submission.reindex(columns=['sig_id'] + target_columns)
    submission.loc[:, target_columns] = test_preds.clip(0, 1)
    submission.loc[test_features['cp_type'] == 0, submission.columns[1:]] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
def run(try_num, config):
    logger = Logger()
    args = get_args()
    print('config:', config.to_dict(), flush=True)
    print('args:', args, flush=True)

    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    model_dir = f'blending-01-nn-{try_num}'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    dae_features = pd.read_csv(config.dae_path)
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        train_features = train_features[:500]
        train_targets = train_targets[:500]
        dae_features = pd.concat(
            [dae_features.iloc[:500], dae_features.iloc[-3982:]]).reset_index(drop=True)
        config.update(
            dict(
                n_folds=3,
                seeds=[222],
                n_epochs=3,
                batch_size=128,
            ))

    target_columns = [col for col in train_targets.columns if col != 'sig_id']
    n_targets = len(target_columns)

    train_features, train_targets, test_features = preprocess(
        config, model_dir, train_features, train_targets, test_features, dae_features)
    features_columns = [
        col for col in train_features.columns if col not in [
            'sig_id', 'cp_type', 'cp_time', 'cp_dose',
            'cp_type_ctl_vehicle', 'cp_type_trt_cp'
        ]
    ]

    metric_loss_function = nn.BCELoss()
    if config.weighted_loss_strategy == 1:
        indices = get_minority_target_index(train_targets, threshold=config.weighted_loss_threshold)
        indices = [int(i not in indices) for i, c in enumerate(target_columns)]
        smooth_loss_function = SmoothBCELoss(
            smoothing=config.smoothing,
            weight=config.weighted_loss_weights,
            weight_targets=indices,
            n_labels=n_targets)
    else:
        smooth_loss_function = SmoothBCELoss(smoothing=config.smoothing)

    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds, random_state=42, shuffle=True)

    for seed_index, seed in enumerate(config.seeds):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Train seed {seed}', flush=True)
        set_seed(seed)

        for fold_index, (train_indices, val_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            print(f'Train fold {fold_index + 1}', flush=True)

            x_train = train_features.loc[train_indices, features_columns]
            y_train = train_targets.loc[train_indices, target_columns]
            x_val = train_features.loc[val_indices, features_columns]
            y_val = train_targets.loc[val_indices, target_columns]

            model = new_model(config.model_kind, len(features_columns)).to(DEVICE)
            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'

            optimizer = optim.Adam(model.parameters(), weight_decay=config.weight_decay, lr=config.learning_rate)
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)

            best_loss = np.inf
            for epoch in range(config.n_epochs):
                dataset = MoaDataset(x_train.values, y_train.values)
                dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True, drop_last=True)
                train_loss = loop_train(model, dataloader, optimizer,
                                        loss_functions=(smooth_loss_function, metric_loss_function))

                dataset = MoaDataset(x_val.values, y_val.values)
                dataloader = DataLoader(dataset, batch_size=config.val_batch_size, shuffle=False)
                valid_loss, _ = loop_valid(model, dataloader, metric_loss_function)
                print('Epoch {}/{} - loss: {:5.5f} - val_loss: {:5.5f}'.format(
                    epoch + 1, config.n_epochs, train_loss, valid_loss), flush=True)

                logger.update({
                    'epoch': epoch + 1,
                    'loss': train_loss,
                    'val_loss': valid_loss
                })
                scheduler.step(valid_loss)

                if valid_loss < best_loss:
                    best_loss = valid_loss
                    torch.save(model.state_dict(), checkpoint_path)

    oof_preds = np.zeros((len(train_features), len(config.seeds), n_targets))
    test_preds = np.zeros((len(test_features), n_targets))

    for seed_index in range(len(config.seeds)):
        seed = config.seeds[seed_index]
        print(f'Inference for seed {seed}', flush=True)

        _test_preds_in_seed = np.zeros((len(test_features), n_targets))

        for fold_index, (_, valid_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            x_val = train_features.loc[valid_indices, features_columns]
            y_val = train_targets.loc[valid_indices, target_columns]

            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'
            model = new_model(config.model_kind, len(features_columns)).to(DEVICE)
            model.load_state_dict(torch.load(checkpoint_path))

            dataset = MoaDataset(x_val.values, y_val.values)
            dataloader = DataLoader(dataset, batch_size=config.val_batch_size, shuffle=False)
            preds = loop_pred(model, dataloader)
            oof_preds[valid_indices, seed_index, :] = preds

            dataset = MoaDataset(test_features[features_columns].values, None)
            dataloader = DataLoader(dataset, batch_size=config.val_batch_size, shuffle=False)
            preds = loop_pred(model, dataloader)
            _test_preds_in_seed += preds / config.n_folds

        score = mean_log_loss(train_targets.loc[:, target_columns].values,
                              oof_preds[:, seed_index, :], n_targets=n_targets)
        test_preds += _test_preds_in_seed / len(config.seeds)

        print(f'Score for this seed {score:5.5f}', flush=True)
        logger.update({'val_loss': score})

    # Evaluate validation score
    oof_preds = np.mean(oof_preds, axis=1)
    score = mean_log_loss(train_targets.loc[:, target_columns].values, oof_preds, n_targets=n_targets)
    print(f'Overall score is {score:5.5f}', flush=True)

    # Save validation prediction
    oof_pred_df = train_targets.copy()
    oof_pred_df.iloc[:, 1:] = oof_preds
    oof_pred_df.to_csv(f'{model_dir}/oof_pred.csv', index=False)

    # Save log
    logger.update({'val_loss': score})
    logger.save(f'{model_dir}/log.csv')

    # Save test prediction
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    submission = create_submission(test_features, ['sig_id'] + target_columns)
    submission[target_columns] = test_preds
    submission.loc[test_features['cp_type'] == 'ctl_vehicle', target_columns] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
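The DAE, DeepInsight, and blending runs above all record metrics through a no-argument Logger() with update(dict) and save(path); that helper is distinct from utils.misc.Logger used in the other snippets, and its code is not included here. A minimal sketch of what such a row-accumulating logger might look like, as an assumption about its behavior rather than the project's implementation:

# Minimal metrics-logger sketch (assumed API: update(dict) appends a row, save(path) writes a CSV).
import pandas as pd

class Logger:
    def __init__(self):
        self.rows = []

    def update(self, row):
        # Store one dict of metrics; keys missing from a row simply stay empty in the CSV.
        self.rows.append(dict(row))

    def save(self, path):
        pd.DataFrame(self.rows).to_csv(path, index=False)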
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
# @Time     : 2019/10/8 22:34
# @Author   : peng.wang
# @Email    : [email protected]
# @FileName : util.py
# @ProjectName : sh-demand-forecast-alg
"""
import pandas as pd
import numpy as np
import requests
import json
import os
from utils.misc import Logger

log = Logger(log_path='log').logger


class DataLoader(object):
    """
    Data loader. Combines a dataset and a sampler, and provides an iterable
    over the given dataset.
    """

    def __init__(self, data, train_len, pred_len, feature_names, target_name, append_train=False):
        self.data = data