def set_gpc_and_logger(args):
    gpc = OutPathGetter(root=os.path.join(args.exp_dir, args.tag),
                        suffix=args.suffix)

    log_dir = '' if args.log_off else gpc.get_dir('log')
    logger = Logger(scrn=True, log_dir=log_dir, phase=args.cmd)

    register('GPC', gpc)
    register('LOGGER', logger)

    return gpc, logger
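# Usage sketch (an assumption, not part of the original source): wire up the
# global path getter and logger from a parsed argument namespace whose field
# names mirror the attributes accessed above (exp_dir, tag, suffix, log_off, cmd).
from argparse import Namespace

args = Namespace(exp_dir='./exp', tag='baseline', suffix='', log_off=False, cmd='train')
gpc, logger = set_gpc_and_logger(args)
logger.show("log dir: {}".format(gpc.get_dir('log')))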
# @FileName: main_wl.py
# @ProjectName :Facility_Location_WL
"""
from utils.util import DataHandler
from core.complex_model import FacilityLocation
from core.model import FacilityLocation
from core.cutoff_model import FacilityLocation
# from core.model import FacilityLocation
# from core.model_2 import FacilityLocation
from core.model_3 import FacilityLocation
# NOTE: only the last import of FacilityLocation (core.model_3) takes effect;
# the earlier ones are shadowed.
from utils.misc import Logger
import pandas as pd
import os

# define the log file and log level
log = Logger(log_path='./log').logger


# define the configuration parameters
class Config(object):
    """ define all parameters """
    # TODO: tune the capacities of the CDC and RDC
    rdc_capacity = 5000000000000
    num_rdc = 1
    num_cdc = 4
    P_c = 0.95
    P_b = 0.8
    rr_cdc = 0.00
    weight_avg = 100
class Predictor:
    modes = ['dataloader', 'folder', 'list', 'file', 'data']

    def __init__(self,
                 model=None,
                 mode='folder',
                 save_dir=None,
                 scrn=True,
                 log_dir=None,
                 cuda_off=False):
        self.save_dir = save_dir
        self.output = None
        if not cuda_off and torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        assert model is not None, "The model must be assigned"
        self.model = self._model_init(model)
        if mode not in Predictor.modes:
            raise NotImplementedError
        self.logger = Logger(scrn=scrn, log_dir=log_dir, phase='predict')
        if mode == 'dataloader':
            self._predict = partial(self._predict_dataloader,
                                    dataloader=None,
                                    save_dir=save_dir)
        elif mode == 'folder':
            # self.suffix = ['.jpg', '.png', '.bmp', '.gif', '.npy']  # supported image formats
            self._predict = partial(self._predict_folder, save_dir=save_dir)
        elif mode == 'list':
            self._predict = partial(self._predict_list, save_dir=save_dir)
        elif mode == 'file':
            self._predict = partial(self._predict_file, save_dir=save_dir)
        elif mode == 'data':
            self._predict = partial(self._predict_data, save_dir=save_dir)
        else:
            raise NotImplementedError

    def __call__(self, *args, **kwargs):
        return self._predict(*args, **kwargs)

    def _model_init(self, model):
        model.to(self.device)
        model.eval()
        return model

    def _load_data(self, path):
        return io.imread(path)

    def _to_tensor(self, arr):
        return to_tensor(arr)

    def _to_array(self, tensor):
        return to_array(tensor)

    def _normalize(self, tensor):
        return normalize(tensor)

    def _np2tensor(self, arr):
        nor_tensor = self._normalize(self._to_tensor(arr))
        assert isinstance(nor_tensor, torch.Tensor)
        return nor_tensor

    def _save_data_NTIRE2020(self, data, path):
        s_dir = os.path.dirname(path)
        if not os.path.exists(s_dir):
            os.mkdir(s_dir)
        path = path.replace('_clean.png', '.mat').replace('_RealWorld.png', '.mat')
        if isinstance(data, torch.Tensor):
            data = self._to_array(data).squeeze()
        content = {}
        content['cube'] = data
        content['bands'] = np.array([[
            400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520,
            530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650,
            660, 670, 680, 690, 700
        ]])
        # content['norm_factor'] =
        hdf5.write(data=content,
                   filename=path,
                   store_python_metadata=True,
                   matlab_compatible=True)

    def _save_data(self, data, path):
        s_dir = os.path.dirname(path)
        if not os.path.exists(s_dir):
            os.mkdir(s_dir)
        torchvision.utils.save_image(data, path)

    def predict_base(self, model, data, path=None):
        start = time.time()
        with torch.no_grad():
            output = model(data)
        torch.cuda.synchronize()
        su_time = time.time() - start
        if path:
            self._save_data_NTIRE2020(output, path)
        self.output = output
        return output, su_time

    def _predict_dataloader(self, dataloader, save_dir=None):
        assert dataloader is not None, \
            "In 'dataloader' mode the input must be a valid dataloader!"
        consume_time = AverageMeter()
        pb = tqdm(dataloader)
        for idx, (name, data) in enumerate(pb):
            assert isinstance(data, torch.Tensor) and data.dim() == 4, \
                "input data must be a 4-dimensional tensor"
            data = data.to(self.device)  # 4-d tensor
            save_path = os.path.join(save_dir, name) if save_dir else None
            _, su_time = self.predict_base(self.model, data, path=save_path)
            consume_time.update(su_time, n=1)
            # logger
            description = (
                "[{}/{}] speed: {time.val:.4f}s({time.avg:.4f}s)".format(
                    idx + 1, len(dataloader.dataset), time=consume_time))
            pb.set_description(description)
            self.logger.dump(description)

    def _predict_folder(self, folder, save_dir=None):
        assert folder is not None and os.path.isdir(folder), \
            "In 'folder' mode the input must be a valid path of a folder!"
        consume_time = AverageMeter()
        file_list = glob.glob(os.path.join(folder, '*'))
        assert not len(file_list) == 0, "The input folder is empty"
        pb = tqdm(file_list)  # progress bar
        for idx, file in enumerate(pb):
            img = self._load_data(file)
            name = os.path.basename(file)
            img = self._np2tensor(img).unsqueeze(0).to(self.device)
            save_path = os.path.join(save_dir, name) if save_dir else None
            _, su_time = self.predict_base(model=self.model,
                                           data=img,
                                           path=save_path)
            consume_time.update(su_time)
            # logger
            description = (
                "[{}/{}] speed: {time.val:.4f}s({time.avg:.4f}s)".format(
                    idx + 1, len(file_list), time=consume_time))
            pb.set_description(description)
            self.logger.dump(description)

    def _predict_list(self, file_list, save_dir=None):
        assert isinstance(file_list, list), \
            "In 'list' mode the input must be a valid file_path list!"
        consume_time = AverageMeter()
        assert not len(file_list) == 0, "The input file list is empty!"
        pb = tqdm(file_list)  # progress bar
        for idx, path in enumerate(pb):
            data = self._load_data(path)
            name = os.path.basename(path)
            data = self._np2tensor(data).unsqueeze(0).to(self.device)
            path = os.path.join(save_dir, name) if save_dir else None
            _, su_time = self.predict_base(model=self.model,
                                           data=data,
                                           path=path)
            consume_time.update(su_time, n=1)
            # logger
            description = (
                "[{}/{}] speed: {time.val:.4f}s({time.avg:.4f}s)".format(
                    idx + 1, len(file_list), time=consume_time))
            pb.set_description(description)
            self.logger.dump(description)

    def _predict_file(self, file_path, save_dir=None):
        assert isinstance(file_path, str) and os.path.isfile(file_path), \
            "In 'file' mode the input must be a valid path of a file!"
        consume_time = AverageMeter()
        data = self._load_data(file_path)
        name = os.path.basename(file_path)
        data = self._np2tensor(data).unsqueeze(0).to(self.device)
        path = os.path.join(save_dir, name) if save_dir else None
        _, su_time = self.predict_base(model=self.model, data=data, path=path)
        consume_time.update(su_time)
        # logger
        description = ("file: {} speed: {time.val:.4f}s".format(
            name, time=consume_time))
        self.logger.show(description)

    def _predict_data(self, data):
        """
        :return: tensor
        """
        assert isinstance(data, torch.Tensor) and data.dim() == 4, \
            "In 'data' mode the input must be a 4-d tensor"
        consume_time = AverageMeter()
        output, su_time = self.predict_base(model=self.model, data=data)
        consume_time.update(su_time)
        # logger
        description = ("speed: {time.val:.4f}s".format(time=consume_time))
        self.logger.dump(description)
        return output
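# Usage sketch (assumption, not from the source): running the Predictor above in
# 'folder' mode over a directory of images. `net` stands in for any trained
# torch.nn.Module and the paths are illustrative only.
import torch.nn as nn

net = nn.Identity()  # placeholder model for illustration
predictor = Predictor(model=net, mode='folder', save_dir='./outputs', scrn=True)
predictor('./inputs')  # dispatches to _predict_folder for every file in ./inputs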
RESTORE_FROM = "/data/AutoPheno/green/200527/PatchNet/snapshots-fb/LEAF_UNET_B0064_S010700.pth"
SAVE_PRED_EVERY = 1000

SNAPSHOT_DIR = root_dir + 'PatchNet/snapshots' + postfix
IMGSHOT_DIR = root_dir + 'PatchNet/imgshots' + postfix

WEIGHT_DECAY = 0.0005
NUM_EXAMPLES_PER_EPOCH = 13862
NUM_STEPS_PER_EPOCH = math.ceil(NUM_EXAMPLES_PER_EPOCH / float(BATCH_SIZE))
MAX_ITER = max(NUM_EXAMPLES_PER_EPOCH * MAX_EPOCH + 1,
               NUM_STEPS_PER_EPOCH * BATCH_SIZE * MAX_EPOCH + 1)

if not os.path.exists(SNAPSHOT_DIR):
    os.makedirs(SNAPSHOT_DIR)
if not os.path.exists(IMGSHOT_DIR):
    os.makedirs(IMGSHOT_DIR)

LOG_PATH = SNAPSHOT_DIR + "/B" + format(BATCH_SIZE, "04d") + "E" + format(MAX_EPOCH, "04d") + ".log"
sys.stdout = Logger(LOG_PATH, sys.stdout)
print(DATA_LIST_PATH)
print("num of epoch:", MAX_EPOCH)
print("RESTORE_FROM:", RESTORE_FROM)
print(NUM_EXAMPLES_PER_EPOCH)


def get_arguments():
    """Parse all the arguments provided from the CLI.

    Returns:
      A list of parsed arguments.
    """
    parser = argparse.ArgumentParser(description="UNet Network")
    parser.add_argument("--set-start", default=False)
    parser.add_argument("--start-step", default=0, type=int)
def run(try_num, config):
    output_dir = f'./dae-out-{try_num}'
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    args = get_args()

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        train_features = train_features.loc[:500]
        config.update(dict(n_epochs=3, n_folds=2))

    all_features = pd.concat([train_features, test_features]).reset_index(drop=True)
    g_features_columns = [
        col for col in all_features.columns if col.startswith('g-')
    ]
    c_features_columns = [
        col for col in all_features.columns if col.startswith('c-')
    ]
    feature_columns = g_features_columns + c_features_columns
    n_features = len(feature_columns)

    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds,
                                      random_state=42,
                                      shuffle=True)
    logger = Logger()

    for fold_index, (train_idx, valid_idx) in enumerate(
            kfold.split(all_features.values, all_features.values)):
        print('Fold: ', fold_index + 1, flush=True)
        x_train = all_features.loc[train_idx]
        x_valid = all_features.loc[valid_idx]

        model = new_autoencoder(config.model_kind,
                                n_features=n_features).to(DEVICE)
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.learning_rate,
                                     weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               mode='min',
                                                               factor=0.1,
                                                               patience=3,
                                                               eps=1e-4,
                                                               verbose=True)
        early_stopping = EarlyStopping(patience=10)
        best_score = np.inf

        for epoch in range(config.n_epochs):
            dataset = DaeDataset(x_train,
                                 feature_columns,
                                 noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset,
                                    batch_size=config.batch_size,
                                    shuffle=True)
            train_loss = loop_train(model, criterion, dataloader, optimizer)

            dataset = DaeDataset(x_valid,
                                 feature_columns,
                                 noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset,
                                    batch_size=config.valid_batch_size,
                                    shuffle=False)
            valid_loss, _ = loop_valid(model, criterion, dataloader)

            scheduler.step(valid_loss)
            logger.update({
                'fold': fold_index,
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'val_loss': valid_loss
            })
            print(
                f'epoch {epoch + 1}/{config.n_epochs} - train_loss: {train_loss:.5f} - '
                + f'valid_loss: {valid_loss:.5f}',
                flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(),
                           f'./{output_dir}/dae_fold_weight_{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

    logger.save(f'./{output_dir}/dae_log.csv')

    oof_preds = []
    for fold_index in range(config.n_folds):
        model = new_autoencoder(config.model_kind,
                                n_features=n_features).to(DEVICE)
        model.load_state_dict(
            torch.load(f'./{output_dir}/dae_fold_weight_{fold_index}.pt'))
        model.eval()

        dataset = DaeDataset(all_features,
                             feature_columns,
                             noise_ratio=config.noise_ratio)
        dataloader = DataLoader(dataset,
                                batch_size=config.valid_batch_size,
                                shuffle=False)
        loss, preds = loop_valid(model, nn.MSELoss(), dataloader)
        logger.update({'fold': fold_index, 'val_loss': loss})
        print('Evaluation fold: {} - valid_loss: {:.5f}'.format(
            fold_index, loss),
              flush=True)
        oof_preds.append(preds)

    print('Overall evaluation score: {:.5f}'.format(
        mean_squared_error(all_features.loc[:, feature_columns].values,
                           np.mean(oof_preds, axis=0))),
          flush=True)

    # for i, preds in enumerate(oof_preds):
    #     create_pred_feature_df(preds, all_features).to_csv(f'./{output_dir}/dae_features_{i}.csv', index=False)
    create_pred_feature_df(np.mean(oof_preds, axis=0), all_features).to_csv(
        f'./{output_dir}/dae_features_mean.csv', index=False)
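# The EarlyStopping helper used above is not defined in this excerpt; a minimal
# patience-based sketch consistent with the should_stop(valid_loss) calls could
# look like this (an assumption, not the project's exact implementation).
class EarlyStopping:
    def __init__(self, patience=10):
        self.patience = patience
        self.best = float('inf')
        self.counter = 0

    def should_stop(self, loss):
        # Reset the counter whenever the monitored loss improves.
        if loss < self.best:
            self.best = loss
            self.counter = 0
            return False
        self.counter += 1
        return self.counter >= self.patience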
References
----------
[1] Narula, S. & Weistroffer, H. A flexible method for nonlinear multicriteria
    decision-making problems. Systems, Man and Cybernetics, IEEE Transactions
    on, 1989, 19, 883-887.
'''
import sys, os

example_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(example_path, ".."))

from utils.misc import Logger
sys.stdout = Logger(os.path.splitext(os.path.basename(__file__))[0])

from utils import tui
from method.NAUTILUS import NAUTILUSv1, ENAUTILUS
from optimization.OptimizationMethod import PointSearch
from problem.Problem import PreGeneratedProblem

if __name__ == '__main__':
    # SciPy breaks box constraints
    method = ENAUTILUS(
        PreGeneratedProblem(
            filename=os.path.join(example_path, "AuxiliaryServices.csv")),
        PointSearch)
    zh = tui.iter_enautilus(method)
    ci = method.current_iter
# -*- coding: UTF-8 -*-
"""
# @Time : 2020/11/17 18:57
# @Author : peng.wang
# @Email : [email protected]
# @FileName: main.py
# @ProjectName :Prediction_Optimization
"""
import os
import json

from utils.util import DataHandler
from utils.misc import Logger
from core.model import Scheduler

# define the log file and log level
log = Logger(log_path='./log').logger


# define the configuration parameters
class Config(object):
    Dates = []


if not os.path.exists('results'):
    os.mkdir('results')

# load the data
filename = "data_input.xlsx"
data_ins = DataHandler(file=filename, config=Config)
mode = 'deterministic'
mode = 'expected'  # the second assignment overrides the first; 'expected' is the mode actually used
class Trainer:
    def __init__(self, settings):
        super(Trainer, self).__init__()
        self.settings = settings
        self.phase = settings.cmd
        self.batch_size = settings.batch_size
        self.data_dir = settings.data_dir
        self.list_dir = settings.list_dir
        self.checkpoint = settings.resume
        self.load_checkpoint = (len(self.checkpoint) > 0)
        self.num_epochs = settings.num_epochs
        self.lr = float(settings.lr)
        self.save = settings.save_on or settings.out_dir
        self.from_pause = self.settings.continu
        self.path_ctrl = settings.global_path
        self.path = self.path_ctrl.get_path

        log_dir = '' if settings.log_off else self.path_ctrl.get_dir('log')
        self.logger = Logger(scrn=True, log_dir=log_dir, phase=self.phase)

        for k, v in sorted(settings.__dict__.items()):
            self.logger.show("{}: {}".format(k, v))

        self.start_epoch = 0
        self._init_max_acc = 0.0
        self.model = None
        self.criterion = None

    def train_epoch(self):
        raise NotImplementedError

    def validate_epoch(self, epoch, store):
        raise NotImplementedError

    def train(self):
        cudnn.benchmark = True

        if self.load_checkpoint:
            self._resume_from_checkpoint()

        max_acc = self._init_max_acc
        best_epoch = self.get_ckp_epoch()

        self.model.cuda()
        self.criterion.cuda()

        end_epoch = self.num_epochs if self.from_pause else self.start_epoch + self.num_epochs
        for epoch in range(self.start_epoch, end_epoch):
            lr = self._adjust_learning_rate(epoch)

            self.logger.show_nl("Epoch: [{0}]\tlr {1:.06f}".format(epoch, lr))

            # Train for one epoch
            self.train_epoch()

            # Evaluate the model on the validation set
            self.logger.show_nl("Validate")
            acc = self.validate_epoch(epoch=epoch, store=self.save)

            is_best = acc > max_acc
            if is_best:
                max_acc = acc
                best_epoch = epoch
            self.logger.show_nl(
                "Current: {:.6f} ({:03d})\tBest: {:.6f} ({:03d})\t".format(
                    acc, epoch, max_acc, best_epoch))

            # The checkpoint saves the next epoch
            self._save_checkpoint(self.model.state_dict(), max_acc, epoch + 1,
                                  is_best)

    def validate(self):
        if self.checkpoint:
            if self._resume_from_checkpoint():
                self.model.cuda()
                self.criterion.cuda()
                self.validate_epoch(self.get_ckp_epoch(), self.save)
        else:
            self.logger.warning("no checkpoint assigned!")

    def _load_pretrained(self):
        raise NotImplementedError

    def _adjust_learning_rate(self, epoch):
        # Note that this does not take effect for separate learning rates
        start_epoch = 0 if self.from_pause else self.start_epoch
        if self.settings.lr_mode == 'step':
            lr = self.lr * (0.5**((epoch - start_epoch) // self.settings.step))
        elif self.settings.lr_mode == 'poly':
            lr = self.lr * (1 - (epoch - start_epoch) /
                            (self.num_epochs - start_epoch))**1.1
        elif self.settings.lr_mode == 'const':
            lr = self.lr
        else:
            raise ValueError('unknown lr mode {}'.format(
                self.settings.lr_mode))

        if lr == self.lr:
            return self.lr
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        return lr

    def _resume_from_checkpoint(self):
        if not os.path.isfile(self.checkpoint):
            self.logger.error("=> no checkpoint found at '{}'".format(
                self.checkpoint))
            return False

        self.logger.show("=> loading checkpoint '{}'".format(self.checkpoint))
        checkpoint = torch.load(self.checkpoint)

        state_dict = self.model.state_dict()
        ckp_dict = checkpoint.get('state_dict', checkpoint)
        update_dict = {
            k: v
            for k, v in ckp_dict.items()
            if k in state_dict and state_dict[k].shape == v.shape
        }

        num_to_update = len(update_dict)
        if (num_to_update < len(state_dict)) or (len(state_dict) < len(ckp_dict)):
            if self.phase == 'val':
                self.logger.error("=> mismatched checkpoint for validation")
                return False
            self.logger.warning(
                "warning: trying to load a mismatched checkpoint")
            if num_to_update == 0:
                self.logger.error("=> no parameter is to be loaded")
                return False
            else:
                self.logger.warning(
                    "=> {} params are to be loaded".format(num_to_update))
        elif (not self.settings.anew) or (self.phase != 'train'):
            # Note that in the non-anew mode, it is not guaranteed that the contained
            # field max_acc is the one corresponding to the loaded checkpoint.
            self.start_epoch = checkpoint.get('epoch', self.start_epoch)
            self._init_max_acc = checkpoint.get('max_acc', self._init_max_acc)

        state_dict.update(update_dict)
        self.model.load_state_dict(state_dict)

        self.logger.show(
            "=> loaded checkpoint '{}' (epoch {}, max_acc {:.4f})".format(
                self.checkpoint, self.get_ckp_epoch(), self._init_max_acc))
        return True

    def _save_checkpoint(self, state_dict, max_acc, epoch, is_best):
        state = {'epoch': epoch, 'state_dict': state_dict, 'max_acc': max_acc}
        # Save history
        history_path = self.path('weight',
                                 CKP_COUNTED.format(e=epoch, s=self.scale),
                                 underline=True)
        if (epoch - self.start_epoch) % self.settings.trace_freq == 0:
            torch.save(state, history_path)
        # Save latest
        latest_path = self.path('weight',
                                CKP_LATEST.format(s=self.scale),
                                underline=True)
        torch.save(state, latest_path)
        if is_best:
            shutil.copyfile(
                latest_path,
                self.path('weight',
                          CKP_BEST.format(s=self.scale),
                          underline=True))

    def get_ckp_epoch(self):
        # Get the current epoch of the checkpoint
        # For a mismatched checkpoint or no checkpoint, return 0
        return max(self.start_epoch - 1, 0)
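# Since Trainer leaves train_epoch and validate_epoch abstract, a concrete
# subclass supplies them. A minimal sketch under assumed attributes
# (self.train_loader, self.val_loader, self.optimizer); not part of the source.
class ClassificationTrainer(Trainer):
    def train_epoch(self):
        self.model.train()
        for inputs, targets in self.train_loader:
            inputs, targets = inputs.cuda(), targets.cuda()
            self.optimizer.zero_grad()
            loss = self.criterion(self.model(inputs), targets)
            loss.backward()
            self.optimizer.step()

    def validate_epoch(self, epoch, store):
        self.model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for inputs, targets in self.val_loader:
                inputs, targets = inputs.cuda(), targets.cuda()
                preds = self.model(inputs).argmax(dim=1)
                correct += (preds == targets).sum().item()
                total += targets.numel()
        # The returned accuracy drives the is_best bookkeeping in train().
        return correct / max(total, 1)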
class Predictor:
    def __init__(self,
                 data_dir,
                 ckp_path,
                 save_lr=False,
                 list_dir='',
                 out_dir='./',
                 log_dir=''):
        super(Predictor, self).__init__()
        self.data_dir = data_dir
        self.list_dir = list_dir
        self.out_dir = out_dir
        self.checkpoint = ckp_path
        self.save_lr = save_lr

        self.logger = Logger(scrn=True, log_dir=log_dir, phase='test')

        self.model = None

    def test_epoch(self):
        raise NotImplementedError

    def test(self):
        if self.checkpoint:
            if self._resume_from_checkpoint():
                self.model.cuda()
                self.model.eval()
                self.test_epoch()
        else:
            self.logger.warning("no checkpoint assigned!")

    def _resume_from_checkpoint(self):
        if not os.path.isfile(self.checkpoint):
            self.logger.error("=> no checkpoint found at '{}'".format(
                self.checkpoint))
            return False

        self.logger.show("=> loading checkpoint '{}'".format(self.checkpoint))
        checkpoint = torch.load(self.checkpoint)

        state_dict = self.model.state_dict()
        ckp_dict = checkpoint.get('state_dict', checkpoint)
        try:
            state_dict.update(ckp_dict)
            self.model.load_state_dict(ckp_dict)
        except KeyError as e:
            self.logger.error("=> mismatched checkpoint for test")
            self.logger.error(e)
            return False
        else:
            self.epoch = checkpoint.get('epoch', 0)
            self.logger.show(
                "=> loaded checkpoint '{}'".format(self.checkpoint))
        return True
# @Author : peng.wang
# @Email : [email protected]
# @FileName: util.py
# @ProjectName :Facility_Location_FangTai
"""
import pandas as pd
import os
import xlwings as xw
from utils.misc import Logger
from pyecharts import options as opts
from pyecharts.charts import Geo
from pyecharts.globals import ChartType, SymbolType

# define the log file
RAW_DATA_PATH = os.path.dirname(os.path.dirname(__file__))
log = Logger(log_path=os.path.join(RAW_DATA_PATH, 'log')).logger


class DataHandler(object):
    """
    """

    def __init__(self, file, config):
        """
        :param file: name of the input data file
        :param config: configuration object
        """
        # path of the data file to read
        self._PATH = os.path.join(os.path.join(RAW_DATA_PATH, 'data'), file)
        self._config = config
def run(try_num, config):
    args = get_args()

    print('args', args, flush=True)
    print('config:', config.to_dict(), flush=True)

    set_seed(config.rand_seed)

    pretrained_model = f"tf_efficientnet_b3_ns"
    model_dir = f'deepinsight-{try_num}'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv(f"../input/lish-moa/train_features.csv")
    train_targets = pd.read_csv(f"../input/lish-moa/train_targets_scored.csv")
    test_features = pd.read_csv(f"../input/lish-moa/test_features.csv")
    if config.dae_path:
        dae_features = pd.read_csv(config.dae_path)

    if args.debug:
        train_features = train_features.iloc[:500]
        train_targets = train_targets.iloc[:500]
        if config.dae_path:
            dae_features = pd.concat([
                dae_features.iloc[:500], dae_features.iloc[-3982:]
            ]).reset_index(drop=True)
        config.update(dict(
            kfolds=3,
            n_epoch=3
        ))

    train_features = train_features.sort_values(
        by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)
    train_targets = train_targets.sort_values(
        by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)

    cat_features_columns = ["cp_dose", 'cp_time']
    num_feature_columns = [
        c for c in train_features.columns
        if c != "sig_id" and c not in cat_features_columns + ['cp_type']
    ]
    all_features_columns = cat_features_columns + num_feature_columns
    target_columns = [c for c in train_targets.columns if c != "sig_id"]
    g_feature_columns = [c for c in num_feature_columns if c.startswith("g-")]
    c_feature_columns = [c for c in num_feature_columns if c.startswith("c-")]

    if config.dae_path:
        if config.dae_strategy == 'replace':
            train_features, test_features = assign_dae_features(
                train_features, test_features, dae_features,
                len(num_feature_columns))
        else:
            train_features, test_features, dae_feature_columns = merge_dae_features(
                train_features, test_features, dae_features,
                len(g_feature_columns), len(c_feature_columns))
            all_features_columns += dae_feature_columns

    train_targets = train_targets.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)
    train_features = train_features.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)

    if config.normalizer == 'rank':
        train_features, test_features = normalize(train_features, test_features, num_feature_columns)

    for df in [train_features, test_features]:
        df['cp_type'] = df['cp_type'].map({'ctl_vehicle': 0, 'trt_cp': 1})
        df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
        df['cp_time'] = df['cp_time'].map({24: 0, 48: 0.5, 72: 1})

    if config.variance_target_type == 1:
        pickle_path = f'{model_dir}/variance_reduction.pkl'
        variance_target_features = num_feature_columns
        if config.dae_path and config.dae_strategy != 'replace':
            variance_target_features += dae_feature_columns

        if not os.path.exists(pickle_path):
            vt = variance_reduction_fit(train_features, variance_target_features, config.variance_threshold)
            save_pickle(vt, pickle_path)
        vt = load_pickle(pickle_path)
        train_features = variance_reduction_transform(vt, train_features, variance_target_features)
        test_features = variance_reduction_transform(vt, test_features, variance_target_features)
        print('(variance_reduction) Number of features after applying:',
              len(train_features.columns), flush=True)
        all_features_columns = list(train_features.columns[1:])

    skf = MultilabelStratifiedKFold(n_splits=config.kfolds, shuffle=True, random_state=config.rand_seed)
    y_labels = np.sum(train_targets.drop("sig_id", axis=1), axis=0).index.tolist()
    logger = Logger()

    for fold_index, (train_index, val_index) in enumerate(
            skf.split(train_features, train_targets[y_labels])):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_train = train_features.loc[train_index, all_features_columns].copy().values
        y_train = train_targets.iloc[train_index, 1:].copy().values
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values

        if config.normalizer == 'log':
            scaler = LogScaler()
            if config.norm_apply_all:
                scaler.fit(X_train)
                X_train = scaler.transform(X_train)
                X_valid = scaler.transform(X_valid)
            else:
                target_features = [i for i, c in enumerate(all_features_columns) if c in num_feature_columns]
                non_target_features = [i for i, c in enumerate(all_features_columns) if c not in num_feature_columns]

                scaler.fit(X_train[:, target_features])
                X_train_tr = scaler.transform(X_train[:, target_features])
                X_valid_tr = scaler.transform(X_valid[:, target_features])

                X_train = np.concatenate([X_train[:, non_target_features], X_train_tr], axis=1)
                X_valid = np.concatenate([X_valid[:, non_target_features], X_valid_tr], axis=1)
            save_pickle(scaler, f'{model_dir}/scaler-{fold_index}.pkl')

        transformer = DeepInsightTransformer(
            feature_extractor=config.extractor,
            pixels=config.resolution,
            perplexity=config.perplexity,
            random_state=config.rand_seed,
            n_jobs=-1
        ).fit(X_train)
        save_pickle(transformer, f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)

        if config.smoothing is not None:
            if config.weighted_loss_weights is not None:
                indices = get_minority_target_index(train_targets, threshold=config.weighted_loss_threshold)
                indices = [int(i not in indices) for i, c in enumerate(target_columns)]
                train_loss_function = SmoothBCEwLogits(
                    smoothing=config.smoothing,
                    weight=config.weighted_loss_weights,
                    weight_targets=indices,
                    n_labels=len(target_columns))
            else:
                train_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)
        else:
            train_loss_function = bce_loss
        eval_loss_function = bce_loss

        optimizer = optim.Adam(model.parameters(), weight_decay=config.weight_decay, lr=config.learning_rate)

        if config.scheduler_type == 'ca':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.t_max, eta_min=0, last_epoch=-1)
        elif config.scheduler_type == 'ms':
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=config.ms_scheduler_milestones, gamma=0.1)
        else:
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.1, patience=config.rp_patience, eps=1e-4, verbose=True)

        early_stopping = EarlyStopping(patience=7)
        best_score = np.inf
        start_time = time.time()

        for epoch in range(config.n_epoch):
            if config.swap_enable:
                dataset = MoAImageSwapDataset(
                    X_train, y_train, transformer,
                    image_size=config.image_size,
                    swap_prob=config.swap_prob,
                    swap_portion=config.swap_portion)
            else:
                dataset = MoAImageDataset(X_train, y_train, transformer, image_size=config.image_size)

            dataloader = DataLoader(
                dataset,
                batch_size=config.batch_size,
                shuffle=True,
                num_workers=8,
                pin_memory=True,
                drop_last=False)

            loss = loop_train(model, train_loss_function, dataloader, optimizer)

            if config.scheduler_type == 'rp':
                scheduler.step(loss)
            else:
                scheduler.step()
            for param_group in optimizer.param_groups:
                print('current learning rate:', param_group['lr'])

            del dataset, dataloader

            dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
            dataloader = DataLoader(
                dataset,
                batch_size=config.infer_batch_size,
                shuffle=False,
                num_workers=8,
                pin_memory=True,
                drop_last=False)

            valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)

            del dataset, dataloader

            logger.update({'fold': fold_index, 'epoch': epoch + 1, 'train_loss': loss, 'val_loss': valid_loss})

            print(f'epoch {epoch + 1}/{config.n_epoch} - train_loss: {loss:.5f} - ' +
                  f'valid_loss: {valid_loss:.5f} - elapsed: {time_format(time.time() - start_time)}',
                  flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(), f'./{model_dir}/deepinsight-{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

        print(f'Done -> Fold {fold_index}/{config.kfolds} - best_valid_loss: {best_score:.5f} - ' +
              f'elapsed: {time_format(time.time() - start_time)}', flush=True)

        torch.cuda.empty_cache()
        gc.collect()

        if args.return_first_fold:
            logger.save(f'{model_dir}/log.csv')
            return

    test_preds = np.zeros((test_features.shape[0], len(target_columns)))

    start_time = time.time()
    print('Start inference', flush=True)
    oof_preds = np.zeros((len(train_features), len(target_columns)))
    eval_loss_function = bce_loss

    for fold_index, (train_index, val_index) in enumerate(
            skf.split(train_features, train_targets[y_labels])):
        print(f'Inference Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values
        X_test = test_features[all_features_columns].values

        if config.normalizer == 'log':
            scaler = load_pickle(f'{model_dir}/scaler-{fold_index}.pkl')
            X_valid = scaler.transform(X_valid)
            X_test = scaler.transform(X_test)

        transformer = load_pickle(f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)
        model.load_state_dict(torch.load(f'./{model_dir}/deepinsight-{fold_index}.pt'))

        dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)

        valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)
        print(f'Fold {fold_index}/{config.kfolds} - fold_valid_loss: {valid_loss:.5f}', flush=True)

        logger.update({'fold': fold_index, 'val_loss': valid_loss})
        oof_preds[val_index, :] = valid_preds

        dataset = TestDataset(X_test, None, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)

        preds = loop_preds(model, dataloader)
        test_preds += preds / config.kfolds

    oof_preds_df = train_targets.copy()
    oof_preds_df.loc[:, target_columns] = oof_preds.clip(0, 1)
    oof_preds_df.to_csv(f'{model_dir}/oof_preds.csv', index=False)

    oof_loss = mean_log_loss(train_targets.loc[:, target_columns].values, oof_preds)
    print(f'OOF Validation Loss: {oof_loss:.6f}', flush=True)
    print(f'Done inference. Elapsed {time_format(time.time() - start_time)}', flush=True)

    logger.update({'fold': 'oof', 'val_loss': oof_loss})
    logger.save(f'{model_dir}/log.csv')

    submission = pd.DataFrame(data=test_features['sig_id'].values, columns=['sig_id'])
    submission = submission.reindex(columns=['sig_id'] + target_columns)
    submission.loc[:, target_columns] = test_preds.clip(0, 1)
    submission.loc[test_features['cp_type'] == 0, submission.columns[1:]] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
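# mean_log_loss is referenced here (and in the blending script below) but not
# defined in this excerpt; a plausible sketch that averages the per-target
# binary log loss is given below (an assumption, not the exact implementation).
import numpy as np
from sklearn.metrics import log_loss


def mean_log_loss(y_true, y_pred, n_targets=None):
    if n_targets is None:
        n_targets = y_true.shape[1]
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    losses = [log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
              for i in range(n_targets)]
    return float(np.mean(losses))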
def run(try_num, config):
    logger = Logger()
    args = get_args()

    print('config:', config.to_dict(), flush=True)
    print('args:', args, flush=True)

    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    model_dir = f'blending-01-nn-{try_num}'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    dae_features = pd.read_csv(config.dae_path)
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        train_features = train_features[:500]
        train_targets = train_targets[:500]
        dae_features = pd.concat(
            [dae_features.iloc[:500], dae_features.iloc[-3982:]]).reset_index(drop=True)
        config.update(
            dict(
                n_folds=3,
                seeds=[222],
                n_epochs=3,
                batch_size=128,
            ))

    target_columns = [col for col in train_targets.columns if col != 'sig_id']
    n_targets = len(target_columns)

    train_features, train_targets, test_features = preprocess(
        config, model_dir, train_features, train_targets, test_features, dae_features)

    features_columns = [
        col for col in train_features.columns if col not in [
            'sig_id', 'cp_type', 'cp_time', 'cp_dose', 'cp_type_ctl_vehicle',
            'cp_type_trt_cp'
        ]
    ]

    metric_loss_function = nn.BCELoss()
    if config.weighted_loss_strategy == 1:
        indices = get_minority_target_index(
            train_targets, threshold=config.weighted_loss_threshold)
        indices = [int(i not in indices) for i, c in enumerate(target_columns)]
        smooth_loss_function = SmoothBCELoss(
            smoothing=config.smoothing,
            weight=config.weighted_loss_weights,
            weight_targets=indices,
            n_labels=n_targets)
    else:
        smooth_loss_function = SmoothBCELoss(smoothing=config.smoothing)

    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds,
                                      random_state=42,
                                      shuffle=True)

    for seed_index, seed in enumerate(config.seeds):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Train seed {seed}', flush=True)
        set_seed(seed)

        for fold_index, (train_indices, val_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            print(f'Train fold {fold_index + 1}', flush=True)

            x_train = train_features.loc[train_indices, features_columns]
            y_train = train_targets.loc[train_indices, target_columns]
            x_val = train_features.loc[val_indices, features_columns]
            y_val = train_targets.loc[val_indices, target_columns]

            model = new_model(config.model_kind, len(features_columns)).to(DEVICE)
            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'

            optimizer = optim.Adam(model.parameters(),
                                   weight_decay=config.weight_decay,
                                   lr=config.learning_rate)
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                             mode='min',
                                                             factor=0.1,
                                                             patience=3,
                                                             eps=1e-4,
                                                             verbose=True)
            best_loss = np.inf

            for epoch in range(config.n_epochs):
                dataset = MoaDataset(x_train.values, y_train.values)
                dataloader = DataLoader(dataset,
                                        batch_size=config.batch_size,
                                        shuffle=True,
                                        drop_last=True)
                train_loss = loop_train(model,
                                        dataloader,
                                        optimizer,
                                        loss_functions=(
                                            smooth_loss_function,
                                            metric_loss_function,
                                        ))

                dataset = MoaDataset(x_val.values, y_val.values)
                dataloader = DataLoader(dataset,
                                        batch_size=config.val_batch_size,
                                        shuffle=False)
                valid_loss, _ = loop_valid(model, dataloader, metric_loss_function)

                print('Epoch {}/{} - loss: {:5.5f} - val_loss: {:5.5f}'.format(
                    epoch + 1, config.n_epochs, train_loss, valid_loss),
                      flush=True)

                logger.update({
                    'epoch': epoch + 1,
                    'loss': train_loss,
                    'val_loss': valid_loss
                })

                scheduler.step(valid_loss)

                if valid_loss < best_loss:
                    best_loss = valid_loss
                    torch.save(model.state_dict(), checkpoint_path)

    oof_preds = np.zeros((len(train_features), len(config.seeds), n_targets))
    test_preds = np.zeros((len(test_features), n_targets))

    for seed_index in range(len(config.seeds)):
        seed = config.seeds[seed_index]
        print(f'Inference for seed {seed}', flush=True)

        _test_preds_in_seed = np.zeros((len(test_features), n_targets))

        for fold_index, (_, valid_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            x_val = train_features.loc[valid_indices, features_columns]
            y_val = train_targets.loc[valid_indices, target_columns]
            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'

            model = new_model(config.model_kind, len(features_columns)).to(DEVICE)
            model.load_state_dict(torch.load(checkpoint_path))

            dataset = MoaDataset(x_val.values, y_val.values)
            dataloader = DataLoader(dataset,
                                    batch_size=config.val_batch_size,
                                    shuffle=False)
            preds = loop_pred(model, dataloader)
            oof_preds[valid_indices, seed_index, :] = preds

            dataset = MoaDataset(test_features[features_columns].values, None)
            dataloader = DataLoader(dataset,
                                    batch_size=config.val_batch_size,
                                    shuffle=False)
            preds = loop_pred(model, dataloader)
            _test_preds_in_seed += preds / config.n_folds

        score = mean_log_loss(train_targets.loc[:, target_columns].values,
                              oof_preds[:, seed_index, :],
                              n_targets=n_targets)
        test_preds += _test_preds_in_seed / len(config.seeds)

        print(f'Score for this seed {score:5.5f}', flush=True)
        logger.update({'val_loss': score})

    # Evaluate the overall validation score
    oof_preds = np.mean(oof_preds, axis=1)
    score = mean_log_loss(train_targets.loc[:, target_columns].values,
                          oof_preds,
                          n_targets=n_targets)
    print(f'Overall score is {score:5.5f}', flush=True)

    # Save the validation predictions
    oof_pred_df = train_targets.copy()
    oof_pred_df.iloc[:, 1:] = oof_preds
    oof_pred_df.to_csv(f'{model_dir}/oof_pred.csv', index=False)

    # Save the log
    logger.update({'val_loss': score})
    logger.save(f'{model_dir}/log.csv')

    # Save the test predictions
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    submission = create_submission(test_features, ['sig_id'] + target_columns)
    submission[target_columns] = test_preds
    submission.loc[test_features['cp_type'] == 'ctl_vehicle', target_columns] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from utils.misc import Logger
from utils.util import generate_cutoffs
from utils.misc import save_model_to_file, mean_abs_percentage_error, xgb_mape

log = Logger(log_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), 'log')).logger

warnings.filterwarnings("ignore")

xgb_installed = False
lgt_installed = False
try:
    import xgboost as xgb
    xgb_installed = True
    import lightgbm as lgt
    lgt_installed = True
except ImportError:
    pass


class TrainModel(BaseEstimator):
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
# @Time : 2019/9/9 9:55
# @Author : peng.wang
# @Email : [email protected]
# @FileName: model.py
# @ProjectName :Facility_Location_FangTai
"""
from gurobipy import *
from utils.misc import Logger
import pandas as pd

# define the log file
log = Logger(log_path='../log').logger

# define the facility location problem
YEAR_DAY = 365


class FacilityLocation(object):
    """
    This class consists of the attributes needed for problem construction,
    some utility functions for post-processing, and one key function for
    building the detailed model.
    """

    def __init__(self, data, config):
        """
        :param data: data class providing all the data used
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
# @Time : 2019/10/8 22:34
# @Author : peng.wang
# @Email : [email protected]
# @FileName: util.py
# @ProjectName :sh-demand-forecast-alg
"""
import pandas as pd
import numpy as np
import requests
import json
import os
from utils.misc import Logger

log = Logger(log_path='log').logger


class DataLoader(object):
    """
    Data loader. Combines a dataset and a sampler, and provides
    an iterable over the given dataset.
    """

    def __init__(self, data, train_len, pred_len, feature_names, target_name,
                 append_train=False):
        self.data = data