def load_merge_set(path):
    def df_add_col_num(df, zfill_width=None):
        if zfill_width is None:
            zfill_width = 0

        mapping = {}
        for idx, key in enumerate(df.keys()):
            mapping[key] = f'col_{str(idx).zfill(zfill_width)}_{key}'

        return df.rename(mapping, axis='columns')

    merged_path = path_join(path, 'merged.csv')
    if os.path.exists(merged_path):
        merged = pd.read_csv(merged_path)
    else:
        train_path = path_join(path, 'train.csv')
        train = pd.read_csv(train_path)

        test_path = path_join(path, 'test.csv')
        test = pd.read_csv(test_path)

        merged = pd.concat([train, test], axis=0)
        merged = df_add_col_num(merged, zfill_width=2)
        merged.to_csv(merged_path, index=False)

    return merged
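# Usage sketch for load_merge_set above (assumption: the dataset directory is a
# hypothetical local path containing train.csv and test.csv, as in the HousePrices
# test further below). The first call concatenates the two files, prefixes the
# columns with 'col_NN_', and caches the result as merged.csv; later calls just
# read the cached file.
def _example_load_merge_set():
    merged = load_merge_set('./data/HousePrices')  # hypothetical path
    print(merged.shape)
    print(list(merged.columns)[:3])  # e.g. ['col_00_Id', 'col_01_MSSubClass', 'col_02_MSZoning']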
def save_meta(self, path):
    setup_directory(path)
    self.metadata.save(path_join(path, 'meta.pkl'))
    self.metadata.save(path_join(path, 'meta.json'))

    self.params_path = path_join(path, 'params.pkl')
    self._save_params(self.params_path)
def __init__(self, data_collection, run_id):
    self.data_collection = data_collection
    self.dc = data_collection
    self.run_id = run_id

    self.summary_train_loss = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'train'), 'train_loss')
    self.summary_train_acc = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'train'), 'train_acc')
    self.summary_test_acc = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'test'), 'test_acc')
def __init__(self, data_collection, run_id, **kwargs):
    self.data_collection = data_collection
    self.run_id = run_id
    self.kwargs = kwargs

    base_path = path_join(SUMMARY_PATH, self.run_id)
    test_path = path_join(base_path, 'test')
    train_path = path_join(base_path, 'train')
    self.test_path = test_path
    self.train_path = train_path

    self.summary_train_loss = TFSummaryScalar(train_path, 'train_loss')
    # self.summary_train_acc = TFSummaryScalar(train_path, 'train_acc')
    self.summary_test_acc = TFSummaryScalar(test_path, 'test_acc')
def load_sample_image():
    sample_IMAGE_PATH = path_join(HEAD_PATH, 'sample/images')
    sample_MASK_PATH = path_join(HEAD_PATH, 'sample/masks')
    sample_size = 7
    limit = None

    print('collect sample images')
    train_images, _, _ = collect_images(sample_IMAGE_PATH, limit=limit)
    train_images = train_images.reshape([-1, 101, 101, 1])

    print('collect sample masks')
    train_mask_images, _, _ = collect_images(sample_MASK_PATH, limit=limit)
    train_mask_images = train_mask_images.reshape([-1, 101, 101, 1])

    x = train_images
    y = train_mask_images
    return x, y
def train(self, n_epoch=None, callbacks=None, datas=None):
    clf = self.model

    if datas is None:
        datas = self.init_dataset()
    train_x_enc, train_y_onehot, valid_x_enc, valid_y_onehot = self.encode_datas(datas)

    if callbacks is None:
        dc = collect_data_callback(
            train_x_enc, train_y_onehot,
            valid_x_enc, valid_y_onehot
        )
        callbacks = [
            dc,
            log_callback(dc),
            summary_callback(dc, clf.run_id),
            BestSave(path_join(INSTANCE_PATH, clf.run_id), max_best=True).trace_on(dc, 'test_score'),
            # TriangleLRScheduler(7, 0.001, 0.0005),
            ReduceLrOnPlateau(0.5, 5, 0.0001, min_best=False).trace_on(dc, 'test_score'),
            # EarlyStop(10).trace_on(dc, 'test_score'),
        ]

    if n_epoch is None:
        n_epoch = 50

    # clf.init_adam_momentum()
    clf.update_learning_rate(0.01)
    clf.train(
        train_x_enc, train_y_onehot,
        epoch=n_epoch,
        epoch_callbacks=callbacks,
    )
def plot_mask_image(self, model, dataset, metric, epoch):
    run_id = model.run_id

    x, y = dataset.next_batch(20, update_cursor=False)
    predict = model.predict(x)
    predict = predict * 255
    proba = model.predict_proba(x)
    proba = proba.reshape([-1, 101, 101, 1]) * 255
    predict = predict.reshape([-1, 101, 101, 1])

    def scramble_column(*args, size=10):
        ret = []
        for i in range(0, len(args[0]), size):
            for j in range(len(args)):
                ret += [args[j][i:i + size]]

        return np.concatenate(ret, axis=0)

    np_tile = scramble_column(x, y, predict, proba)
    self.plot.plot_image_tile(
        np_tile,
        title=f'predict_epoch({epoch})',
        column=10,
        path=path_join(PLOT_PATH, run_id, f'predict_mask/({epoch}).png'))
def fold_train(self, epoch=50, k=7):
    models = []
    for fold in range(2, k):
        clf = self.new_model()
        models += [clf]

        datas = self.init_dataset(k, fold)
        train_x_enc, train_y_onehot, valid_x_enc, valid_y_onehot = self.encode_datas(datas)

        dc = collect_data_callback(
            train_x_enc, train_y_onehot,
            valid_x_enc, valid_y_onehot
        )
        callbacks = [
            dc,
            log_callback(dc),
            summary_callback(dc, clf.run_id),
            BestSave(path_join(INSTANCE_PATH, f'fold_{fold}'), max_best=True).trace_on(dc, 'test_score'),
            # TriangleLRScheduler(7, 0.001, 0.0005),
            ReduceLrOnPlateau(0.7, 5, 0.0001),
            # EarlyStop(16),
        ]

        clf.update_learning_rate(0.01)
        clf.train(
            train_x_enc, train_y_onehot,
            epoch=epoch,
            epoch_callbacks=callbacks
        )
def pipeline(self, train_cache=False, predict_cache=False):
    self.train_models(cache=train_cache)
    predict_df = self.predict(cache=predict_cache)
    inverse_predict_df = self.transformer.inverse_transform(predict_df)
    transformed_result_df = self.transform_to_result(inverse_predict_df)
    save_samsung(transformed_result_df, path_join(path_head, 'result_predict.csv'))
def plot_non_mask_rate_iou(self, model, dataset, metric, epoch):
    run_id = model.run_id

    test_y = self.data_collection.test_y_dec
    test_predict = self.data_collection.test_predict_dec

    size = 512
    test_y = test_y[:size]
    test_predict = test_predict[:size]

    xs = masks_rate(test_y)
    xs = xs.reshape([-1])
    xs /= 255

    ys = np.array([
        Metrics.miou(true, predict)
        for true, predict in zip(test_y, test_predict)
    ])

    dots = np.array([[x, y] for x, y in zip(xs, ys)])
    self.plot.scatter_2d(
        dots,
        title='test set mask rate and iou',
        path=path_join(PLOT_PATH, run_id, f'test_set_mask_rate_iou/({epoch}).png'),
        x_label='mask_rate',
        y_label='iou')
def test_HousePrices_dataset():
    dataset_path = r'C:\Users\demetoir_desktop\PycharmProjects\MLtools\data\HousePrices'
    merge_df = HousePricesHelper.load_merge_set(dataset_path)

    merge_null_clean = HousePricesHelper.null_cleaning(merge_df)
    merge_type_cast = HousePricesHelper.type_casting(merge_null_clean)
    transformed = HousePricesHelper.transform(merge_type_cast)

    train_df, test_df = HousePricesHelper.train_test_split(transformed)
    train_df.to_csv(path_join(dataset_path, 'transformed_train.csv'), index=False)
    test_df.to_csv(path_join(dataset_path, 'transformed_test.csv'), index=False)
def __call__(self, model, dataset, metric, epoch):
    sign = 1 if self.max_best else -1

    self.log(
        f'\n'
        f'{self.name} current top_k\n'
        f'{pformat(self.top_k[1:])}\n'
    )

    try:
        for i in reversed(range(1, self.k + 1)):
            if sign * self.top_k[i - 1] > sign * metric > sign * self.top_k[i]:
                # update top_k
                self.top_k.insert(i, metric)
                self.top_k.pop(self.k + 1)

                # dump top_k json
                dump_json(self.top_k, path_join(self.path, 'top_k.json'))

                self.log(
                    f'update top_k at {i}th, metric = {metric}\n'
                    f'{pformat(self.top_k[1:])}'
                )

                if self.save_model:
                    # delete the worst (k-th) checkpoint directory
                    shutil.rmtree(path_join(self.path, f'top_{self.k}'))

                    # shift the remaining checkpoint directories down one rank
                    path_pairs = [
                        (
                            path_join(self.path, f'top_{idx}'),
                            path_join(self.path, f'top_{idx + 1}')
                        )
                        for idx in range(i, self.k)
                    ]
                    path_pairs = list(reversed(path_pairs))
                    for src, dst in path_pairs:
                        os.rename(src, dst)

                    # save the model into the freed slot
                    save_path = path_join(self.path, f'top_{i}')
                    model.save(save_path)

                break
    except BaseException as e:
        print(error_trace(e))
        raise RuntimeError(f'Top_k_save failed: {e}')
def test_train_dataset_callback():
    sample_IMAGE_PATH = path_join(HEAD_PATH, 'sample/images')
    sample_MASK_PATH = path_join(HEAD_PATH, 'sample/masks')
    sample_size = 7
    limit = None

    print('collect sample images')
    train_images, _, _ = collect_images(sample_IMAGE_PATH, limit=limit)
    train_images = train_images.reshape([-1, 101, 101])

    print('collect sample masks')
    train_mask_images, _, _ = collect_images(sample_MASK_PATH, limit=limit)
    train_mask_images = train_mask_images.reshape([-1, 101, 101])

    x = train_images
    y = train_mask_images

    import cv2
    x = np.array([cv2.resize(a, (128, 128)) for a in x]).reshape([-1, 128, 128, 1])
    y = np.array([cv2.resize(a, (128, 128)) for a in y]).reshape([-1, 128, 128, 1])

    y_gt = y
    y_encode = mask_label_encoder.to_label(y)
    print(x.shape)
    print(y_encode.shape)

    Unet = SemanticSegmentation(stage=4, batch_size=7)
    # Unet.train(x, y_encode, epoch=100)
    Unet.train(x, y_encode, epoch=1000, dataset_callback=dataset_callback)
    Unet.train(x, y_encode, epoch=1000, dataset_callback=dataset_callback)

    score = Unet.score(x, y_encode)
    pprint(score)

    predict = Unet.predict(x)
    pprint(predict[0])
    pprint(predict.shape)

    proba = Unet.predict_proba(x)
    pprint(proba[0])
    pprint(proba.shape)

    metric = Unet.metric(x, y_encode)
    print(metric)

    predict = mask_label_encoder.from_label(predict)
    plot.plot_image_tile(
        np.concatenate([x, predict, y_gt], axis=0),
        title='predict',
        column=sample_size)
def _plot_scatter(model, x, y, plot, name):
    # note: `epoch` and `self.run_id` are taken from the enclosing callback scope
    predict = model.predict(x)

    gt = np.array([[idx, y_val] for idx, y_val in enumerate(y)])
    predict = np.array([[idx, p] for idx, p in enumerate(predict)])
    plot.scatter_2d(
        gt, predict,
        labels=['gt', 'predict'],
        title=f'mask_rate_{name}_{epoch}',
        path=path_join(PLOT_PATH, self.run_id, f'{name}', f'({epoch})'))
def save_checkpoint(self, path):
    setup_directory(path)

    check_point_path = path_join(path, 'check_point', 'instance.ckpt')
    setup_file(check_point_path)

    saver = tf.train.Saver(self.main_graph_var_list + self.misc_ops_var_list)
    saver.save(self.sess, check_point_path)
def restore(self, path, var_list=None):
    self.log.info(f'restore from {path}')

    if var_list is None:
        var_list = self.main_graph_var_list + self.misc_ops_var_list

    saver = tf.train.Saver(var_list)
    saver.restore(self.sess, path_join(path, 'check_point', 'instance.ckpt'))
def load(self, path):
    pickle_path = path_join(path, 'transformed.pkl')
    if not os.path.exists(pickle_path) or not self.caching:
        df = init_samsung(cache=True)

        self.transformer = transformer()
        df = self.transformer.transform(df)
        df = df.drop_duplicates(keep='first')

        for key in df:
            self.add_data(key, df[key])
        self.to_pickle(pickle_path)

        trans_csv_path = path_join(path, 'data_transformed.csv')
        save_samsung(df, trans_csv_path)
    else:
        self.from_pickle(pickle_path, overwrite_self=True)
def save_tf_summary_params(path, params):
    with tf.Session() as sess:
        run_id = params['run_id']
        path = path_join(path, run_id)

        summary_params = TFSummaryParams(path, 'params')
        summary_params.update(sess, params)
        summary_params.flush()
        summary_params.close()
        print(f'TFSummaryParams saved at {path}')
def download_and_unzip(path=None):
    if path is None:
        path = './TGS_salt'

    import subprocess
    cmd = 'kaggle competitions download -c tgs-salt-identification-challenge'
    cmd += f' -p {path}'
    # cmd += f' --unzip'
    # cmd is a single string, so run it through the shell
    subprocess.run(cmd, shell=True)

    train_zip = 'train.zip'
    test_zip = 'test.zip'
    train_extract = 'train'
    test_extract = 'test'
    extract_zip(path_join(path, train_zip), path_join(path, train_extract))
    extract_zip(path_join(path, test_zip), path_join(path, test_extract))
def to_kaggle_submit_csv(path, Ys):
    if path is None:
        path = path_join('.', 'submit.csv')

    df = pd.DataFrame()
    df[PASSENGERID] = [i for i in range(892, 1309 + 1)]
    df[SURVIVED] = Ys
    df.to_csv(path, index=False)
def build_dataset(path):
    merge_df = load_merge_set()

    cleaner = titanic_null_cleaner(merge_df, df_Xs_keys, df_Ys_key)
    cleaner.boilerplate_maker('./titanic_cleaner.py')
    merge_df = cleaner.clean()

    typecaster = titanic_typecasting(merge_df, df_Xs_keys, df_Ys_key)
    typecaster.boilerplate_maker('./titanic_typecaster.py')
    merge_df = typecaster.type_cast()

    transformer = titanic_transformer(merge_df, df_Xs_keys, df_Ys_key)
    transformer.boilerplate_maker('./titanic_transformer.py')
    merge_df = transformer.transform()

    train, test = split_train_test(merge_df)
    train.to_csv(path_join(path, 'trans_train.csv'), index=False)
    test.to_csv(path_join(path, 'trans_test.csv'), index=False)
def plot_image(self, np_img, title=None, path=None, **kwargs):
    if title is None:
        title = time_stamp() + self.finger_print(6)

    extend = self.extend
    if path is None:
        path = path_join('.', 'matplot', title + extend)

    setup_file(path)
    np_image_save(np_img, path)
def __init__(self, model, train_x, train_y, test_x, test_y, params):
    super().__init__()
    self.train_x = train_x
    self.train_y = train_y
    self.model = model
    self.test_x = test_x
    self.test_y = test_y
    self.params = params
    self.run_id = self.params['run_id']

    self.top_k_save = Top_k_save(path_join(INSTANCE_PATH, self.run_id, 'top_k'), max_best=False)

    self.summary_train_loss = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'train'), 'train_loss')
    self.summary_train_acc = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'train'), 'train_acc')
    self.summary_test_acc = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'test'), 'test_acc')

    self.sample_size = 200
    # sample the train plots from the train split and the test plots from the test split
    self.sample_train_x, self.sample_train_y = self.make_plot_data(self.train_x, self.train_y, self.sample_size)
    self.sample_test_x, self.sample_test_y = self.make_plot_data(self.test_x, self.test_y, self.sample_size)
def __init__(self, source_model, source_scope, verbose=0):
    super().__init__(verbose=verbose)

    if not source_model.is_built:
        raise RuntimeError('transfer failed, source model must be built')

    self.source_model = source_model
    self.source_scope = source_scope
    self.temp_dir = './temp_transfer'
    self.temp_path = path_join(self.temp_dir, time_stamp())
def __init__(self, path, k=5, max_best=True, save_model=True, name='top_k_save', log=print):
    self.path = path
    self.k = k
    self.max_best = max_best
    self.save_model = save_model
    self.name = name
    self.log = log

    self.top_k_json_path = path_join(self.path, 'top_k.json')
    if os.path.exists(self.top_k_json_path):
        self.top_k = load_json(self.top_k_json_path)
    else:
        # index 0 is a sentinel so comparisons against top_k[i - 1] always hold
        if self.max_best:
            self.top_k = [np.Inf] + [-np.Inf for _ in range(self.k)]
        else:
            self.top_k = [-np.Inf] + [np.Inf for _ in range(self.k)]

    if self.save_model:
        for i in range(1, self.k + 1):
            setup_directory(path_join(self.path, f'top_{i}'))
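# A minimal sketch of how Top_k_save is driven from an epoch loop (assumptions:
# `model`, `valid_x`, `valid_y`, and the run directory below are hypothetical
# stand-ins; the real trainer callbacks above wire in their own objects). Each
# call compares the metric against the current top-k list, shifts the top_i
# checkpoint directories, and saves the model into the freed slot.
def _example_top_k_loop(model, valid_x, valid_y, n_epoch=50):
    top_k = Top_k_save(path_join(INSTANCE_PATH, 'example_run', 'top_k'), k=5, max_best=True)
    for epoch in range(1, n_epoch + 1):
        # ... train one epoch here ...
        score = model.score(valid_x, valid_y)
        top_k(model, None, score, epoch)  # signature: (model, dataset, metric, epoch)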
def init_samsung(cache=True):
    init_csv_path = path_join(path_head, 'data_init.csv')
    if not os.path.exists(init_csv_path) or not cache:
        df = load_samsung(origin_csv_path)

        # align the train columns with the test columns
        path = path_join(path_head, 'test_kor.csv')
        test_df = load_samsung(path)
        test_cols = test_df.columns
        df = DF(df[test_cols])
        df = add_col_num(df, 2)

        # drop rows that contain nulls
        idx = df[df['c15_당사자종별_2당_대분류'].isna()].index
        df = drop_rows(df, idx)
        df = DF(df)

        save_samsung(df, init_csv_path)
    else:
        df = load_samsung(init_csv_path)

    return df
def test_df(self):
    if self._test_df is None:
        path = path_join(path_head, 'test_kor.csv')
        test_df = load_samsung(path)
        test_df = add_col_num(test_df)
        test_df = self.fill_rand(test_df)
        test_df = self.fill_inference_able(test_df)
        save_samsung(test_df, './test.csv')
        self._test_df = test_df

    return self._test_df
def load(self, path):
    pkl_path = path_join(path, 'test.pkl')
    if not os.path.exists(pkl_path) or not self.caching:
        make_data_pkl()

    pkl = load_pickle(pkl_path)
    self.add_data('image', pkl['image'])
    self.add_data('id', pkl['id'])
    self.add_data('depth', pkl['depths'])
    self.add_data('depth_image', pkl['depth_image'])

    # self.x_keys = ['image', 'depth']
    self.x_keys = ['image']
def save(self, path=None):
    if not self.is_built:
        raise RuntimeError(f'cannot save un-built model, {self}')

    if not self.sessionManager.is_opened:
        raise RuntimeError(f'cannot save model without session, {self}')

    if path is None:
        self.log.info('save directory not specified, use default directory')
        path = path_join(ROOT_PATH, 'instance', self.metadata.id)

    self.log.info("save at {}".format(path))
    self.save_checkpoint(path)
    self.save_meta(path)
def transform_to_result(self, predict_df):
    result_df = load_samsung(path_join(path_head, 'result_kor.csv'))
    size = len(result_df)

    predict_cols = list(predict_df.columns)
    result_cols = [str.upper(a) for a in 'abcdefghijklmnopqrstuvwxyz']
    result_cols = result_cols[:len(predict_cols)]
    result_col_to_predict_col = dict(zip(result_cols, predict_cols))

    for i in range(size):
        a = result_df.loc[i, :]
        row = int(a['열']) - 2
        col = a['행']
        predict_col = result_col_to_predict_col[col]
        result_df.loc[i, '값'] = predict_df.loc[row, predict_col]
        # print(row, col, type(row), type(col), predict_col)

    return result_df