Example #1
    def load_merge_set(path):
        def df_add_col_num(df, zfill_width=None):
            if zfill_width is None:
                zfill_width = 0

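            # prefix each column name with its zero-padded index, e.g. 'Id' -> 'col_00_Id'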
            mapping = {}
            for idx, key in enumerate(df.keys()):
                mapping[key] = f'col_{str(idx).zfill(zfill_width)}_{key}'

            return df.rename(mapping, axis='columns')

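        # reuse merged.csv from a previous run; otherwise build and cache it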
        merged_path = path_join(path, 'merged.csv')
        if os.path.exists(merged_path):
            merged = pd.read_csv(merged_path)
        else:
            train_path = path_join(path, 'train.csv')
            train = pd.read_csv(train_path)

            test_path = path_join(path, 'test.csv')
            test = pd.read_csv(test_path)

            merged = pd.concat([train, test], axis=0)
            merged = df_add_col_num(merged, zfill_width=2)

            merged.to_csv(merged_path, index=False)
        return merged
Example #2
    def save_meta(self, path):
        setup_directory(path)

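        # persist the metadata in both pickle and json form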
        self.metadata.save(path_join(path, 'meta.pkl'))
        self.metadata.save(path_join(path, 'meta.json'))

        self.params_path = path_join(path, 'params.pkl')
        self._save_params(self.params_path)
Example #3
    def __init__(self, data_collection, run_id):
        self.data_collection = data_collection
        self.dc = data_collection

        self.run_id = run_id
        self.summary_train_loss = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'train'), 'train_loss')
        self.summary_train_acc = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'train'), 'train_acc')
        self.summary_test_acc = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'test'), 'test_acc')
Example #4
    def __init__(self, data_collection, run_id, **kwargs):
        self.data_collection = data_collection
        self.run_id = run_id
        self.kwargs = kwargs

        base_path = path_join(SUMMARY_PATH, self.run_id)
        test_path = path_join(base_path, 'test')
        train_path = path_join(base_path, 'train')
        self.test_path = test_path
        self.train_path = train_path
        self.summary_train_loss = TFSummaryScalar(train_path, 'train_loss')
        # self.summary_train_acc = TFSummaryScalar(train_path, 'train_acc')
        self.summary_test_acc = TFSummaryScalar(test_path, 'test_acc')
Example #5
def load_sample_image():
    sample_IMAGE_PATH = path_join(HEAD_PATH, 'sample/images')
    sample_MASK_PATH = path_join(HEAD_PATH, 'sample/masks')

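    # TGS salt images are 101x101 grayscale; the reshape below adds the channel axis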
    limit = None
    print('collect sample images')
    train_images, _, _ = collect_images(sample_IMAGE_PATH, limit=limit)
    train_images = train_images.reshape([-1, 101, 101, 1])
    print('collect sample masks')
    train_mask_images, _, _ = collect_images(sample_MASK_PATH, limit=limit)
    train_mask_images = train_mask_images.reshape([-1, 101, 101, 1])
    x = train_images
    y = train_mask_images

    return x, y
Example #6
    def train(self, n_epoch=None, callbacks=None, datas=None):
        clf = self.model

        if datas is None:
            datas = self.init_dataset()

        train_x_enc, train_y_onehot, valid_x_enc, valid_y_onehot = self.encode_datas(datas)

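        # default callbacks: metric collection, logging, TF summaries, best-model saving, LR reduction on plateau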
        if callbacks is None:
            dc = collect_data_callback(
                train_x_enc, train_y_onehot,
                valid_x_enc, valid_y_onehot
            )
            callbacks = [
                dc,
                log_callback(dc),
                summary_callback(dc, clf.run_id),
                BestSave(path_join(INSTANCE_PATH, clf.run_id), max_best=True).trace_on(dc, 'test_score'),
                # TriangleLRScheduler(7, 0.001, 0.0005),
                ReduceLrOnPlateau(0.5, 5, 0.0001, min_best=False).trace_on(dc, 'test_score'),
                # EarlyStop(10).trace_on(dc, 'test_score'),
            ]

        if n_epoch is None:
            n_epoch = 50
        # clf.init_adam_momentum()
        clf.update_learning_rate(0.01)
        clf.train(
            train_x_enc, train_y_onehot, epoch=n_epoch, epoch_callbacks=callbacks,
        )
Example #7
    def plot_mask_image(self, model, dataset, metric, epoch):
        run_id = model.run_id

        x, y = dataset.next_batch(20, update_cursor=False)

        predict = model.predict(x)
        predict = predict * 255
        proba = model.predict_proba(x)
        proba = proba.reshape([-1, 101, 101, 1]) * 255
        predict = predict.reshape([-1, 101, 101, 1])

        def scramble_column(*args, size=10):
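            # interleave chunks of 'size' rows from each array so tile rows alternate x, y, predict, proba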
            ret = []
            for i in range(0, len(args[0]), size):
                for j in range(len(args)):
                    ret += [args[j][i:i + size]]

            return np.concatenate(ret, axis=0)

        np_tile = scramble_column(x, y, predict, proba)
        self.plot.plot_image_tile(np_tile,
                                  title=f'predict_epoch({epoch})',
                                  column=10,
                                  path=path_join(
                                      PLOT_PATH, run_id,
                                      f'predict_mask/({epoch}).png'))
Example #8
    def fold_train(self, epoch=50, k=7):
        models = []
        for fold in range(2, k):
            clf = self.new_model()
            models += [clf]

            datas = self.init_dataset(k, fold)
            train_x_enc, train_y_onehot, valid_x_enc, valid_y_onehot = self.encode_datas(datas)

            dc = collect_data_callback(
                train_x_enc, train_y_onehot,
                valid_x_enc, valid_y_onehot
            )
            callbacks = [
                dc,
                log_callback(dc),
                summary_callback(dc, clf.run_id),
                BestSave(path_join(INSTANCE_PATH, f'fold_{fold}'), max_best=True).trace_on(dc, 'test_score'),
                # TriangleLRScheduler(7, 0.001, 0.0005),
                ReduceLrOnPlateau(0.7, 5, 0.0001),
                # EarlyStop(16),
            ]

            clf.update_learning_rate(0.01)
            clf.train(
                train_x_enc, train_y_onehot, epoch=epoch, epoch_callbacks=callbacks
            )
Example #9
    def pipeline(self, train_cache=False, predict_cache=False):
        self.train_models(cache=train_cache)
        predict_df = self.predict(cache=predict_cache)

        inverse_predict_df = self.transformer.inverse_transform(predict_df)
        transformed_result_df = self.transform_to_result(inverse_predict_df)
        save_samsung(transformed_result_df, path_join(path_head, 'result_predict.csv'))
Example #10
    def plot_non_mask_rate_iou(self, model, dataset, metric, epoch):
        run_id = model.run_id
        test_y = self.data_collection.test_y_dec
        test_predict = self.data_collection.test_predict_dec

        size = 512
        test_y = test_y[:size]
        test_predict = test_predict[:size]

        xs = masks_rate(test_y)
        xs = xs.reshape([-1])
        xs /= 255

        ys = np.array([
            Metrics.miou(true, predict)
            for true, predict in zip(test_y, test_predict)
        ])

        dots = np.stack([xs, ys], axis=1)

        self.plot.scatter_2d(dots,
                             title='test set mask rate and iou',
                             path=path_join(
                                 PLOT_PATH, run_id,
                                 f'test_set_mask_rate_iou/({epoch}).png'),
                             x_label='mask_rate',
                             y_label='iou')
Example #11
def test_HousePrices_dataset():
    dataset_path = r'C:\Users\demetoir_desktop\PycharmProjects\MLtools\data\HousePrices'

    merge_df = HousePricesHelper.load_merge_set(dataset_path)

    merge_null_clean = HousePricesHelper.null_cleaning(merge_df)

    merge_type_cast = HousePricesHelper.type_casting(merge_null_clean)

    transformed = HousePricesHelper.transform(merge_type_cast)

    train_df, test_df = HousePricesHelper.train_test_split(transformed)
    train_df.to_csv(path_join(dataset_path, 'transformed_train.csv'),
                    index=False)
    test_df.to_csv(path_join(dataset_path, 'transformed_test.csv'),
                   index=False)
Example #12
    def __call__(self, model, dataset, metric, epoch):
        sign = 1 if self.max_best else -1

        self.log(
            f'\n'
            f'{self.name} current top_k\n'
            f'{pformat(self.top_k[1:])}\n'
        )

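        # top_k[0] is a sentinel bound; slots 1..k hold the current best metrics in order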
        try:
            for i in reversed(range(1, self.k + 1)):
                if sign * self.top_k[i - 1] > sign * metric > sign * self.top_k[i]:
                    # update top_k
                    self.top_k.insert(i, metric)
                    self.top_k.pop(self.k + 1)

                    # dump top_k json
                    dump_json(self.top_k, path_join(self.path, 'top_k.json'))
                    self.log(
                        f'update top_k at {i}th, metric = {metric}\n'
                        f'{pformat(self.top_k[1:])}'
                    )

                    if self.save_model:
                        # del worst dir
                        shutil.rmtree(path_join(self.path, f'top_{self.k}'))

                        # shift dir
                        path_pairs = [
                            (
                                path_join(self.path, f'top_{idx}'),
                                path_join(self.path, f'top_{idx+1}')
                            )
                            for idx in range(i, self.k)
                        ]
                        path_pairs = list(reversed(path_pairs))
                        for src, dst in path_pairs:
                            os.rename(src, dst)

                        # save model
                        save_path = path_join(self.path, f'top_{i}')
                        model.save(save_path)

                    break
        except BaseException as e:
            print(error_trace(e))
            raise RuntimeError(f'top-k save failed: {e}') from e
Example #13
def test_train_dataset_callback():
    sample_IMAGE_PATH = path_join(HEAD_PATH, 'sample/images')
    sample_MASK_PATH = path_join(HEAD_PATH, 'sample/masks')

    sample_size = 7
    limit = None
    print('collect sample images')
    train_images, _, _ = collect_images(sample_IMAGE_PATH, limit=limit)
    train_images = train_images.reshape([-1, 101, 101])
    print('collect sample masks')
    train_mask_images, _, _ = collect_images(sample_MASK_PATH, limit=limit)
    train_mask_images = train_mask_images.reshape([-1, 101, 101])
    x = train_images
    y = train_mask_images

    import cv2

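    # resize 101x101 -> 128x128; 128 is divisible by 2**4, which the stage-4 net presumably requires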
    x = np.array([cv2.resize(a, (128, 128)) for a in x]).reshape([-1, 128, 128, 1])
    y = np.array([cv2.resize(a, (128, 128)) for a in y]).reshape([-1, 128, 128, 1])
    y_gt = y

    y_encode = mask_label_encoder.to_label(y)
    print(x.shape)
    print(y_encode.shape)

    Unet = SemanticSegmentation(stage=4, batch_size=7)
    # Unet.train(x, y_encode, epoch=100)
    Unet.train(x, y_encode, epoch=1000, dataset_callback=dataset_callback)
    Unet.train(x, y_encode, epoch=1000, dataset_callback=dataset_callback)

    score = Unet.score(x, y_encode)
    pprint(score)

    predict = Unet.predict(x)
    pprint(predict[0])
    pprint(predict.shape)

    proba = Unet.predict_proba(x)
    pprint(proba[0])
    pprint(proba.shape)

    metric = Unet.metric(x, y_encode)
    print(metric)

    predict = mask_label_encoder.from_label(predict)
    plot.plot_image_tile(np.concatenate([x, predict, y_gt], axis=0), title='predict', column=sample_size)
Example #14
    def _plot_scatter(self, model, x, y, plot, name, epoch):
        predict = model.predict(x)
        gt = np.array([[idx, value] for idx, value in enumerate(y)])
        predict = np.array([[idx, value] for idx, value in enumerate(predict)])
        plot.scatter_2d(
            gt, predict, labels=['gt', 'predict'],
            title=f'mask_rate_{name}_{epoch}',
            path=path_join(PLOT_PATH, self.run_id, f'{name}', f'({epoch})'))
Example #15
    def save_checkpoint(self, path):
        setup_directory(path)

        check_point_path = path_join(path, 'check_point', 'instance.ckpt')
        setup_file(check_point_path)
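        # checkpoint only the main-graph and misc-op variables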
        saver = tf.train.Saver(self.main_graph_var_list +
                               self.misc_ops_var_list)
        saver.save(self.sess, check_point_path)
Example #16
    def restore(self, path, var_list=None):
        self.log.info(f'restore from {path}')

        if var_list is None:
            var_list = self.main_graph_var_list + self.misc_ops_var_list

        saver = tf.train.Saver(var_list)
        saver.restore(self.sess, path_join(path, 'check_point',
                                           'instance.ckpt'))
Example #17
    def load(self, path):
        pickle_path = path_join(path, 'transformed.pkl')
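        # rebuild and re-cache when the pickle is missing or caching is disabled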
        if not os.path.exists(pickle_path) or not self.caching:
            df = init_samsung(cache=True)

            self.transformer = transformer()
            df = self.transformer.transform(df)
            df = df.drop_duplicates(keep='first')

            for key in df:
                self.add_data(key, df[key])

            self.to_pickle(pickle_path)

            trans_csv_path = path_join(path, 'data_transformed.csv')
            save_samsung(df, trans_csv_path)
        else:
            self.from_pickle(pickle_path, overwrite_self=True)
Example #18
def save_tf_summary_params(path, params):
    with tf.Session() as sess:
        run_id = params['run_id']
        path = path_join(path, run_id)
        summary_params = TFSummaryParams(path, 'params')
        summary_params.update(sess, params)
        summary_params.flush()
        summary_params.close()
        print(f'TFSummaryParams save at {path}')
Example #19
def download_and_unzip(path=None):
    if path is None:
        path = './TGS_salt'

    import subprocess

    # requires the kaggle CLI to be installed and authenticated
    cmd = 'kaggle competitions download -c tgs-salt-identification-challenge'
    cmd += f' -p {path}'
    # cmd += f' --unzip'
    subprocess.run(cmd, shell=True)  # shell=True lets the command run as a single string

    train_zip = 'train.zip'
    test_zip = 'test.zip'
    train_extract = 'train'
    test_extract = 'test'

    extract_zip(path_join(path, train_zip), path_join(path, train_extract))
    extract_zip(path_join(path, test_zip), path_join(path, test_extract))
Example #20
    def to_kaggle_submit_csv(path, Ys):
        if path is None:
            path = path_join('.', 'submit.csv')
        df = pd.DataFrame()

        df[PASSENGERID] = list(range(892, 1309 + 1))  # Titanic test-set passenger ids
        df[SURVIVED] = Ys

        df.to_csv(path, index=False)
Example #21
def build_dataset(path):
    merge_df = load_merge_set(path)

    cleaner = titanic_null_cleaner(merge_df, df_Xs_keys, df_Ys_key)
    cleaner.boilerplate_maker('./titanic_cleaner.py')
    merge_df = cleaner.clean()

    typecaster = titanic_typecasting(merge_df, df_Xs_keys, df_Ys_key)
    typecaster.boilerplate_maker('./titanic_typecaster.py')
    merge_df = typecaster.type_cast()

    transformer = titanic_transformer(merge_df, df_Xs_keys, df_Ys_key)
    transformer.boilerplate_maker('./titanic_transformer.py')
    merge_df = transformer.transform()

    train, test = split_train_test(merge_df)

    train.to_csv(path_join(path, 'trans_train.csv'), index=False)
    test.to_csv(path_join(path, 'trans_test.csv'), index=False)
Example #22
    def plot_image(self, np_img, title=None, path=None, **kwargs):
        if title is None:
            title = time_stamp() + self.finger_print(6)

        extend = self.extend
        if path is None:
            path = path_join('.', 'matplot', title + extend)
        setup_file(path)

        np_image_save(np_img, path)
Example #23
    def __init__(self, model, train_x, train_y, test_x, test_y, params):
        super().__init__()
        self.train_x = train_x
        self.train_y = train_y
        self.model = model
        self.test_x = test_x
        self.test_y = test_y
        self.params = params

        self.run_id = self.params['run_id']

        self.top_k_save = Top_k_save(path_join(INSTANCE_PATH, self.run_id, 'top_k'), max_best=False)

        self.summary_train_loss = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'train'), 'train_loss')
        self.summary_train_acc = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'train'), 'train_acc')
        self.summary_test_acc = TFSummaryScalar(path_join(SUMMARY_PATH, self.run_id, 'test'), 'test_acc')

        self.sample_size = 200
        self.sample_train_x, self.sample_train_y = self.make_plot_data(self.train_x, self.train_y, self.sample_size)
        self.sample_test_x, self.sample_test_y = self.make_plot_data(self.test_x, self.test_y, self.sample_size)
Example #24
    def __init__(self, source_model, source_scope, verbose=0):
        super().__init__(verbose=verbose)

        if not source_model.is_built:
            raise RuntimeError('transfer failed: source model must be built first')

        self.source_model = source_model
        self.source_scope = source_scope

        self.temp_dir = './temp_transfer'
        self.temp_path = path_join(self.temp_dir, time_stamp())
Example #25
    def __init__(self, path, k=5, max_best=True, save_model=True, name='top_k_save', log=print):
        self.path = path
        self.k = k
        self.max_best = max_best
        self.save_model = save_model
        self.name = name
        self.log = log

        self.top_k_json_path = path_join(self.path, 'top_k.json')
        if os.path.exists(self.top_k_json_path):
            self.top_k = load_json(self.top_k_json_path)
        else:
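            # slot 0 is a sentinel bound; slots 1..k start at the worst possible value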
            if self.max_best:
                self.top_k = [np.inf] + [-np.inf] * self.k
            else:
                self.top_k = [-np.inf] + [np.inf] * self.k

        if self.save_model:
            for i in range(1, self.k + 1):
                setup_directory(path_join(self.path, f'top_{i}'))
Example #26
def init_samsung(cache=True):
    init_csv_path = path_join(path_head, 'data_init.csv')
    if not os.path.exists(init_csv_path) or not cache:
        df = load_samsung(origin_csv_path)
        # align the train columns with the test columns
        path = path_join(path_head, 'test_kor.csv')
        test_df = load_samsung(path)
        test_cols = test_df.columns
        df = DF(df[test_cols])
        df = add_col_num(df, 2)

        # drop null include rows
        idx = df[df['c15_당사자종별_2당_대분류'].isna()].index
        df = drop_rows(df, idx)
        df = DF(df)

        save_samsung(df, init_csv_path)
    else:
        df = load_samsung(init_csv_path)

    return df
Example #27
    def test_df(self):
        if self._test_df is None:
            path = path_join(path_head, 'test_kor.csv')
            test_df = load_samsung(path)
            test_df = add_col_num(test_df)
            test_df = self.fill_rand(test_df)
            test_df = self.fill_inference_able(test_df)
            save_samsung(test_df, './test.csv')

            self._test_df = test_df

        return self._test_df
Example #28
    def load(self, path):
        pkl_path = path_join(path, 'test.pkl')
        if not os.path.exists(pkl_path) or not self.caching:
            make_data_pkl()

        pkl = load_pickle(pkl_path)

        self.add_data('image', pkl['image'])
        self.add_data('id', pkl['id'])
        self.add_data('depth', pkl['depths'])
        self.add_data('depth_image', pkl['depth_image'])
        # self.x_keys = ['image', 'depth']
        self.x_keys = ['image']
Example #29
    def save(self, path=None):
        if not self.is_built:
            raise RuntimeError(f'cannot save an unbuilt model, {self}')

        if not self.sessionManager.is_opened:
            raise RuntimeError(f'cannot save a model without an open session, {self}')

        if path is None:
            self.log.info(
                'save directory not specified, using default directory')
            path = path_join(ROOT_PATH, 'instance', self.metadata.id)

        self.log.info(f'save at {path}')
        self.save_checkpoint(path)
        self.save_meta(path)
Example #30
    def transform_to_result(self, predict_df):
        result_df = load_samsung(path_join(path_head, 'result_kor.csv'))
        size = len(result_df)
        predict_cols = list(predict_df.columns)
        result_cols = [str.upper(a) for a in 'abcdefghijklmnopqrstuvwxyz']
        result_cols = result_cols[:len(predict_cols)]

        # map the result sheet's letter columns (A, B, ...) onto predict_df's columns
        result_col_to_predict_col = dict(zip(result_cols, predict_cols))

        for i in range(size):
            a = result_df.loc[i, :]
            row = int(a['열']) - 2
            col = a['행']
            predict_col = result_col_to_predict_col[col]
            result_df.loc[i, '값'] = predict_df.loc[row, predict_col]
            # print(row, col, type(row), type(col), predict_col)

        return result_df