Example #1
def get_dataset():
    train, test = datasets.get_mnist()
    train = datasets.TransformDataset(train, lambda x: _transform(x, True))
    test = datasets.TransformDataset(test, _transform)
    return {
        'train': train,
        'test': test,
    }
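Example #1 assumes a module-level _transform helper that is not shown. A minimal sketch of what such a helper might look like, together with how the returned dict could feed iterators, is below; the noise augmentation and the helper itself are assumptions, not part of the original snippet.

import numpy as np
from chainer import datasets, iterators

def _transform(in_data, train=False):
    # Hypothetical transform: cast to float32 and add light noise when training.
    img, label = in_data
    if train:
        img = img + np.random.normal(0, 0.01, img.shape).astype(np.float32)
    return img.astype(np.float32), label

data = get_dataset()
train_iter = iterators.SerialIterator(data['train'], batch_size=128)
test_iter = iterators.SerialIterator(data['test'], batch_size=128,
                                     repeat=False, shuffle=False)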
Example #2
def load_images(IMG_DIR):
    # Collect one sub-directory per class and gather every jpg under it.
    dir_names = glob.glob('{}/*'.format(IMG_DIR))
    file_names = [glob.glob('{}/*.jpg'.format(dir_name))
                  for dir_name in dir_names]
    file_names = list(chain.from_iterable(file_names))
    # Use each file's parent directory name as its class, mapped to an index.
    labels = [os.path.basename(os.path.dirname(file_name))
              for file_name in file_names]
    dir_names = [os.path.basename(dir_name) for dir_name in dir_names]
    labels = [dir_names.index(label) for label in labels]

    d = datasets.LabeledImageDataset(list(zip(file_names, labels)))

    def resize(img):
        # PIL expects HWC images while chainer uses CHW; convert, resize, convert back.
        width, height = 224, 224
        img = Image.fromarray(img.transpose(1, 2, 0))
        img = img.resize((width, height), Image.BICUBIC)
        return np.asarray(img).transpose(2, 0, 1)

    def transform(inputs):
        img, label = inputs
        # Drop any alpha channel, resize, and scale pixel values to [0, 1].
        img = img[:3, ...]
        img = resize(img.astype(np.uint8))
        img = img.astype(np.float32)
        img = img / 255
        return img, label

    transformed_d = datasets.TransformDataset(d, transform)
    return transformed_d
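A minimal usage sketch for Example #2; the directory layout (one sub-directory per class under ./images) and the 80/20 split are assumptions for illustration.

from chainer import datasets, iterators

dataset = load_images('./images')
train, valid = datasets.split_dataset_random(dataset, int(len(dataset) * 0.8),
                                             seed=0)
train_iter = iterators.SerialIterator(train, batch_size=32)
valid_iter = iterators.SerialIterator(valid, batch_size=32,
                                      repeat=False, shuffle=False)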
Example #3
def get_dataset():
    train, test = datasets.get_mnist()
    validation, train = datasets.split_dataset_random(train, 5000)
    train = datasets.TransformDataset(train, _transform)
    return {
        'train': train,
        'validation': validation,
        'test': test,
    }
Example #4
def main(args):
    # print learning settings.
    print('GPU: {}'.format(args.gpu))
    print('# Mini-batch size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Using {} dataset.'.format(args.dataset))
    print('')
    # Load datasets.
    trainset, testset = get_dataset(args.dataset)
    # Data transformer.
    transformer = Transformer(trainset,
                              pca=False,
                              normalize=args.normalize,
                              trans=args.augment)
    # Make transform datasets.
    trainset = D.TransformDataset(trainset, transformer.train)
    testset = D.TransformDataset(testset, transformer.test)
    # Setup dataset iterators.
    train_iter = BCIterator(trainset, args.batchsize, classes=args.classes)
    test_iter = chainer.iterators.SerialIterator(testset, args.batchsize,
                                                 False, False)
    # Set CNN model.
    model = MultiplexClassifier(get_model(args.model, args.classes),
                                lossfun=kl_divergence,
                                accfun=accuracy_mix)
    # Setup GPU
    if args.gpu >= 0:
        cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
    # Run to get model information.
    one_predict(model, train_iter, args.gpu)
    print(str_info(model))
    # Setup trainer.
    trainer = setup_trainer(args, train_iter, test_iter, model)
    # Run to get model information.
    one_predict(model, train_iter, args.gpu)
    print(str_info(model))
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)
    # Run training.
    trainer.run()
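Example #4 only shows main(); the argument parser it expects is not included. Below is a minimal sketch of one that provides the attributes main() reads; the defaults and everything beyond the attribute names are assumptions, and the other project-specific helpers (Transformer, BCIterator, and so on) are taken from the surrounding project.

import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batchsize', type=int, default=128)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--dataset', default='cifar10')
    parser.add_argument('--classes', type=int, default=10)
    parser.add_argument('--model', default='resnet')
    parser.add_argument('--normalize', action='store_true')
    parser.add_argument('--augment', action='store_true')
    parser.add_argument('--resume', default='')
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())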
Example #5
    def test_transform_dataset(self):
        td = datasets.TransformDataset(self.dataset, self.transform)
        self.assertEqual(len(td), len(self.dataset))

        for i in range(len(td)):
            example = td[i]
            if isinstance(example, tuple):
                for j, arr in enumerate(example):
                    numpy.testing.assert_array_equal(
                        arr, self.transform(self.dataset[i][j]))
            else:
                numpy.testing.assert_array_equal(
                    example, self.transform(self.dataset[i]))
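Example #5 is a test method taken from a larger test case; the fixture it relies on (self.dataset and self.transform) is not shown. A minimal sketch of such a fixture, with an illustrative dataset and transform, is below; the method in Example #5 would then be defined on this class.

import unittest

import numpy


class TransformDatasetTest(unittest.TestCase):

    def setUp(self):
        # Ten random arrays and a transform that simply shifts every element.
        self.dataset = [numpy.random.rand(3, 4).astype(numpy.float32)
                        for _ in range(10)]
        self.transform = lambda x: x + 1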
Example #6
    def on_status(self, status):
        # Convert the timestamp from UTC to JST (+9 hours).
        status.created_at += datetime.timedelta(hours=9)

        # If the tweet is a reply addressed to the bot
        if str(status.in_reply_to_screen_name) == bot_user_name:

            # Begin the reply text by mentioning the sender
            tweet_text = "@" + str(status.user.screen_name) + " "

            # Fetch the mentions timeline
            time_line = api.mentions_timeline()

            # Content of the newest message on the timeline
            print("リプライが届きました...\n[@" + status.user.screen_name + "]\n" +
                  time_line[0].text + "\n")

            # Prefix for saved file names (today's date)
            date_name = re.split(' ', str(datetime.datetime.today()))[0] + '_'

            # 1. Save the reply images -> 2. Crop the face and save it as
            # cat.jpg -> 3. Classify with chainer

            # 1. Save the reply images
            try:
                j = 0
                reply_images = []
                for img in time_line[0].extended_entities['media']:
                    # print(img['media_url'])
                    reply_image = urllib.request.urlopen(img['media_url'])
                    # Build the file name and add it to the list
                    image_name = date_name + str(
                        time_line[0].id) + '-' + str(j) + '.jpg'
                    reply_images.append(image_name)
                    # Read the image data and save it to disk
                    with open(image_name, 'wb') as image_file:
                        image_file.write(reply_image.read())
                    reply_image.close()
                    print('画像 ' + image_name + ' を保存しました')
                    j = j + 1
            except Exception:
                # Exception handling
                if j == 0:
                    tweet_text += "Error:画像がありませんฅ(´・ω・`)ฅにゃーん"
                else:
                    tweet_text += "Error:画像の保存に失敗しましたฅ(´・ω・`)ฅにゃーん"
                api.update_status(status=tweet_text,
                                  in_reply_to_status_id=status.id)
                print(tweet_text)
                return True

            # 2. Crop the face and save it as cat.jpg
            try:
                image = cv2.imread(reply_images[0])
                image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                cascade = cv2.CascadeClassifier("cat_cascade.xml")
                face_images = cascade.detectMultiScale(image_gray,
                                                       scaleFactor=1.1,
                                                       minNeighbors=1,
                                                       minSize=(1, 1))
                face_image_len = 0
                if len(face_images) > 0:
                    for (x, y, w, h) in face_images:
                        face_image = image[y:y + h, x:x + w]
                        if face_image_len < w:
                            face_image_len = w
                            cv2.imwrite("cat_face.jpg", face_image)
                            face_image = cv2.resize(face_image, (64, 64))
                            cv2.imwrite("cat_face_min.jpg", face_image)
                else:
                    tweet_text += "Error:猫の顔が検出できませんでした...ฅ(´・ω・`)ฅにゃーん"
                    api.update_status(status=tweet_text,
                                      in_reply_to_status_id=status.id)
                    print(tweet_text)
                    return True
            except Exception:
                tweet_text += "Error:猫の顔の検出に失敗しました...ฅ(´・ω・`)ฅにゃーん"
                api.update_status(status=tweet_text,
                                  in_reply_to_status_id=status.id)
                print(tweet_text)
                return True

            # 3. Classify with chainer
            try:
                data = [('cat_face_min.jpg', 3), ('cat_face_min.jpg', 3)]
                d = datasets.LabeledImageDataset(data)

                def transform(data):
                    img, label = data
                    img = img / 255.
                    return img, label

                d = datasets.TransformDataset(d, transform)

                train, test = datasets.split_dataset(d, 1)
                x, t = test[0]
                x = x[None, ...]
                y = self.model(x)
                y = y.data

                cats = [
                    "スフィンクス", "アビシニアン", "ベンガル", "バーマン", "ボンベイ",
                    "ブリティッシュショートヘア", "エジプシャンマウ", "メインクーン", "ペルシャ", "ラグドール",
                    "ロシアンブルー", "シャム"
                ]
                cats_images = [
                    "Sphynx.jpg", "Abyssinian.jpg", "Bengal.jpg", "Birman.jpg",
                    "Bombay.jpg", "British_Shorthair.jpg", "Egyptian_Mau.jpg",
                    "Maine_Coon.jpg", "Persian.jpg", "Ragdoll.jpg",
                    "Russian_Blue.jpg", "Siamese.jpg"
                ]

                tweet_text += "この猫は... " + cats[y.argmax(
                    axis=1)[0]] + " ですฅ(´・ω・`)ฅにゃーん"

                media_images = [
                    "cat_face.jpg",
                    "./cat_images/" + cats_images[y.argmax(axis=1)[0]]
                ]
                media_ids = [
                    api.media_upload(i).media_id_string for i in media_images
                ]
                api.update_status(status=tweet_text,
                                  in_reply_to_status_id=status.id,
                                  media_ids=media_ids)
                print(tweet_text)
                return True

            except Exception:
                tweet_text += "Error:猫の顔の判定に失敗しました...ฅ(´・ω・`)ฅにゃーん"
                api.update_status(status=tweet_text,
                                  in_reply_to_status_id=status.id)
                print(tweet_text)
                return True

        return True
Example #7
def train(settings: dict, output_path: PosixPath):
    """Main."""
    gpu_num = len(settings["gpu_devices"])
    # # make dataset
    # # # read meta info.
    train_df = pd.read_csv(
        config.PROC_DATA /
        "train_add-{}fold-index.csv".format(settings["n_folds"]))

    # # # make label arr
    train_labels_arr = train_df[config.COMP_NAMES].values.astype("i")

    # # # make train set
    if settings["val_fold"] != -1:
        train_dataset = datasets.LabeledImageDataset(
            pairs=list(
                zip((train_df[train_df["fold"] != settings["val_fold"]]
                     ["image_id"] + ".png").tolist(),
                    train_labels_arr[train_df["fold"] != settings["val_fold"],
                                     ...])),
            root=config.TRAIN_IMAGES_DIR.as_posix())
    else:
        train_dataset = datasets.LabeledImageDataset(
            pairs=list(
                zip((train_df["image_id"] + ".png").tolist(),
                    train_labels_arr)),
            root=config.TRAIN_IMAGES_DIR.as_posix())

    train_dataset = datasets.TransformDataset(
        train_dataset,
        nn_training.ImageTransformer(settings["training_transforms"]))

    if gpu_num > 1:
        # # if using multi-gpu, split train set into gpu_num.
        train_sub_dataset_list = []
        total_size = len(train_dataset)
        subset_size = (total_size + gpu_num - 1) // gpu_num
        np.random.seed(1086)
        random_order = np.random.permutation(len(train_dataset))
        for i in range(gpu_num):
            start_idx = min(i * subset_size, total_size - subset_size)
            end_idx = min((i + 1) * subset_size, total_size)
            print(i, start_idx, end_idx)
            train_sub_dataset_list.append(
                datasets.SubDataset(train_dataset,
                                    start=start_idx,
                                    finish=end_idx,
                                    order=random_order))
        train_dataset = train_sub_dataset_list

        for i, subset in enumerate(train_dataset):
            print("subset{}: {}".format(i, len(subset)))

    # # # # validation set
    if settings["val_fold"] != -1:
        val_dataset = datasets.LabeledImageDataset(
            pairs=list(
                zip((train_df[train_df["fold"] == settings["val_fold"]]
                     ["image_id"] + ".png").tolist(),
                    train_labels_arr[train_df["fold"] == settings["val_fold"],
                                     ...])),
            root=config.TRAIN_IMAGES_DIR.as_posix())
    else:
        # # if training on all the train data, compute the loss on all data at the evaluation step.
        val_dataset = datasets.LabeledImageDataset(
            pairs=list(
                zip((train_df["image_id"] + ".png").tolist(),
                    train_labels_arr)),
            root=config.TRAIN_IMAGES_DIR.as_posix())

    val_dataset = datasets.TransformDataset(
        val_dataset,
        nn_training.ImageTransformer(settings["inference_transforms"]))

    print("[make dataset] train: {}, val: {}".format(len(train_dataset),
                                                     len(val_dataset)))

    # # initialize model.
    model = nn_training.ImageClassificationModel(
        extractor=getattr(
            backborn_chains,
            settings["backborn_class"])(**settings["backborn_kwargs"]),
        global_pooling=None if settings["pooling_class"] is None else getattr(
            global_pooling_chains, settings["pooling_class"])(
                **settings["pooling_kwargs"]),
        classifier=getattr(classifer_chains,
                           settings["head_class"])(**settings["head_kwargs"]))
    model.name = settings["model_name"]

    # # set training wrapper.
    train_model = nn_training.CustomClassifier(
        predictor=model,
        lossfun=getattr(
            nn_training,
            settings["loss_function"][0])(**settings["loss_function"][1]),
        evalfun_dict={
            "SCE_{}".format(i): getattr(nn_training, name)(**param)
            for i, (name, param) in enumerate(settings["eval_functions"])
        })

    settings["eval_func_names"] = [
        "SCE_{}".format(i) for i in range(len(settings["eval_functions"]))
    ]

    gc.collect()
    # # training.
    # # # create trainer.
    utils.set_random_seed(settings["seed"])
    trainer = nn_training.create_trainer(settings, output_path.as_posix(),
                                         train_model, train_dataset,
                                         val_dataset)
    trainer.run()

    # # # save the model from the last epoch.
    model = trainer.updater.get_optimizer('main').target.predictor
    serializers.save_npz(output_path / "model_snapshot_last_epoch.npz", model)

    del trainer
    del train_model
    gc.collect()

    # # inference validation data by the model of last epoch.
    _, val_iter, _ = nn_training.create_iterator(settings, None, val_dataset,
                                                 None)
    val_pred, val_label = nn_training.inference_test_data(
        model, val_iter, gpu_device=settings["gpu_devices"][0])
    np.save(output_path / "val_pred_arr_fold{}".format(settings["val_fold"]),
            val_pred)

    # # calc validation score
    score_list = [[] for i in range(2)]

    for i in range(len(config.N_CLASSES)):
        y_pred_subset = val_pred[:, config.COMP_INDEXS[i]:config.
                                 COMP_INDEXS[i + 1]].argmax(axis=1)
        y_true_subset = val_label[:, i]
        score_list[0].append(
            recall_score(y_true_subset,
                         y_pred_subset,
                         average='macro',
                         zero_division=0))
        score_list[1].append(
            recall_score(y_true_subset,
                         y_pred_subset,
                         average='macro',
                         zero_division=1))
    score_list[0].append(np.average(score_list[0], weights=[2, 1, 1]))
    score_list[1].append(np.average(score_list[1], weights=[2, 1, 1]))

    score_df = pd.DataFrame(score_list, columns=config.COMP_NAMES + ["score"])

    print(score_df)
    score_df.to_csv(output_path / "score.csv", index=False)
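Example #7 and the inference helpers later in this section read many keys from a settings dict loaded from settings.yml. Below is a minimal sketch of that dict; only the key names are taken from the code, every value is illustrative, and the "..." placeholders stand for project-specific class names that are not shown in the original.

settings = {
    "gpu_devices": [0],
    "n_folds": 5,
    "val_fold": 0,
    "seed": 0,
    "batch_size": 64,
    "max_epoch": 60,
    "epoch_per_cycle": 20,
    "model_name": "example_model",
    "backborn_class": "...",        # attribute looked up in backborn_chains
    "backborn_kwargs": {},
    "pooling_class": None,
    "pooling_kwargs": {},
    "head_class": "...",            # attribute looked up in classifer_chains
    "head_kwargs": {},
    "training_transforms": [],
    "inference_transforms": [],
    "loss_function": ["...", {}],   # (name in nn_training, kwargs)
    "eval_functions": [["...", {}]],
}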
Example #8
    def test_transform_dataset_overrun(self):
        td = datasets.TransformDataset(self.dataset, self.transform)
        with self.assertRaises(IndexError):
            td[len(td) + 1]
Example #9
    def permutate_mnist_aux(dataset):
        def transform(in_data):
            img, label = in_data
            return (img[indices], label)

        return datasets.TransformDataset(dataset, transform)
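A minimal sketch of the context the nested helper above assumes: a fixed pixel permutation shared by the train and test splits. The enclosing function, the seed, and the pixel count (784, the flattened image size that datasets.get_mnist() returns by default) are assumptions.

import numpy as np
from chainer import datasets

def make_permuted_mnist(seed=0):
    indices = np.random.RandomState(seed).permutation(784)

    def permutate_mnist_aux(dataset):
        def transform(in_data):
            img, label = in_data
            return (img[indices], label)

        return datasets.TransformDataset(dataset, transform)

    train, test = datasets.get_mnist()
    return permutate_mnist_aux(train), permutate_mnist_aux(test)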
def inference(trained_path: PosixPath,
              output_path: PosixPath,
              epoch_of_model: int = -1,
              gpu_device: int = -1,
              batch_size: int = 64,
              inference_valid: bool = False):
    """Inference function for kernel."""
    # # read settings from training outputs directory.
    with open((trained_path / "settings.yml").as_posix(), "r") as fr:
        settings = yaml.safe_load(fr)

    # # make dataset
    # # # read meta info.
    with utils.timer("make val dataset"):
        val_dataset = test_dataset = None
        if inference_valid:
            train_df = pd.read_csv(
                config.PROC_DATA /
                "train_add-{}fold-index.csv".format(settings["n_folds"]))
            # # # # make label arr
            train_labels_arr = train_df[config.COMP_NAMES].values.astype("i")

            # # # # make chainer dataset
            val_dataset = datasets.LabeledImageDataset(
                pairs=list(
                    zip((train_df[train_df["fold"] == settings["val_fold"]]
                         ["image_id"] + ".png").tolist(), train_labels_arr[
                             train_df["fold"] == settings["val_fold"], ...])),
                root=config.TRAIN_IMAGES_DIR.as_posix())
            # # # # set transform
            val_dataset = datasets.TransformDataset(
                val_dataset,
                nn_training.ImageTransformer(settings["inference_transforms"]))

    # # # test set
    with utils.timer("make test dataset"):
        test_df = pd.read_csv(config.PROC_DATA / "test_reshaped.csv")
        sample_sub = pd.read_csv(config.RAW_DATA / "sample_submission.csv")

        # # # # make chainer dataset
        test_dataset = datasets.LabeledImageDataset(
            pairs=list(
                zip((test_df["image_id"] + ".png").tolist(),
                    ([-1] * len(test_df)))),
            root=config.TEST_IMAGES_DIR.as_posix())
        # # # # set transform
        test_dataset = datasets.TransformDataset(
            test_dataset,
            nn_training.ImageTransformer(settings["inference_transforms"]))

    with utils.timer("init and load model"):
        # # initialize model.
        settings["backborn_kwargs"]["pretrained_model_path"] = None
        model = nn_training.ImageClassificationModel(
            extractor=getattr(
                backborn_chains,
                settings["backborn_class"])(**settings["backborn_kwargs"]),
            global_pooling=None if settings["pooling_class"] is None else
            getattr(global_pooling_chains, settings["pooling_class"])(
                **settings["pooling_kwargs"]),
            classifier=getattr(
                classifer_chains,
                settings["head_class"])(**settings["head_kwargs"]))
        # # load model.
        model_path = trained_path / "model_snapshot_{}.npz".format(
            epoch_of_model)
        print(model_path)
        if epoch_of_model == -1 or not os.path.isfile(model_path):
            model_path = trained_path / "model_snapshot_last_epoch.npz"

        print("use model: {}".format(model_path))

        serializers.load_npz(model_path, model)
        if gpu_device != -1:
            model.to_gpu(gpu_device)
        gc.collect()

    settings["batch_size"] = batch_size
    _, val_iter, test_iter = nn_training.create_iterator(
        settings, None, val_dataset, test_dataset)

    if inference_valid:
        with utils.timer("inference validation set"):
            val_pred, val_label = nn_training.inference_test_data(
                model, val_iter, gpu_device=gpu_device)
            np.save(
                output_path /
                "val_pred_arr_fold{}".format(settings["val_fold"]), val_pred)
            # # calc score
            score_list = [[] for i in range(2)]

            for i in range(len(config.N_CLASSES)):
                y_pred_subset = val_pred[:, config.COMP_INDEXS[i]:config.
                                         COMP_INDEXS[i + 1]].argmax(axis=1)
                y_true_subset = val_label[:, i]
                score_list[0].append(
                    recall_score(y_true_subset,
                                 y_pred_subset,
                                 average='macro',
                                 zero_division=0))
                score_list[1].append(
                    recall_score(y_true_subset,
                                 y_pred_subset,
                                 average='macro',
                                 zero_division=1))

            del val_dataset
            del val_iter
            del val_pred
            del val_label
            del y_pred_subset
            del y_true_subset

            gc.collect()
            score_list[0].append(np.average(score_list[0], weights=[2, 1, 1]))
            score_list[1].append(np.average(score_list[1], weights=[2, 1, 1]))

            score_df = pd.DataFrame(score_list,
                                    columns=config.COMP_NAMES + ["score"])

            print("[score for validation set]")
            print(score_df)
            score_df.to_csv(output_path / "score.csv", index=False)

    with utils.timer("inference test set"):
        test_pred, test_label = nn_training.inference_test_data(
            model, test_iter, gpu_device=gpu_device)
        del test_label

        np.save(
            output_path / "test_pred_arr_fold{}".format(settings["val_fold"]),
            test_pred)

    with utils.timer("make submission"):
        # # # arg max for each component.
        for i, c_name in enumerate(config.COMP_NAMES):
            test_pred_subset = test_pred[:, config.COMP_INDEXS[i]:config.
                                         COMP_INDEXS[i + 1]].argmax(axis=1)
            test_df[c_name] = test_pred_subset

        del test_pred
        gc.collect()

        # # # reshape test_df to submission format.
        melt_df = pd.melt(test_df,
                          id_vars="image_id",
                          value_vars=config.COMP_NAMES,
                          value_name="target")
        melt_df["row_id"] = melt_df["image_id"] + "_" + melt_df["variable"]

        submission_df = pd.merge(sample_sub[["row_id"]],
                                 melt_df[["row_id", "target"]],
                                 on="row_id",
                                 how="left")

        submission_df.to_csv(output_path / "submission.csv", index=False)
def inference_by_snapshot_ensemble(trained_path: PosixPath,
                                   output_path: PosixPath,
                                   gpu_device: int = -1,
                                   batch_size: int = 64):
    """Inference function for kernel."""
    # # read settings from training outputs directory.
    with open((trained_path / "settings.yml").as_posix(), "r") as fr:
        settings = yaml.safe_load(fr)

    # # make dataset
    # # # test set
    with utils.timer("make test dataset"):
        test_df = pd.read_csv(config.PROC_DATA / "test_reshaped.csv")
        sample_sub = pd.read_csv(config.RAW_DATA / "sample_submission.csv")

        # # # # make chainer dataset
        test_dataset = datasets.LabeledImageDataset(
            pairs=list(
                zip((test_df["image_id"] + ".png").tolist(),
                    ([-1] * len(test_df)))),
            root=config.TEST_IMAGES_DIR.as_posix())
        # # # # set transform
        test_dataset = datasets.TransformDataset(
            test_dataset,
            nn_training.ImageTransformer(settings["inference_transforms"]))

    # # # prepare model paths
    model_path_list = []
    model_weight = []
    for epoch_of_model in range(settings["epoch_per_cycle"],
                                settings["max_epoch"] + 1,
                                settings["epoch_per_cycle"]):
        model_path = trained_path / "model_snapshot_{}.npz".format(
            epoch_of_model)
        if os.path.isfile(model_path):
            model_path_list.append(model_path)
            model_weight.append(1)

    if len(model_path_list) == 0:
        model_path_list.append(trained_path / "model_snapshot_last_epoch.npz")
        model_weight.append(1)
    print("[using models]")
    print(model_path_list)

    # # # prepare preds numpy.ndarray of shape: (n_model, n_test, n_class)
    test_preds_arr = np.zeros(
        (len(model_path_list), len(test_df), sum(config.N_CLASSES)), dtype="f")

    # # inference
    with utils.timer("inference test set"):
        for idx, model_path in enumerate(model_path_list):
            # # # create iterator.
            test_iter = nn_training.create_iterator(settings, None, None,
                                                    test_dataset)[-1]
            # # # init and load model
            model = init_model(settings)
            serializers.load_npz(model_path, model)
            # # # move model to gpu
            model.to_gpu(gpu_device)
            # # # inference
            test_preds_arr[idx] = nn_training.inference_test_data(
                model, test_iter, gpu_device=gpu_device)[0]
            del test_iter
            del model
            gc.collect()
        del test_dataset

    np.save(
        output_path / "test_all_preds_arr_fold{}".format(settings["val_fold"]),
        test_preds_arr)

    # # ensemble (weighted averaging)
    with utils.timer("snapshot ensemble"):
        # # # convert logits to probs
        for i in range(len(config.N_CLASSES)):
            test_preds_arr[..., config.COMP_INDEXS[i]:config.COMP_INDEXS[i + 1]] =\
                functions.softmax(test_preds_arr[..., config.COMP_INDEXS[i]:config.COMP_INDEXS[i + 1]]).data

        test_pred = np.average(test_preds_arr, axis=0, weights=model_weight)
        np.save(
            output_path / "test_pred_arr_fold{}".format(settings["val_fold"]),
            test_pred)

    with utils.timer("make submission"):
        # # convert prob to pred id
        for i, c_name in enumerate(config.COMP_NAMES):
            test_pred_subset = test_pred[:, config.COMP_INDEXS[i]:config.
                                         COMP_INDEXS[i + 1]].argmax(axis=1)
            test_df[c_name] = test_pred_subset

        del test_pred_subset
        del test_pred
        gc.collect()

        # # # reshape test_df to submission format.
        melt_df = pd.melt(test_df,
                          id_vars="image_id",
                          value_vars=config.COMP_NAMES,
                          value_name="target")
        melt_df["row_id"] = melt_df["image_id"] + "_" + melt_df["variable"]

        submission_df = pd.merge(sample_sub[["row_id"]],
                                 melt_df[["row_id", "target"]],
                                 on="row_id",
                                 how="left")

        submission_df.to_csv(output_path / "submission.csv", index=False)
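inference_by_snapshot_ensemble calls an init_model helper that is not shown in this section. Below is a minimal sketch of it, mirroring the model construction used in the inference function above; treat it as an assumption about the original helper.

def init_model(settings: dict):
    # Build the model without loading pretrained backbone weights, since the
    # snapshot weights are loaded afterwards with serializers.load_npz.
    settings["backborn_kwargs"]["pretrained_model_path"] = None
    return nn_training.ImageClassificationModel(
        extractor=getattr(
            backborn_chains,
            settings["backborn_class"])(**settings["backborn_kwargs"]),
        global_pooling=None if settings["pooling_class"] is None else getattr(
            global_pooling_chains, settings["pooling_class"])(
                **settings["pooling_kwargs"]),
        classifier=getattr(classifer_chains,
                           settings["head_class"])(**settings["head_kwargs"]))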