Example #1
def full_predict(df, model_class, model_params, general_params):
    # Load the vectorizer that was fitted during training.
    with open(os.path.join(general_params["logdir"], "vectorizer.pickle"),
              "rb") as input_file:
        vectorizer = pickle.load(input_file)

    df = make_df(df, vectorizer)
    ds = GeneralDataset(
        df["tokens"].values,
        labels=None,
        max_sentence_len=general_params["max_sentence_len"],
    )
    dl = DataLoader(
        dataset=ds,
        batch_size=general_params["batch_size"],
        shuffle=False,
        num_workers=general_params["num_workers"],
    )

    # Rebuild the model with the vocabulary size of the fitted vectorizer.
    model_params = copy.deepcopy(model_params)
    model_params.update({"vocab_size": len(vectorizer.vocabulary_)})
    model = model_class(**model_params).float()

    runner = SupervisedRunner(model=model)
    runner_out = runner.predict_loader(
        loader=dl,
        resume=os.path.join(general_params["logdir"], "checkpoints",
                            general_params["checkpoint_name"]),
    )

    # Collect per-sample predictions from every batch.
    y_pred = []

    for pred in runner_out:
        pred = pred[runner.output_key].cpu().numpy()

        for p in pred:
            y_pred.append(np.array(p))

    return np.array(y_pred)
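A hedged usage sketch of full_predict (everything below except full_predict itself is a hypothetical placeholder: the model class, its parameters, and the input CSV are assumptions, not part of the original example):

# Hypothetical usage of full_predict; MyTextModel and the parameter dicts
# are placeholders for whatever the surrounding project defines.
import pandas as pd

general_params = {
    "logdir": "./logdir",            # holds vectorizer.pickle and checkpoints/
    "checkpoint_name": "best.pth",
    "max_sentence_len": 128,
    "batch_size": 64,
    "num_workers": 2,
}
model_params = {"embedding_dim": 128, "hidden_dim": 256}

df_new = pd.read_csv("new_texts.csv")  # unseen data to score
y_pred = full_predict(df_new, MyTextModel, model_params, general_params)
print(y_pred.shape)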
Example #2
def main(train, test, features, target):
    # get args
    args = parse_arguments()
    params = yaml_to_json(args.yaml_path)

    # hyper param
    num_folds = params.fold
    seed = params.seed
    base_path = params.base_path
    target_cols = params.target
    features_cols = params.features
    preprocessed_data_path = params.preprocessed_data
    batch_size = params.batch_size
    num_epochs = params.epochs
    # e.g. '/hoge/logs'
    base_logdir = params.base_logdir

    # fix seed
    set_global_seed(seed)
    device = get_device()

    # set up logdir
    now = datetime.now()
    base_logdir = os.path.join(base_logdir, now.strftime("%Y%m%d%H%M%S"))
    os.makedirs(base_logdir, exist_ok=True)
    # dump yaml contents
    with open(os.path.join(base_logdir, 'params.json'), mode="w") as f:
        json.dump(params, f, indent=4)
    # dump this script
    my_file_path = os.path.abspath(__file__)
    shutil.copy(my_file_path, base_logdir)

    # load dataset
    if preprocessed_data_path == '':
        train, test, sample_submission = read_data(base_path)  # noqa
        # TODO: You should implement these function!!
        train, test = preprocess(train, test)  # noqa
        train, test = build_feature(train, test)  # noqa
    else:
        train = pd.read_csv(preprocessed_data_path + 'train.csv')
        test = pd.read_csv(preprocessed_data_path + 'test.csv')
        sample_submission = pd.read_csv(preprocessed_data_path +
                                        'sample_submission.csv')

    # execute CV
    # TODO: set your CV method
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    ids = kf.split(train)
    fold_scores = []
    test_preds = []
    for fold, (train_idx, valid_idx) in enumerate(ids):
        print('Fold {}'.format(fold + 1))

        logdir = os.path.join(base_logdir, 'fold_{}'.format(fold + 1))
        os.makedirs(logdir, exist_ok=True)

        # data
        X_train = train[features_cols]
        # TODO: should the target variable be normalized?
        Y_train = train[target_cols]
        X_test = test[features_cols]

        # create dataloaders
        train_dls, test_dl = create_data_loader(
            X_train.iloc[train_idx].to_numpy(),
            Y_train.iloc[train_idx].to_numpy(),
            X_train.iloc[valid_idx].to_numpy(),
            Y_train.iloc[valid_idx].to_numpy(),
            X_test.to_numpy(),
            batch_size=batch_size)

        # init models
        # TODO: set your model and learning condition
        # For more flexibility, wrap this in a factory function and select the model by keyword.
        model = SampleNN(input_dim=1000, out_dim=1)
        criterion = nn.BCELoss()
        optimizer = torch.optim.AdamW(model.parameters())
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

        # init catalyst runner
        runner = SupervisedRunner(device=device)
        # model training
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=train_dls,
            logdir=logdir,
            num_epochs=num_epochs,
            callbacks=[EarlyStoppingCallback(patience=15, min_delta=0)],
            verbose=False)

        # calculate valid score
        best_model_path = os.path.join(logdir, 'checkpoints', 'best.pth')
        val_preds = runner.predict_loader(model,
                                          train_dls['valid'],
                                          resume=best_model_path,
                                          verbose=False)
        val_truth = Y_train.iloc[valid_idx].values
        # TODO: set your score function
        cv_score = mean_spearmanr_correlation_score(val_truth, val_preds)
        print('Fold {} CV score : {}'.format(fold + 1, cv_score))
        fold_scores.append(cv_score)

        # test prediction (fold predictions are averaged after the loop)
        test_pred = runner.predict_loader(
            model, test_dl, resume=best_model_path, verbose=False)
        test_preds.append(test_pred)

    # submit
    # TODO: set your submit process
    sample_submission[target_cols] = np.mean(test_preds, axis=0)
    sample_submission.to_csv('submission.csv', index=False)
    return True
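mean_spearmanr_correlation_score is not defined in the snippet above; a minimal sketch, assuming it averages the column-wise Spearman rank correlation over all target columns of a multi-target regression:

# Minimal sketch of a column-wise mean Spearman correlation score.
# Assumes y_true and y_pred are 2-D arrays of shape (n_samples, n_targets).
import numpy as np
from scipy.stats import spearmanr


def mean_spearmanr_correlation_score(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    scores = [
        spearmanr(y_true[:, col], y_pred[:, col]).correlation
        for col in range(y_true.shape[1])
    ]
    return float(np.nanmean(scores))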
Example #3
                         shuffle=False,
                         drop_last=True,
                         num_workers=0)

    test_truth = []
    for i in test_dl:
        test_truth.append(i[1].cpu().numpy().tolist())

    test_truth = [item for sublist in test_truth for item in sublist]

    predictions = np.vstack(
        list(
            map(
                lambda x: x["logits"].cpu().numpy(),
                runner.predict_loader(
                    model=model,
                    loader=test_dl,
                    resume=f"{logdir}/model/nonecrop/chickpea_lentils.pth"))))

    probabilities = []
    pred_labels = []
    true_labels = []
    pred_classes = []
    true_classes = []
    for i, (truth, logits) in enumerate(zip(test_truth, predictions)):
        probability = torch.softmax(torch.from_numpy(logits), dim=0)
        pred_label = probability.argmax().item()
        probabilities.append(probability.cpu().numpy())
        pred_labels.append(pred_label)
        true_labels.append(truth)
        pred_classes.append(class_names[pred_label])
        # true_classes.append(class_names[truth])
Example #4
X, y = torch.rand(num_samples, num_features), torch.rand(num_samples)
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=32, num_workers=1)
loaders = {"train": loader, "valid": loader}

# model, criterion, optimizer, scheduler
model = torch.nn.Linear(num_features, 1)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [3, 6])

runner = SupervisedRunner()
# model training
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    logdir="./logdir",
    num_epochs=8,
    verbose=True,
    check=True,
    load_best_on_end=True,
)
# model inference
for prediction in runner.predict_loader(loader=loader):
    assert prediction["logits"].cpu().detach().numpy().shape == (32, 1)
# model tracing
traced_model = runner.trace(loader=loader)
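runner.trace returns a TorchScript module; a short follow-up sketch showing how it could be saved and reloaded (the file name is an assumption):

# Persist the traced model and reload it without the original Python class.
torch.jit.save(traced_model, "traced_model.pt")  # hypothetical file name
loaded_model = torch.jit.load("traced_model.pt")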
Example #5
TEST_IMAGES = sorted(test_image_path.glob("*.png"))

# create test dataset
test_dataset = SegmentationDataset(TEST_IMAGES, transforms=valid_transforms)

num_workers: int = 4

infer_loader = DataLoader(test_dataset,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=num_workers)

# this gets predictions for the whole loader
predictions = runner.predict_loader(
    model=model,
    loader=infer_loader,
    resume=f"{logdir}/checkpoints/best.pth",
    verbose=False,
)

print(type(predictions))
print(predictions.shape)

# In[22]:

threshold = 0.5
max_count = 5

for i, (features, logits) in enumerate(zip(test_dataset, predictions)):
    image = utils.tensor_to_ndimage(features["image"])

    mask_ = torch.from_numpy(logits[0]).sigmoid()
Example #6
def main():
    # Enable argument parsing for file paths
    args = vars(get_args())

    train_images_path = args["train_images"]
    train_masks_path = args["train_masks"]
    test_images_path = args["test_images"]
    test_masks_path = args["test_masks"]

    # Load the YAML configuration
    dir_path = os.path.dirname(os.path.realpath(__file__))
    yaml_path = os.path.join(dir_path, "config/igvc.yaml")
    with open(yaml_path, "r") as f:
        ARCH = yaml.safe_load(f)

    # Set a seed for reproducibility
    utils.set_global_seed(ARCH["train"]["seed"])
    utils.prepare_cudnn(deterministic=ARCH["train"]["cudnn"])

    # Set up U-Net with pretrained EfficientNet backbone
    model = smp.Unet(
        encoder_name=ARCH["encoder"]["name"],
        encoder_weights=ARCH["encoder"]["weight"],
        classes=ARCH["train"]["classes"],
        activation=ARCH["encoder"]["activation"],
    )

    # Get Torch loaders
    loaders = get_loaders(
        images=np.load(train_images_path),
        masks=np.load(train_masks_path),
        image_arr_path=train_images_path,
        mask_arr_path=train_masks_path,
        random_state=ARCH["train"]["random_state"],
        valid_size=ARCH["train"]["valid_size"],
        batch_size=ARCH["train"]["batch_size"],
        num_workers=ARCH["train"]["num_workers"],
    )

    # Optimize for cross entropy using Adam
    criterion = {
        "CE": CrossentropyND(),
    }

    optimizer = AdamW(
        model.parameters(),
        lr=ARCH["train"]["lr"],
        betas=(ARCH["train"]["betas_min"], ARCH["train"]["betas_max"]),
        eps=float(ARCH["train"]["eps"]),
        weight_decay=ARCH["train"]["w_decay"],
        amsgrad=ARCH["train"]["amsgrad"],
    )

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=ARCH["train"]["optim_factor"],
        patience=ARCH["train"]["optim_patience"],
    )

    device = utils.get_device()
    print("Using device: {}".format(device))
    print(f"torch: {torch.__version__}, catalyst: {catalyst.__version__}")

    runner = SupervisedRunner(device=device,
                              input_key="image",
                              input_target_key="mask")

    # Use Catalyst callbacks for metric calculations during training
    callbacks = [
        CriterionCallback(input_key="mask", prefix="loss", criterion_key="CE"),
        MulticlassDiceMetricCallback(input_key="mask"),
    ]

    # Train and print model training logs
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=ARCH["train"]["logdir"],
        num_epochs=ARCH["train"]["epochs"],
        main_metric="loss",
        minimize_metric=ARCH["train"]["minimize_metric"],
        fp16=ARCH["train"]["fp16"],
        verbose=ARCH["train"]["verbose"],
    )

    # Test model on test dataset
    test_data = SegmentationDataset(test_images_path, test_masks_path)
    infer_loader = DataLoader(
        test_data,
        batch_size=ARCH["test"]["batch_size"],
        shuffle=ARCH["test"]["shuffle"],
        num_workers=ARCH["test"]["num_workers"],
    )

    # Get model predictions on test dataset
    predictions = np.vstack(
        list(
            map(
                lambda x: x["logits"].cpu().numpy(),
                runner.predict_loader(
                    loader=infer_loader,
                    resume=f"content/full_model2/checkpoints/best.pth",
                ),
            )))

    save_result(predictions, test_data)
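save_result is not shown in this example; a minimal sketch, assuming it turns the per-pixel class logits into integer masks via argmax and writes them to disk (the output path and file format are assumptions):

# Hypothetical sketch of save_result: argmax over the class axis, save as .npy.
import numpy as np


def save_result(predictions, test_data, out_path="predicted_masks.npy"):
    # predictions: array of shape (num_images, num_classes, height, width)
    masks = predictions.argmax(axis=1).astype(np.uint8)
    np.save(out_path, masks)
    print(f"Saved {len(masks)} masks for {len(test_data)} test samples to {out_path}")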
Example #7
# create test dataset
test_dataset = SegmentationDataset(TEST_IMAGES, transforms=valid_transforms)

num_workers: int = 4

infer_loader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
)

# this gets predictions for the whole loader
predictions = np.vstack(
    list(
        map(
            lambda x: x["logits"].cpu().numpy(),
            runner.predict_loader(
                loader=infer_loader, resume=f"{logdir}/checkpoints/best.pth"
            ),
        )
    )
)

print(type(predictions))
print(predictions.shape)

threshold = 0.5
max_count = 5

for i, (features, logits) in enumerate(zip(test_dataset, predictions)):
    image = utils.tensor_to_ndimage(features["image"])

    mask_ = torch.from_numpy(logits[0]).sigmoid()
Example #8

X_train, y_train = get_Xy('train')
X_valid, y_valid = get_Xy('valid')
X_test, y_test = get_Xy('test')
train_loader = get_loader(X_train, y_train)
valid_loader = get_loader(X_valid, y_valid)
loaders = {"train": train_loader, "valid": valid_loader}

model = nn.Linear(X_train.size()[1], 4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
runner = SupervisedRunner()

runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders=loaders,
    logdir="./logdir",
    callbacks=[AccuracyCallback(num_classes=4, accuracy_args=[1])],
    num_epochs=10,
    verbose=True,
)

test_loader = get_loader(X_test, y_test)
logits = runner.predict_loader(model=model, loader=test_loader, verbose=True)

y_pred = torch.max(torch.from_numpy(logits), dim=1)[1]
print(y_pred[:10])
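get_Xy and get_loader are not defined here; a minimal sketch, assuming get_Xy returns feature/label tensors for a named split and get_loader wraps them in a DataLoader (the stand-in random data, shapes, and batch size are assumptions):

# Hypothetical data helpers matching the calls above.
import torch
from torch.utils.data import DataLoader, TensorDataset


def get_Xy(split, num_samples=256, num_features=16):
    # Stand-in random data; a real implementation would load the given split.
    X = torch.rand(num_samples, num_features)
    y = torch.randint(0, 4, (num_samples,))
    return X, y


def get_loader(X, y, batch_size=32, shuffle=False):
    return DataLoader(TensorDataset(X, y), batch_size=batch_size, shuffle=shuffle)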
Example #9
    #         1, 2]), EarlyStoppingCallback(metric='accuracy01', minimize=False, patience=10)]
    # )

    # # model inference
    test_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE,
                         shuffle=False, drop_last=True, num_workers=0)

    test_truth = []
    for i in test_dl:
        test_truth.append(i[1].cpu().numpy().tolist())

    test_truth = [item for sublist in test_truth for item in sublist]

    predictions = np.vstack(list(map(
        lambda x: x["logits"].cpu().numpy(),
        runner.predict_loader(model=model,
                              loader=test_dl, resume=f"{logdir}/model/partition/evi_full.pth")
    )))

    probabilities = []
    pred_labels = []
    true_labels = []
    pred_classes = []
    true_classes = []
    for i, (truth, logits) in enumerate(zip(test_truth, predictions)):
        probability = torch.softmax(torch.from_numpy(logits), dim=0)
        pred_label = probability.argmax().item()
        probabilities.append(probability.cpu().numpy())
        pred_labels.append(pred_label)
        true_labels.append(truth)
        pred_classes.append(class_names[pred_label])
        true_classes.append(class_names[truth])
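A short follow-up (not part of the original example) showing how the collected labels could be summarized with scikit-learn; it assumes pred_labels, true_labels, and class_names were filled in by the loop above:

# Summarize the collected predictions with standard classification metrics.
from sklearn.metrics import accuracy_score, classification_report

print("accuracy:", accuracy_score(true_labels, pred_labels))
print(classification_report(true_labels, pred_labels, target_names=class_names))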
Example #10
                         shuffle=False,
                         drop_last=True,
                         num_workers=0)

    test_truth = []
    for i in test_dl:
        test_truth.append(i[1].cpu().numpy().tolist())

    test_truth = [item for sublist in test_truth for item in sublist]

    predictions = np.vstack(
        list(
            map(
                lambda x: x["logits"].cpu().numpy(),
                runner.predict_loader(model=model,
                                      loader=test_dl,
                                      resume=f"{logdir}/model/vic_198.pth"))))

    probabilities = []
    pred_labels = []
    true_labels = []
    pred_classes = []
    true_classes = []
    for i, (truth, logits) in enumerate(zip(test_truth, predictions)):
        probability = torch.softmax(torch.from_numpy(logits), dim=0)
        pred_label = probability.argmax().item()
        probabilities.append(probability.cpu().numpy())
        pred_labels.append(pred_label)
        true_labels.append(truth)
        pred_classes.append(class_names[pred_label])
        true_classes.append(class_names[truth])
Example #11
    #         1, 2]), EarlyStoppingCallback(metric='accuracy01', minimize=False, patience=10)]
    # )

    # # model inference
    test_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE,
                         shuffle=False, drop_last=True, num_workers=0)

    test_truth = []
    for i in test_dl:
        test_truth.append(i[1].cpu().numpy().tolist())

    test_truth = [item for sublist in test_truth for item in sublist]

    predictions = np.vstack(list(map(
        lambda x: x["logits"].cpu().numpy(),
        runner.predict_loader(model=model,
                              loader=test_dl, resume=f"{logdir}/model/181920/2_evi_hue.pth")
    )))

    probabilities = []
    pred_labels = []
    true_labels = []
    pred_classes = []
    true_classes = []
    for i, (truth, logits) in enumerate(zip(test_truth, predictions)):
        probability = torch.softmax(torch.from_numpy(logits), dim=0)
        pred_label = probability.argmax().item()
        probabilities.append(probability.cpu().numpy())
        pred_labels.append(pred_label)
        true_labels.append(truth)
        pred_classes.append(class_names[pred_label])
        true_classes.append(class_names[truth])
Example #12
                         shuffle=False,
                         drop_last=True,
                         num_workers=0)

    test_truth = []
    for i in test_dl:
        test_truth.append(i[1].cpu().numpy().tolist())

    test_truth = [item for sublist in test_truth for item in sublist]

    predictions = np.vstack(
        list(
            map(
                lambda x: x["logits"].cpu().numpy(),
                runner.predict_loader(
                    model=model,
                    loader=test_dl,
                    resume=f"{logdir}/best_full_2019_210.pth"))))

    probabilities = []
    pred_labels = []
    true_labels = []
    pred_classes = []
    true_classes = []
    for i, (truth, logits) in enumerate(zip(test_truth, predictions)):
        probability = torch.softmax(torch.from_numpy(logits), dim=0)
        pred_label = probability.argmax().item()
        probabilities.append(probability.cpu().numpy())
        pred_labels.append(pred_label)
        true_labels.append(truth)
        pred_classes.append(class_names[pred_label])
        true_classes.append(class_names[truth])
Example #13
    #         1, 2]), EarlyStoppingCallback(metric='accuracy01', minimize=False, patience=10)]
    # )

    # # model inference
    test_dl = DataLoader(train_ds, batch_size=BATCH_SIZE,
                         shuffle=False, drop_last=True, num_workers=0)

    test_truth = []
    for i in test_dl:
        test_truth.append(i[1].cpu().numpy().tolist())

    test_truth = [item for sublist in test_truth for item in sublist]

    predictions = np.vstack(list(map(
        lambda x: x["logits"].cpu().numpy(),
        runner.predict_loader(model=model,
                              loader=test_dl, resume=f"{logdir}/model/3_all.pth")
    )))

    probabilities = []
    pred_labels = []
    true_labels = []
    pred_classes = []
    true_classes = []
    for i, (truth, logits) in enumerate(zip(test_truth, predictions)):
        probability = torch.softmax(torch.from_numpy(logits), dim=0)
        pred_label = probability.argmax().item()
        probabilities.append(probability.cpu().numpy())
        pred_labels.append(pred_label)
        true_labels.append(truth)
        pred_classes.append(class_names[pred_label])
        true_classes.append(class_names[truth])
Example #14
)

# Test model on test dataset
test_data = SegmentationDataset(test_images_path, test_masks_path)
infer_loader = DataLoader(test_data,
                          batch_size=12,
                          shuffle=False,
                          num_workers=4)

# Get model predictions on test dataset
predictions = np.vstack(
    list(
        map(
            lambda x: x["logits"].cpu().numpy(),
            runner.predict_loader(
                loader=infer_loader,
                resume=f"content/full_model2/checkpoints/best.pth"),
        )))

# Pick sample images to analyze results
low = 1
high = len(predictions) - 1
num_results = 30
num_rand_results = num_results - 2
rand_nums = np.random.randint(low, high, num_rand_results)
"""
The results specifically include images 30 and 141 as
they are images containing multiple lines and barrels.
Hence, they would be strong indicators of performance.
"""
rand_nums = np.insert(rand_nums, 0, 30)