def main(args):
    """Train a binary CLL-vs-normal classifier on a SOM dataset.

    Reads the dataset from ``args.input``, trains for 20 epochs on a
    stratified 90/10 split of tube 1, then writes the fitted label
    binarizer and the keras model into ``args.output``.
    """
    dataset = SOMDataset.from_path(args.input)

    # Stratified 90/10 split into training and validation data.
    train_data, valid_data = dataset.split(ratio=0.9, stratified=True)

    # Two-class problem: encode labels before building the sequences.
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(["CLL", "normal"])

    classifier = create_model(dataset.dims, 1, global_decay=5e-3)
    classifier.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["acc"],
    )

    train_sequence = SOMSequence(train_data, label_binarizer, tube=1)
    valid_sequence = SOMSequence(valid_data, label_binarizer, tube=1)

    classifier.fit_generator(
        epochs=20,
        generator=train_sequence,
        validation_data=valid_sequence)

    # Persist binarizer and model so predictions can be decoded later.
    args.output.local.mkdir(parents=True, exist_ok=True)
    utils.save_joblib(label_binarizer, args.output / "binarizer.joblib")
    classifier.save(str(args.output / "model.h5"))
Exemple #2
0
def main(data: utils.URLPath,
         meta: utils.URLPath,
         output: utils.URLPath,
         epochs: int = 30):
    """Train a keras classifier on a SOM dataset and save model, config and metrics.

    Args:
        data: Path to som dataset
        meta: Path to case metadata. NOTE(review): currently unused; only
            the commented-out load_case_collection call below would use it.
        output: Output path
        epochs: Number of training epochs.
    """
    # Tube identifiers (measurement panels); one SOM per tube is fed to the model.
    tubes = ("1", "2", "3")
    # Zero padding added around each SOM map on all sides (see dims adjustment below).
    pad_width = 2

    # Collapse fine-grained diagnoses into the 8-class grouping scheme.
    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}

    # Fail fast if the filtered dataset does not contain exactly the expected groups.
    if set(groups) != dataset_groups:
        raise RuntimeError(
            f"Group mismatch: {groups}, but got {dataset_groups}")

    # Stratified 90% training / 10% validation split.
    train, validate = dataset.create_split(0.9, stratify=True)

    # Class weighting is disabled; the commented code would derive weights
    # from per-group case counts instead.
    group_weights = None
    # group_count = train.group_count
    # group_weights = classification_utils.calculate_group_weights(group_count)
    # group_weights = {
    #     i: group_weights.get(g, 1.0) for i, g in enumerate(groups)
    # }

    # train = train.balance(2000)
    # Rebalance training data to fixed per-group sample counts.
    # NOTE(review): presumably over-/undersamples each group to the given
    # count — confirm against SOMDataset.balance_per_group.
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    # Persist the exact train/validation split for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # NOTE(review): assumes URLPath supports "+" with a str, producing
    # "<data>_config.json" alongside the dataset — verify.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # always (true, pred)
    # Misclassification cost table keyed by (true, predicted) group; values
    # below 1 make a confusion cheaper, values above 1 more expensive.
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 8,
        ("MBL", "normal"): 8,
        ("MCL", "normal"): 8,
        ("PL", "normal"): 8,
        ("LPL", "normal"): 8,
        ("MZL", "normal"): 8,
        ("FL", "normal"): 8,
        ("HCL", "normal"): 8,
    }
    if mapping:
        # Translate cost-table keys into the mapped (8-class) group names.
        cost_mapping = {(mapping.get(a, a), mapping.get(b, b)): v
                        for (a, b), v in cost_mapping.items()}
    # cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)
    # np.save(str(output / "cost_matrix.npy"), cost_matrix)
    # Cost-sensitive loss is disabled; plain categorical crossentropy is used below.
    cost_matrix = None

    # Record the run configuration so predictions can be reproduced later.
    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": "cost_matrix.npy" if cost_matrix is not None else None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow each tube's SOM dimensions by the zero padding applied on both sides.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width,
                                        z)

    binarizer, model = get_model(selected_tubes,
                                 groups=groups,
                                 global_decay=5e-5)

    if cost_matrix is not None:
        loss = classification_utils.WeightedCategoricalCrossentropy(
            cost_matrix)
    else:
        loss = "categorical_crossentropy"

    model.compile(
        loss=loss,
        # loss="categorical_crossentropy",
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            keras.metrics.CategoricalAccuracy(),
        ])
    # Write the model summary into a text file instead of stdout.
    with (output / "model_summary.txt").open("w") as summary_file:

        def print_file(*args, **kwargs):
            # Redirect keras' summary printing into the open file handle.
            print(*args, **kwargs, file=summary_file)

        model.summary(print_fn=print_file)

    keras.utils.plot_model(model, to_file=str(output / "model_plot.png"))

    def getter_fun(sample, tube):
        # Extract the SOM array for a single tube from a dataset sample.
        return sample.get_tube(tube)

    trainseq = SOMSequence(train,
                           binarizer,
                           tube=tubes,
                           get_array_fun=getter_fun,
                           batch_size=32,
                           pad_width=pad_width)
    validseq = SOMSequence(validate,
                           binarizer,
                           tube=tubes,
                           get_array_fun=getter_fun,
                           batch_size=128,
                           pad_width=pad_width)

    # tensorboard_dir = str(output / "tensorboard")
    # tensorboard_callback = keras.callbacks.TensorBoard(
    #     log_dir=str(tensorboard_dir),
    #     histogram_freq=5,
    #     write_grads=True,
    #     write_images=True,
    # )
    # Abort training if the loss becomes NaN.
    nan_callback = keras.callbacks.TerminateOnNaN()

    history = model.fit_generator(
        epochs=epochs,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Collect per-sample predictions on the validation sequence and decode
    # them back into group labels.
    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    # Metrics on the raw (unmapped) groups first.
    generate_all_metrics(true_labels, pred_labels, {
        "groups": groups,
        "map": {}
    }, output / "unmapped")
    # NOTE(review): the loop variable shadows the outer `mapping`; harmless
    # here since `mapping` is not read afterwards, but worth renaming.
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        # NOTE(review): the header below is printed even for mappings that
        # are skipped by the length check.
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)

    plot_training_history(history, output / "training.png")
Exemple #3
0
def main(data: utils.URLPath, output: utils.URLPath, model_name: str, modelargs: json.loads, epochs: int = 30):
    """Train a non-neural (sklearn-style) classifier on a SOM dataset.

    Args:
        data: Path to som dataset
        output: Output path
        model_name: Name of the model to build, forwarded to get_model.
        modelargs: Extra keyword arguments for the model (parsed from JSON).
        epochs: Unused in this variant (the sklearn-style fit has no epochs);
            kept for interface compatibility with the keras variants.
    """
    # Tube identifiers (measurement panels); one SOM per tube.
    tubes = ("1", "2", "3")
    # No zero padding in this variant.
    pad_width = 0

    # Collapse fine-grained diagnoses into the 8-class grouping scheme.
    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}

    # Fail fast if the filtered dataset does not contain exactly the expected groups.
    if set(groups) != dataset_groups:
        raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    # Stratified 90% training / 10% validation split.
    train, validate = dataset.create_split(0.9, stratify=True)

    # Rebalance training data to fixed per-group sample counts.
    # NOTE(review): presumably over-/undersamples each group to the given
    # count — confirm against SOMDataset.balance_per_group.
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    # Persist the exact train/validation split for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # NOTE(review): assumes URLPath supports "+" with a str, producing
    # "<data>_config.json" alongside the dataset — verify.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # Record the run configuration so predictions can be reproduced later.
    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow each tube's SOM dimensions by the zero padding (no-op for pad_width=0).
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # FIX: forward the caller-supplied model_name instead of the previously
    # hard-coded "RandomForest", which silently ignored the parameter.
    binarizer, model = get_model(selected_tubes, groups=groups, model_name=model_name, **modelargs)

    def getter_fun(sample, tube):
        # Extract the SOM array for a single tube from a dataset sample.
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train,
        binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate,
        binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    # Flatten the batched sequences into plain arrays for the sklearn-style fit.
    xdata, ydata = sequence_to_array(trainseq)

    model.fit(xdata, ydata)

    xtest, ytest = sequence_to_array(validseq)
    pred_arr = model.predict(xtest)

    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Decode one-hot predictions back into group labels.
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    # Metrics on the raw (unmapped) groups, then per mapping scheme.
    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}}, output / "unmapped")
    for map_name, group_map in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(group_map["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, group_map, output_path)
Exemple #4
0
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Train a keras classifier on tubes 2-4 with balanced class weights.

    Args:
        data: Path to som dataset
        meta: Path to case metadata. NOTE(review): currently unused; only
            the commented-out load_case_collection call below would use it.
        output: Output path
    """
    # Tube identifiers (measurement panels); note this variant uses 2-4.
    tubes = ("2", "3", "4")
    # Zero padding added around each SOM map on all sides.
    pad_width = 1

    # Collapse fine-grained diagnoses into the 8-class grouping scheme.
    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    # NOTE(review): LPL and MZL are excluded from the dataset here, but
    # `groups` still contains them — the model keeps outputs for classes
    # that never occur in training. Confirm this is intended.
    dataset = dataset.filter(groups=[g for g in groups if g not in ("LPL", "MZL")])

    dataset_groups = {d.group for d in dataset}

    # if set(groups) != dataset_groups:
    #     raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    # NOTE(review): unpack order is (validate, train) — reversed relative to
    # the 0.9-ratio split used elsewhere. Presumably the integer argument
    # means 10 validation cases per group — confirm against create_split.
    validate, train = dataset.create_split(10, stratify=True)

    # Inverse-frequency class weights keyed by class index (as keras expects),
    # normalized so the smallest weight is 1.0. Groups absent from the
    # training data fall back to a pre-normalization weight of 1.0.
    group_count = train.group_count
    num_cases = sum(group_count.values())
    balanced_nums = num_cases / len(dataset_groups)
    balanced_loss_weights = [balanced_nums / group_count.get(g, balanced_nums) for g in groups]
    min_ratio = min(balanced_loss_weights)
    balanced_loss_weights = {i: v / min_ratio for i, v in enumerate(balanced_loss_weights)}
    print(balanced_loss_weights)

    # train = train.balance(2000)
    # train = train.balance_per_group({
    #     "CM": 6000,
    #     # "CLL": 4000,
    #     # "MBL": 2000,
    #     "MCL": 1000,
    #     "PL": 1000,
    #     "LPL": 1000,
    #     "MZL": 1000,
    #     "FL": 1000,
    #     "HCL": 1000,
    #     "normal": 6000,
    # })

    # Persist the exact train/validation split for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # NOTE(review): assumes URLPath supports "+" with a str, producing
    # "<data>_config.json" alongside the dataset — verify.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # Record the run configuration so predictions can be reproduced later.
    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow each tube's SOM dimensions by the zero padding on both sides.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-7)

    def getter_fun(sample, tube):
        # Extract the SOM array for a single tube from a dataset sample.
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    # TensorBoard logging (enabled in this variant) plus NaN-loss abort.
    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=15, shuffle=True,
        callbacks=[tensorboard_callback, nan_callback],
        class_weight=balanced_loss_weights,
        generator=trainseq, validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Collect per-sample predictions on the validation sequence and decode
    # them back into group labels.
    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    # Print confusion matrix and balanced accuracy on the validation data.
    confusion = metrics.confusion_matrix(true_labels, pred_labels, labels=groups)
    print(groups)
    print(confusion)
    balanced = metrics.balanced_accuracy_score(true_labels, pred_labels)
    print(balanced)
Exemple #5
0
 def get_validation_data(
         self, dataset: som_dataset.SOMDataset) -> som_dataset.SOMDataset:
     """Return the subset of *dataset* matching the stored validation ids."""
     validation_ids = self.data_ids["validation"]
     return dataset.filter(labels=validation_ids)