Example #1
0
def main(args):
    """Train a single-tube SOM on a small case sample and save the model.

    Args:
        args: Parsed CLI arguments with ``output``, ``name``, ``input`` and
            ``meta`` attributes.
    """
    output_dir = args.output / args.name

    dataset = flowcat.CaseCollection.from_path(args.input, metapath=args.meta)
    selected_labels = io_functions.load_json("data/selected_cases.json")
    selected, _ = dataset.filter_reasons(labels=selected_labels)
    # BUG FIX: sample from the filtered subset. Previously the filter result
    # was discarded by sampling from the full dataset, so the loaded label
    # list had no effect.
    selected = selected.sample(count=1, groups=["CLL", "normal"])
    print(selected.labels)

    joined_tubes = io_functions.load_json(
        "output/00-dataset-test/munich_bonn_tubes.json")
    print(joined_tubes)

    # TODO: Generate a SOM for all tubes for the given labels.
    # Visualize using tensorboard
    # Save everything into a single, folder which we can use in the next script
    # to create single SOMs
    model = som.CaseSingleSom(tube=1,
                              materials=flowcat.ALLOWED_MATERIALS,
                              markers=joined_tubes["1"],
                              marker_name_only=True,
                              max_epochs=10,
                              batch_size=10000,
                              marker_images=som.fcssom.MARKER_IMAGES_NAME_ONLY,
                              map_type="toroid",
                              tensorboard_dir=output_dir / "tensorboard",
                              dims=(32, 32, -1))
    model.train(selected)
    model.save(output_dir / "model")
Example #2
0
def main(args):
    """Train a SOM classifier with groups chosen according to the panel.

    Args:
        args: Parsed CLI arguments with ``input``, ``val``, ``train``,
            ``output``, ``panel`` and ``bal`` attributes.
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output
    PANEL = args.panel
    bal = args.bal

    # set the groups according to the panel
    if PANEL == "MLL":
        groups = GROUPS
    elif PANEL == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    # BUG FIX: ("1") is just the string "1" — a one-element tuple needs a
    # trailing comma. Iterating the bare string only worked by accident for
    # single-character tube names.
    tubes = ("1",)
    mapping = None

    # Balance every group to the same requested number of cases.
    balance = {key: bal for key in groups}

    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube]
                      for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
            "train_epochs": 20,
        })
    # Restrict validation and training sets to the ids listed in the files.
    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)

    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    model = fc_api.train_som_classifier(train_dataset, validate_dataset,
                                        config)

    model.save(OUTPUT)
    model.save_information(OUTPUT)
Example #3
0
    def load(cls, path: utils.URLPath):
        """Restore a saved classifier (model, binarizer, config, ids) from *path*."""
        loaded_config = io_functions.load_json(path / "config.json")
        keras_model = keras.models.load_model(str(path / "model.h5"))
        label_binarizer = io_functions.load_joblib(path / "binarizer.joblib")
        # Case ids of the original train/validation splits, for reproducibility.
        ids = {
            split: io_functions.load_json(path / f"ids_{suffix}.json")
            for split, suffix in (("validation", "validate"), ("train", "train"))
        }
        return cls(keras_model, label_binarizer, loaded_config, data_ids=ids)
Example #4
0
def main():
    """Intersect Bonn and Munich marker panels per tube and save the result."""
    bonn_config = io_functions.load_json("output/00-dataset-test/bonn_config.json")
    munich_config = io_functions.load_json("output/00-dataset-test/train_config.json")

    selected = {}
    for tube, markers in bonn_config["selected_markers"].items():
        # Compare on stemmed marker names so naming variants still match.
        munich_stems = {remove_stem(m) for m in munich_config["selected_markers"][tube]}
        selected[tube] = [
            stem
            for stem in (remove_stem(marker) for marker in markers)
            if stem in munich_stems
        ]
    print(selected)

    io_functions.save_json(selected, "output/00-dataset-test/munich_bonn_tubes.json")
def load_datasets(data_path):
    """Load every dataset directory under *data_path* into a name-keyed dict.

    Each entry maps the directory name to its case collection ("data") and
    its json configuration ("config").
    """
    datasets = {}
    for subdir in data_path.iterdir():
        if not subdir.is_dir():
            continue
        datasets[subdir.name] = {
            "data": io_functions.load_case_collection(subdir, subdir + ".json"),
            "config": io_functions.load_json(subdir + "_config.json"),
        }
    return datasets
Example #6
0
def load_flowcat_data(case_id, flowcat_path, tubes):
    """Load given flowcat data into a dict of pandas dataframes."""
    config = io_functions.load_json(flowcat_path + "_config.json")
    soms = {}
    for tube in tubes:
        channels = config[tube]["channels"]
        raw = np.load(flowcat_path / f"{case_id}_t{tube}.npy")
        # Flatten the SOM grid into rows of per-channel values.
        soms[tube] = pd.DataFrame(raw.reshape((-1, len(channels))),
                                  columns=channels)
    return soms
Example #7
0
def main(
    data: utils.URLPath,
    meta: utils.URLPath,
    output: utils.URLPath,
    reference_ids: utils.URLPath = None,
    reference: utils.URLPath = None,
    tensorboard_dir: utils.URLPath = None,
    # NOTE(review): the json.loads annotations are presumably consumed by the
    # CLI wrapper to parse these arguments from strings — confirm before changing.
    modelargs: json.loads = None,
    transargs: json.loads = None,
    mode: str = "fit_transform",
):
    """
    Train a SOM and use its weights to initialize individual SOM training.

    Args:
        data: Path to fcs data.
        meta: Path to dataset metadata, this should correctly reference fcs data.
        output: Path to output model and transformed cases.
        reference_ids: Optionally list ids to be used for reference SOM generation.
        reference: Optionally use pretrained model.
        tensorboard_dir: Optional tensorboard log directory used when
            transforming individual cases.
        modelargs: Optionally give specific options for reference SOM generation.
        transargs: Optionally give specific options for transforming individual SOMs.
        mode: Whether to fit or to transform. Default both.
    """
    dataset = io_functions.load_case_collection(data, meta)

    # No pretrained model given: train a reference SOM on the cases listed in
    # reference_ids, save it, and point `reference` at its on-disk location.
    if reference is None:
        reference_ids = io_functions.load_json(reference_ids)
        reference_dataset = dataset.filter(labels=reference_ids)
        print("Training reference SOM on", reference_dataset)
        reference = train_model(reference_dataset, modelargs=modelargs)
        reference_output = output / "reference"
        io_functions.save_casesom(reference, reference_output)
        reference = reference_output

    # Fit-only mode stops after the reference model has been produced.
    if mode == "fit":
        return

    # Default options for transforming individual cases.
    if transargs is None:
        transargs = {
            "max_epochs": 4,
            "batch_size": 50000,
            "initial_radius": 4,
            "end_radius": 1,
        }

    # Reload the saved reference model with transform-specific settings.
    model = io_functions.load_casesom(reference,
                                      tensorboard_dir=tensorboard_dir,
                                      **transargs)

    som_output = output / "som"
    transform_cases(dataset, model, som_output)
Example #8
0
def main(
    fcsdata: utils.URLPath,
    fcsmeta: utils.URLPath,
    somdata: utils.URLPath,
    output: utils.URLPath,
):
    """Compute SOM quantization errors per case and tube and save summary stats.

    Args:
        fcsdata: Path to fcs data.
        fcsmeta: Path to fcs dataset metadata.
        somdata: Path to generated SOM data (with optional _config.json).
        output: Directory receiving the raw results and summary json files.
    """
    fcs_dataset = io_functions.load_case_collection(fcsdata, fcsmeta)
    try:
        som_config = io_functions.load_json(somdata + "_config.json")
    except FileNotFoundError:
        # Older SOM outputs have no config; fall back to the dataset's markers.
        som_config = None

    if som_config is None:
        selected_markers = fcs_dataset.selected_markers
    else:
        selected_markers = {t: d["channels"] for t, d in som_config.items()}

    tubes = ("1", "2", "3")

    model = quantization_error_model()
    sess = tf.Session()
    results = []
    for fcscase in fcs_dataset:
        print(fcscase)
        for tube in tubes:
            fcssample = fcscase.get_tube(tube, kind="fcs").get_data()
            somsample = get_som_data(fcscase.id, tube, somdata,
                                     selected_markers[tube])
            error = sample_quantization_error(fcssample, somsample, model,
                                              sess)
            results.append((fcscase.id, tube, error))

    # BUG FIX: the per-tube mean and variance previously divided by
    # len(results) — the total count across ALL tubes — which understated
    # both statistics roughly threefold. Divide by each tube's own count.
    stats = {"mean": {}, "variance": {}}
    for tube in tubes:
        errors = [r[-1] for r in results if r[1] == tube]
        count = len(errors) or 1  # guard against a tube with no samples
        mean = sum(errors) / count
        stats["mean"][tube] = mean
        stats["variance"][tube] = sum(
            np.power(e - mean, 2) for e in errors) / count
    print("Mean quantization error", stats)

    io_functions.save_json(results, output / "quantization_error.json")
    io_functions.save_json(stats, output / "quantization_error_mean.json")
Example #9
0
def predict(
    data: utils.URLPath,
    model: utils.URLPath,
    output: utils.URLPath,
    labels: utils.URLPath = None,
    metrics: bool = True,
):
    """Generate predictions and plots for a single case.

    Args:
        data: SOM dataset.
        model: Path to model containing CNN and SOMs.
        output: Destination for plotting.
        labels: List of case ids to be filtered for generating predictions.
        metrics: Also compute metrics against the true labels when True.
    """
    print(f"Loaded cases from {data}")
    dataset = som_dataset.SOMDataset.from_path(data)
    # Optionally restrict prediction to the given subset of case ids.
    if labels:
        labels = io_functions.load_json(labels)
        dataset = dataset.filter(labels=labels)

    model = classifier.SOMClassifier.load(model)
    data_sequence = model.create_sequence(dataset, 128)

    values, pred_labels = model.predict_generator(data_sequence)

    # Map each case id to its per-group prediction scores.
    pred_json = {
        id: dict(zip(model.config.groups, value.tolist()))
        for id, value in zip(dataset.labels, values)
    }

    io_functions.save_json(pred_json, output / "prediction.json")

    if metrics:
        true_labels = data_sequence.true_labels
        # Evaluate once without remapping, then once for each known group map.
        map_config = [("unmapped", {
            "groups": model.config.groups,
            "map": {}
        }), *GROUP_MAPS.items()]
        for map_name, mapping in map_config:
            print(f"--- MAPPING: {map_name} ---")
            # Skip mappings with more groups than the model can predict.
            if len(mapping["groups"]) > len(model.config.groups):
                continue
            fc_predictions.generate_all_metrics(true_labels, pred_labels,
                                                mapping, output / map_name)
Example #10
0
def main(args):
    """Filter cases by ids from a json file, then train and save a SOM model."""
    out_dir = args.output

    cases = io_functions.load_case_collection(args.data, args.meta)

    wanted_labels = io_functions.load_json(args.cases)
    filtered, _ = cases.filter_reasons(labels=wanted_labels)

    # Only log to tensorboard when explicitly requested.
    tb_dir = out_dir / "tensorboard" if args.tensorboard else None

    trained = train_model(filtered,
                          markers=args.markers,
                          tensorboard=tb_dir,
                          marker_name_only=args.marker_name_only)

    io_functions.save_casesom(trained, out_dir)
Example #11
0
    def from_path(cls, path):
        """
        Loads a SOM dataset with the following organization:
        dataset/
            config.json  # contains info on used markers
            meta.json.gz*
            data/  # contains .npy SOMs
        dataset.csv*

        * either csv file with metadata (old format) or a meta.json.gz (casecollection variant, new data)
        """
        config = io_functions.load_json(path / "config.json")
        # Prefer the old csv metadata format; fall back to the newer
        # case-collection layout when no csv file exists next to the dataset.
        try:
            metadata = io_functions.load_csv(path + ".csv")
        except FileNotFoundError:
            metadata = from_case_dataset(path)
        tubes = list(config.keys())
        data_path = path / "data"
        # Build one SOM case per metadata row, loading arrays from data/.
        som_cases = metadata.apply(load_som_cases,
                                   axis=1,
                                   args=(data_path, tubes))
        return cls(data=som_cases, config=config)
Example #12
0
def train_model(
    dataset,
    markers=None,
    tensorboard=None,
    modelargs=None,
) -> sommodels.casesom.CaseSom:
    """Create and train a SOM model using the given dataset.

    Args:
        dataset: Training case collection; also supplies the selected markers
            when no marker file is given.
        markers: Optional path to a json file with per-tube marker lists.
        tensorboard: Optional tensorboard log directory.
        modelargs: Optional dict of CaseSom options; defaults below are used
            when omitted.

    Returns:
        The trained CaseSom model.
    """
    if modelargs is None:
        modelargs = {
            "marker_name_only": False,
            "max_epochs": 10,
            "batch_size": 50000,
            "initial_radius": 16,
            "end_radius": 2,
            "radius_cooling": "linear",
            # "marker_images": sommodels.fcssom.MARKER_IMAGES_NAME_ONLY,
            "map_type": "toroid",
            "dims": (32, 32, -1),
            "scaler": "MinMaxScaler",
        }

    if markers:
        selected_markers = io_functions.load_json(markers)
    else:
        selected_markers = dataset.selected_markers
        # modify marker names if marker_name_only
        # (the comprehension rebinds `markers` per tube, shadowing the
        # falsy function parameter)
        if modelargs.get("marker_name_only", False):
            selected_markers = {
                tube: [extract_name(marker) for marker in markers]
                for tube, markers in selected_markers.items()
            }

    model = sommodels.casesom.CaseSom(
        tubes=selected_markers,
        tensorboard_dir=tensorboard,
        modelargs=modelargs,
    )
    model.train(dataset)
    return model
Example #13
0
def train_model(dataset,
                markers=None,
                size=32,
                scaler="RefitStandardScaler",
                tensorboard=None,
                marker_name_only=False):
    """Create and train a SOM model using the given dataset.

    Args:
        dataset: Training case collection; also supplies the selected markers
            when no marker file is given.
        markers: Optional path to a json file with per-tube marker lists.
        size: Side length of the square SOM grid.
        scaler: Name of the scaler used to normalize input data.
        tensorboard: Optional tensorboard log directory.
        marker_name_only: Strip marker names down to the antibody name.

    Returns:
        The trained CaseSom model.
    """
    if markers:
        selected_markers = io_functions.load_json(markers)
    else:
        selected_markers = dataset.selected_markers
        # modify marker names if marker_name_only
        # (the comprehension rebinds `markers` per tube, shadowing the
        # falsy function parameter)
        if marker_name_only:
            selected_markers = {
                tube: [extract_name(marker) for marker in markers]
                for tube, markers in selected_markers.items()
            }

    # scaler = "StandardScaler"
    # scaler = "RefitStandardScaler"
    # scaler = "MinMaxScaler"

    model = sommodels.casesom.CaseSom(
        tubes=selected_markers,
        tensorboard_dir=tensorboard,
        modelargs={
            "marker_name_only": marker_name_only,
            "max_epochs": 10,
            "batch_size": 50000,
            # Start the neighborhood radius at half the grid size.
            "initial_radius": int(size / 2),
            "end_radius": 2,
            "radius_cooling": "linear",
            # "marker_images": sommodels.fcssom.MARKER_IMAGES_NAME_ONLY,
            "map_type": "toroid",
            "dims": (size, size, -1),
            "scaler": scaler,
        })
    model.train(dataset)
    return model
 def json_results(self):
     """Load the validation metrics json produced under this run's path."""
     print(self.path)
     return io_functions.load_json(self.path / "preds" / "validation_metrics.json")
Example #15
0
def main(data: utils.URLPath,
         meta: utils.URLPath,
         output: utils.URLPath,
         epochs: int = 30):
    """
    Train a CNN classifier on a SOM dataset and evaluate it on a held-out split.

    Args:
        data: Path to som dataset
        meta: Path to dataset metadata (unused in this variant; kept for CLI
            symmetry).
        output: Output path
        epochs: Number of training epochs.
    """
    tubes = ("1", "2", "3")
    pad_width = 2

    # Map fine-grained diagnoses onto the coarser 8-class scheme.
    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}

    # Fail fast when the dataset groups do not match the expected set.
    if set(groups) != dataset_groups:
        raise RuntimeError(
            f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    group_weights = None
    # group_count = train.group_count
    # group_weights = classification_utils.calculate_group_weights(group_count)
    # group_weights = {
    #     i: group_weights.get(g, 1.0) for i, g in enumerate(groups)
    # }

    # Resample the training split to fixed per-group case counts.
    # train = train.balance(2000)
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    # Record the exact split membership for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # always (true, pred)
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 8,
        ("MBL", "normal"): 8,
        ("MCL", "normal"): 8,
        ("PL", "normal"): 8,
        ("LPL", "normal"): 8,
        ("MZL", "normal"): 8,
        ("FL", "normal"): 8,
        ("HCL", "normal"): 8,
    }
    # Translate the cost pairs into the mapped group names when remapping.
    if mapping:
        cost_mapping = {(mapping.get(a, a), mapping.get(b, b)): v
                        for (a, b), v in cost_mapping.items()}
    # cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)
    # np.save(str(output / "cost_matrix.npy"), cost_matrix)
    cost_matrix = None

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": "cost_matrix.npy" if cost_matrix is not None else None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the model input dimensions to account for the padding.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width,
                                        z)

    binarizer, model = get_model(selected_tubes,
                                 groups=groups,
                                 global_decay=5e-5)

    # Use a cost-weighted loss only when a cost matrix was built above.
    if cost_matrix is not None:
        loss = classification_utils.WeightedCategoricalCrossentropy(
            cost_matrix)
    else:
        loss = "categorical_crossentropy"

    model.compile(
        loss=loss,
        # loss="categorical_crossentropy",
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            keras.metrics.CategoricalAccuracy(),
        ])
    # Write the model summary to a text file instead of stdout.
    with (output / "model_summary.txt").open("w") as summary_file:

        def print_file(*args, **kwargs):
            print(*args, **kwargs, file=summary_file)

        model.summary(print_fn=print_file)

    keras.utils.plot_model(model, to_file=str(output / "model_plot.png"))

    def getter_fun(sample, tube):
        # Fetch the SOM array for one tube of a sample.
        return sample.get_tube(tube)

    trainseq = SOMSequence(train,
                           binarizer,
                           tube=tubes,
                           get_array_fun=getter_fun,
                           batch_size=32,
                           pad_width=pad_width)
    validseq = SOMSequence(validate,
                           binarizer,
                           tube=tubes,
                           get_array_fun=getter_fun,
                           batch_size=128,
                           pad_width=pad_width)

    # tensorboard_dir = str(output / "tensorboard")
    # tensorboard_callback = keras.callbacks.TensorBoard(
    #     log_dir=str(tensorboard_dir),
    #     histogram_freq=5,
    #     write_grads=True,
    #     write_images=True,
    # )
    nan_callback = keras.callbacks.TerminateOnNaN()

    history = model.fit_generator(
        epochs=epochs,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Predict on the validation split and compute metrics per group mapping.
    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(true_labels, pred_labels, {
        "groups": groups,
        "map": {}
    }, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)

    plot_training_history(history, output / "training.png")
Example #16
0
def main(data: utils.URLPath, output: utils.URLPath, model_name: str, modelargs: json.loads, epochs: int = 30):
    """
    Train a sklearn-style classifier (RandomForest) on SOM data and evaluate it.

    Args:
        data: Path to som dataset
        output: Output path
        model_name: Requested model name (currently unused: get_model below is
            called with a hard-coded "RandomForest").
        modelargs: Extra keyword arguments forwarded to get_model.
        epochs: Unused for non-neural models; kept for CLI symmetry.
    """
    tubes = ("1", "2", "3")
    pad_width = 0

    # Map fine-grained diagnoses onto the coarser 8-class scheme.
    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}

    # Fail fast when the dataset groups do not match the expected set.
    if set(groups) != dataset_groups:
        raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    # Resample the training split to fixed per-group case counts.
    # train = train.balance(20)
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    # Record the exact split membership for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the model input dimensions to account for the padding.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # binarizer, model = get_model(selected_tubes, groups=groups, n_neighbors=1)
    binarizer, model = get_model(selected_tubes, groups=groups, model_name="RandomForest", **modelargs)

    def getter_fun(sample, tube):
        # Fetch the SOM array for one tube of a sample.
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train,
        binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate,
        binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    # Materialize the sequences into flat arrays for sklearn-style fitting.
    xdata, ydata = sequence_to_array(trainseq)

    model.fit(xdata, ydata)

    xtest, ytest = sequence_to_array(validseq)
    pred_arr = model.predict(xtest)

    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    # Compute metrics once unmapped, then once per known group mapping.
    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}}, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
Example #17
0
def main(args):
    """Fine-tune a pretrained classifier on an MCL/PL SOM dataset and save it.

    Args:
        args: Parsed CLI arguments with ``input``, ``val``, ``train``,
            ``output`` and ``model`` attributes.
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output

    groups = ["MCL", "PL"]
    # BUG FIX: ("1") is just the string "1" — a one-element tuple needs a
    # trailing comma. Iterating the bare string only worked by accident for
    # single-character tube names.
    tubes = ("1",)
    mapping = None
    # Balance both groups to the same small case count.
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube]
                      for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
        })
    # Restrict validation and training sets to the ids listed in the files.
    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)

    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    binarizer, model = load_model(args.model)

    trainseq = som_dataset.SOMSequence(train_dataset,
                                       binarizer,
                                       tube=config.tubes,
                                       pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(validate_dataset,
                                       binarizer,
                                       tube=config.tubes,
                                       pad_width=config.pad_width)

    model.fit_generator(generator=trainseq,
                        epochs=10,
                        validation_data=validseq)

    # Persist model, binarizer, config and the exact train/validation ids.
    args.output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(args.output / "model.h5"))

    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels,
                           OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")
Example #18
0
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """
    Train a CNN on tubes 2-4 with class weighting and report confusion metrics.

    Args:
        data: Path to som dataset
        meta: Path to dataset metadata (unused in this variant).
        output: Output path
    """
    tubes = ("2", "3", "4")
    pad_width = 1

    # Map fine-grained diagnoses onto the coarser 8-class scheme.
    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    # Exclude LPL and MZL cases even though they remain in `groups`.
    dataset = dataset.filter(groups=[g for g in groups if g not in ("LPL", "MZL")])

    dataset_groups = {d.group for d in dataset}

    # if set(groups) != dataset_groups:
    #     raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    # NOTE(review): create_split(10, ...) — first value is the validation
    # portion here; confirm the intended split size against the API.
    validate, train = dataset.create_split(10, stratify=True)

    # Build inverse-frequency class weights, normalized so the smallest is 1.
    group_count = train.group_count
    num_cases = sum(group_count.values())
    balanced_nums = num_cases / len(dataset_groups)
    balanced_loss_weights = [balanced_nums / group_count.get(g, balanced_nums) for g in groups]
    min_ratio = min(balanced_loss_weights)
    balanced_loss_weights = {i: v / min_ratio for i, v in enumerate(balanced_loss_weights)}
    print(balanced_loss_weights)

    # train = train.balance(2000)
    # train = train.balance_per_group({
    #     "CM": 6000,
    #     # "CLL": 4000,
    #     # "MBL": 2000,
    #     "MCL": 1000,
    #     "PL": 1000,
    #     "LPL": 1000,
    #     "MZL": 1000,
    #     "FL": 1000,
    #     "HCL": 1000,
    #     "normal": 6000,
    # })

    # Record the exact split membership for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the model input dimensions to account for the padding.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-7)

    def getter_fun(sample, tube):
        # Fetch the SOM array for one tube of a sample.
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=15, shuffle=True,
        callbacks=[tensorboard_callback, nan_callback],
        class_weight=balanced_loss_weights,
        generator=trainseq, validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Predict on the validation split and summarize performance.
    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    confusion = metrics.confusion_matrix(true_labels, pred_labels, labels=groups)
    print(groups)
    print(confusion)
    balanced = metrics.balanced_accuracy_score(true_labels, pred_labels)
    print(balanced)
Example #19
0
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pandas as pd
from flowcat import utils, io_functions

# Collect quantization-error measurements from the four run directories,
# keyed by directory name.
_run_dirs = [
    "output/4-flowsom-cmp/quantization-error/flowsom-10",
    "output/4-flowsom-cmp/quantization-error/flowcat-refit-s10",
    "output/4-flowsom-cmp/quantization-error/flowsom-32",
    "output/4-flowsom-cmp/quantization-error/flowcat-refit-s32",
]
input_data = {}
for _run_path in map(utils.URLPath, _run_dirs):
    input_data[_run_path.name] = io_functions.load_json(
        _run_path / "quantization_error.json")

# Flatten into one record per (dataset, case, tube) measurement; the run
# name encodes the algorithm prefix and the grid size suffix.
_records = []
for _name, _entries in input_data.items():
    _algo = _name.split("-")[0]
    _size = int(_name.split("-")[-1].lstrip("s"))
    for _label, _tube, _value in _entries:
        _records.append({
            "dataset": _name,
            "id": _label,
            "tube": _tube,
            "qe": _value,
            "algo": _algo,
            "size": _size,
        })
input_data = _records

data = pd.DataFrame(input_data)

sns.set_style("white")
Example #20
0
    for marker in markers:
        marker_names.append(
            Marker(antibody=Marker.name_to_marker(marker).antibody,
                   color=None))
    selected_markers = {"1": marker_names}
    return selected_markers


# Load the merged MLL9F case collection together with its training metadata.
dataset = io_functions.load_case_collection(
    utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F"),
    utils.URLPath(
        "/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/train.json.gz"
    ))

# Case ids selected for building the reference SOM.
references = io_functions.load_json(
    utils.URLPath(
        "/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/references.json"
    ))

OUTPUT = utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_SOM/MLL9F")

setup_logging(None, "generate ref SOM for merged FCS")

# Restrict the dataset to the reference cases only.
ref_dataset = dataset.filter(labels=references)

tensorboard_dir = None

# Discover channels in the given dataset
markers = get_tube_marker(ref_dataset)
# markers = read_sel_markers(sel_markers)
print(markers)
Example #21
0
def main(args):
    """Fine-tune a pretrained SOM classifier on a panel-specific dataset.

    Loads the weights of a base model, freezes its two dense layers, and
    retrains the remaining layers on the provided train/validation splits.

    Args:
        args: Parsed CLI namespace with attributes ``input`` (SOM dataset
            path), ``val``/``train`` (JSON label-list paths), ``output``
            (model output directory), ``panel`` ("MLL", "ERLANGEN" or
            other), ``basemodel`` (directory containing ``model.h5``) and
            ``bal`` (per-group balancing count).
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    output = args.output
    panel = args.panel

    # Select the classification groups according to the panel.
    if panel == "MLL":
        groups = GROUPS
    elif panel == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    # BUGFIX: ("1") is the string "1", not a tuple; iterating it only
    # worked by accident because the single tube id is one character.
    tubes = ("1",)
    mapping = None

    # Balance every group to the same target case count.
    balance = {group: args.bal for group in groups}

    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube]
                      for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
            "train_epochs": 15,
        })

    # Restrict the dataset to the externally provided label splits.
    val_labels = io_functions.load_json(args.val)
    validate_dataset = dataset.filter(labels=val_labels)

    train_labels = io_functions.load_json(args.train)
    train_dataset = dataset.filter(labels=train_labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    # Load the base model and transplant its weights into a fresh model.
    base_model = models.load_model(str(args.basemodel / "model.h5"))
    model = create_model(config.inputs, config.output)
    model.set_weights(base_model.get_weights())

    # freeze 2 dense layers: check for each dataset
    model.get_layer('dense_1').trainable = False
    model.get_layer('dense_2').trainable = False

    model.compile(loss=config.get_loss(modeldir=None),
                  optimizer="adam",
                  metrics=["accuracy"])

    # Wrap in SOMClassifier to reuse its sequence and training helpers.
    model = SOMClassifier(config, model)

    train_seq = model.create_sequence(train_dataset, config.train_batch_size)

    if validate_dataset is not None:
        validate_seq = model.create_sequence(validate_dataset,
                                             config.valid_batch_size)
    else:
        validate_seq = None

    model.train_generator(train_seq,
                          validate_seq,
                          epochs=config.train_epochs,
                          class_weight=None)

    model.save(output)
    model.save_information(output)
Exemple #22
0
# flake8: noqa
"""Create plots for channel occlusion data."""
import numpy as np
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_style("white")
from flowcat import io_functions, utils, mappings

# Input/output locations for the channel-occlusion analysis plots.
data = utils.URLPath("output/0-final/model-analysis/occlusion")
output = utils.URLPath("output/0-final/model-analysis/occlusion/plots")
output.mkdir()

# One entry per JSON file: (group name taken from the filename prefix
# before "_", loaded occlusion records).
group_data = [(p.name.split("_")[0], io_functions.load_json(p))
              for p in data.glob("*.json")]

# Per (group, tube): mean of each record's third field and RMS of the
# fourth, over records whose first field matches the tube.
# NOTE(review): fields t[2]/t[3] are presumably value and spread — confirm
# against the code that writes these JSON files.
group_tubes = [
    (group, tube, np.mean([t[2] for t in gdata if t[0] == tube]),
     np.sqrt(np.mean([np.power(t[3], 2) for t in gdata if t[0] == tube])))
    for group, gdata in group_data for tube in ("1", "2", "3")
]
colors = sns.color_palette("Blues")

pos = np.arange(len(mappings.GROUPS))

fig, ax = plt.subplots()
ax.bar(
    [
        pos[mappings.GROUPS.index(g)] + (int(t) - 2) * 0.2
Exemple #23
0
def load_somclassifier_config(path: utils.URLPath) -> "SOMClassifierConfig":
    """Deserialize a SOMClassifierConfig from the JSON file at *path*."""
    raw_config = io_functions.load_json(path)
    return SOMClassifierConfig(**raw_config)
Exemple #24
0
def plot_channel_densities(tube: str, channels: List[str],
                           output: utils.URLPath):
    """Plot the channel densities for a given dataset.

    Compares Berlin and Munich samples: per-group hex-bin joint plots, a
    CD19 KDE across tubes, and a Kappa KDE after per-case standard scaling.

    NOTE(review): the ``tube``, ``channels`` and ``output`` parameters are
    all reassigned inside the body, so the passed-in arguments are
    effectively ignored — confirm whether they can be dropped.

    Args:
        tube: Tube for which intensities should be generated.
        channels: List of channels used in generation.
        output: Output directory of plots.
    """
    berlin_dataset, munich_dataset = load_datasets()

    # berlin_sample = berlin_dataset.sample(10)
    # groups = list(berlin_sample.group_count.keys())

    # munich_sample = munich_dataset.sample(10, groups=groups)

    output = utils.URLPath("output/50-berlin_dataset/plot_channel_densities")

    # io_functions.save_json(berlin_sample.labels, output / "berlin_sample_labels.json")
    # io_functions.save_json(munich_sample.labels, output / "munich_sample_labels.json")

    # Reuse previously persisted sample label lists so reruns compare the
    # same cases.
    berlin_sample_ids = io_functions.load_json(output /
                                               "berlin_sample_labels.json")
    munich_sample_ids = io_functions.load_json(output /
                                               "munich_sample_labels.json")
    berlin_sample = berlin_dataset.filter(labels=berlin_sample_ids)
    munich_sample = munich_dataset.filter(labels=munich_sample_ids)

    berlin_markers = berlin_sample.selected_markers

    from collections import defaultdict
    # find best match for each munich tube
    for tube, markers in munich_sample.selected_markers.items():
        counts = defaultdict(int)
        for btube, bmarkers in berlin_markers.items():
            # Keep only the part before the first space for a name-only
            # fallback match (e.g. "CD19 ECD" -> "CD19").
            bmarkers_name_only = [m.split()[0] for m in bmarkers]
            for marker in markers:
                marker = marker.replace("-", " ")
                mname = marker.split()[0]
                if marker in bmarkers:
                    print(btube, marker)
                    counts[btube] += 1
                elif mname in bmarkers_name_only:
                    print(btube, mname)
        print(counts)

    tube = "1"
    channels = ("CD45-KrOr", "SS INT LIN")

    output = utils.URLPath(
        "output/50-berlin_dataset/plot_channel_densities/plots")
    output.mkdir()

    sns.set_style("white")

    # create hex bin plot
    for name, dataset in (("berlin", berlin_sample), ("munich",
                                                      munich_sample)):
        for group in ("normal", "MCL", "CLL", "FL", "LPL", "MZL", "HCL"):
            # NOTE(review): the sample is not filtered by `group` here, so
            # every group iteration plots the same data under a different
            # filename — confirm whether a group filter is missing.
            datas_x, datas_y = data_to_channels(dataset, tube, channels)
            data_x = pd.concat(datas_x).reset_index(drop=True, inplace=False)
            data_y = pd.concat(datas_y).reset_index(drop=True, inplace=False)

            plt.figure()
            sns.jointplot(data_x, data_y, kind="hex")
            plt.savefig(str(output / f"hex_{name}_{group}.png"))
            plt.close("all")

    # create kde plot in one dimension
    group = "CLL"
    channel = "CD19 ECD"
    btubes = ("2", "3", "4")
    berlin_gsample = [c for c in berlin_sample if c.group == group]
    berlin_ts = [(tube,
                  pd.concat(data_to_channel(berlin_gsample, tube,
                                            channel)).reset_index(drop=True))
                 for tube in btubes]

    mchannel = "CD19-APCA750"
    mtubes = ("1", "2", "3")
    munich_gsample = [c for c in munich_sample if c.group == group]
    munich_ts = [(tube,
                  pd.concat(data_to_channel(munich_gsample, tube,
                                            mchannel)).reset_index(drop=True))
                 for tube in mtubes]

    fig, ax = plt.subplots()
    for tube, berlin_t in berlin_ts:
        sns.kdeplot(berlin_t, ax=ax, color="blue", label=f"Berlin {tube}")
    for tube, munich_t in munich_ts:
        sns.kdeplot(munich_t, ax=ax, color="red", label=f"Munich {tube}")
    fig.suptitle(f"{group} {channel} {mchannel}")
    fig.savefig(str(output / f"kde_munich_berlin_{group}_CD19.png"))

    # create kde plot after rescaling
    group = "CLL"
    channel = "Kappa FITC"
    # NOTE(review): ("2") is the string "2", not a 1-tuple; the loop below
    # works only because the single tube id is one character — probably
    # meant ("2",).
    btubes = ("2")
    berlin_gsample = [c for c in berlin_sample if c.group == group]
    berlin_ts = []
    for tube in btubes:
        datas = data_to_channel(berlin_gsample, tube, channel)
        transformed = []
        for data in datas:
            data = data.values
            data = data.reshape(-1, 1).astype("float32")
            # Standardize each case independently (zero mean, unit variance)
            # before pooling, so differently scaled cases are comparable.
            tf = preprocessing.StandardScaler().fit_transform(data)
            transformed.append(tf.flatten())
        merged = np.concatenate(transformed)
        berlin_ts.append((tube, merged))

    mchannel = "Kappa-FITC"
    # NOTE(review): same string-instead-of-tuple pattern as btubes above.
    mtubes = ("2")
    munich_gsample = [c for c in munich_sample if c.group == group]
    munich_ts = []
    for tube in mtubes:
        datas = data_to_channel(munich_gsample, tube, mchannel)
        transformed = []
        for data in datas:
            data = data.values
            data = data.reshape(-1, 1).astype("float32")
            tf = preprocessing.StandardScaler().fit_transform(data)
            transformed.append(tf.flatten())
        merged = np.concatenate(transformed)
        munich_ts.append((tube, merged))

    fig, ax = plt.subplots()
    for tube, berlin_t in berlin_ts:
        sns.kdeplot(berlin_t, ax=ax, color="blue", label=f"Berlin {tube}")
    for tube, munich_t in munich_ts:
        sns.kdeplot(munich_t, ax=ax, color="red", label=f"Munich {tube}")
    fig.suptitle(f"{group} {channel} {mchannel}")
    fig.savefig(str(output / f"standard_kde_munich_berlin_{group}_kappa.png"))