Example #1
def from_case_dataset(path):
    dataset = io_functions.load_case_collection(path)
    metadata = pd.DataFrame([{
        "label": c.id,
        "group": c.group
    } for c in dataset])
    return metadata
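
A minimal usage sketch for the helper above, assuming the flowcat imports used throughout these examples; the dataset path is a placeholder:

# Placeholder path: any directory loadable by io_functions.load_case_collection.
meta_df = from_case_dataset(utils.URLPath("output/test-dataset"))
print(meta_df["group"].value_counts())  # number of cases per cohort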
Example #2
def transform(data: utils.URLPath,
              meta: utils.URLPath,
              output: utils.URLPath,
              reference: utils.URLPath,
              transargs: json.loads = None,
              sample: int = 0):
    """Transform dataset using a reference SOM.

    Args:
        recreate: Delete and recreate SOMs even if they already exist.
        sample: Number of samples to transform from each group, only useful for testing purposes.
    """
    dataset = io_functions.load_case_collection(data, meta)

    # randomly sample 'sample' cases from each group
    if sample:
        dataset = dataset.sample(sample)

    if transargs is None:
        transargs = DEFAULT_TRANSFORM_SOM_ARGS

    print(f"Loading referece from {reference}")
    model = io_functions.load_casesom(reference, **transargs)

    transform_dataset_to_som(model, dataset, output)
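
A sketch of calling transform directly with explicit transargs; the paths are placeholders and the option names mirror the transform settings spelled out later in Example #7:

transform(
    data=utils.URLPath("output/dataset/fcs"),          # placeholder paths
    meta=utils.URLPath("output/dataset/meta.json"),
    output=utils.URLPath("output/dataset/som"),
    reference=utils.URLPath("output/reference-som"),
    transargs={"max_epochs": 4, "batch_size": 50000,
               "initial_radius": 4, "end_radius": 1},
    sample=5,  # only 5 cases per group for a quick test run
)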
Example #3
def main():
    # dataset = io_functions.load_case_collection(
    #     utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"),
    #     utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/test.json.gz")
    # )
    dataset = io_functions.load_case_collection(
        utils.URLPath("/data/flowcat-data/paper-cytometry/unused-data"), )

    LOGGER.info("Anonymizing dataset: %s", dataset)

    OUTPUT = utils.URLPath(
        "/data/flowcat-data/paper-cytometry-resubmit/unused_data_anonymized")

    data_dir = OUTPUT / "data"
    data_dir.mkdir()

    for case in dataset:
        # if case.id != "ffc59330acb49e6fcf5e679dbabcd01e56991345":
        #     continue

        for sample in case.samples:
            old_path = sample.complete_path
            new_path = data_dir / sample.path

            LOGGER.info("Saving %s sample to %s", case.id, new_path)

            new_path.parent.mkdir()
            anon_move(str(old_path), str(new_path))
Example #4
def load_datasets(data_path):
    datasets = {}
    for d in filter(lambda d: d.is_dir(), data_path.iterdir()):
        datasets[d.name] = {
            "data": io_functions.load_case_collection(d, d + ".json"),
            "config": io_functions.load_json(d + "_config.json"),
        }
    return datasets
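
A usage sketch, assuming the directory layout the helper expects: each subdirectory <name> of data_path sits next to a <name>.json metadata file and a <name>_config.json; the path below is a placeholder:

datasets = load_datasets(utils.URLPath("output/som-datasets"))
for name, entry in datasets.items():
    print(name, entry["data"])     # the loaded CaseCollection
    print(list(entry["config"]))   # top-level keys of the matching config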
Example #5
def create_tsne(data: utils.URLPath, meta: utils.URLPath, plotdir: utils.URLPath):
    """Generate tsne plots for a subsample of data.

    Args:
        data: Path to generated soms for cases.
        meta: Path to metadata json for cases.
        plotdir: Path to output plots for data.
    """
    # data = flowcat.utils.URLPath("output/test-2019-08/som")
    # meta = flowcat.utils.URLPath("output/test-2019-08/som.json")
    # plotdir = flowcat.utils.URLPath("output/test-2019-08/tsne")

    cases = io_functions.load_case_collection(data, meta)

    # cases = cases.sample(20, flowcat.mappings.GROUPS)
    # flowcat.io_functions.save_json(cases.labels, plotdir / "case_ids.json")
    # labels = io_functions.load_json(plotdir / "case_ids.json")
    # cases = cases.filter(labels=labels)
    # print(cases)

    groups = np.array([case.group for case in cases])

    colors = {
        "CLL": "red",
        "MBL": "dodgerblue",
        "MCL": "steelblue",
        "PL": "skyblue",
        "LPL": "limegreen",
        "MZL": "forestgreen",
        "FL": "springgreen",
        "HCL": "orchid",
        "normal": "darkgoldenrod",
    }

    plotdir.mkdir()

    for tube in ("1", "2", "3"):
        soms = []
        for case in cases:
            sample = case.get_tube(tube, kind="som")
            sample.path = data / f"{case.id}_t{tube}.npy"
            som = sample.get_data().data.flatten()
            soms.append(som)
        somdata = np.array(soms)

        tsne = manifold.TSNE(n_components=2, perplexity=10)
        transformed = tsne.fit_transform(somdata)

        fig, ax = plt.subplots(figsize=(11, 7))
        for group in mappings.GROUPS:
            gdata = transformed[groups == group]
            gx = gdata[:, 0]
            gy = gdata[:, 1]
            ax.scatter(gx, gy, c=colors[group], label=group)

        plt.legend()
        plt.savefig(plotdir / f"tsne_{tube}.png")
        plt.close("all")
Example #6
def dataset(data: utils.URLPath, meta: utils.URLPath):
    """Print information on the given dataset."""
    try:
        dataset = io_functions.load_case_collection(data, meta)
    except TypeError:
        dataset = io_functions.load_case_collection_from_caseinfo(data, meta)

    print(f"Loaded dataset from {meta}", dataset)
    print(dataset.group_count)
Example #7
def main(
    data: utils.URLPath,
    meta: utils.URLPath,
    output: utils.URLPath,
    reference_ids: utils.URLPath = None,
    reference: utils.URLPath = None,
    tensorboard_dir: utils.URLPath = None,
    modelargs: json.loads = None,
    transargs: json.loads = None,
    mode: str = "fit_transform",
):
    """
    Train a SOM and use its weights to initialize individual SOM training.

    Args:
        data: Path to fcs data.
        meta: Path to dataset metadata, this should correctly reference fcs data.
        output: Path to output model and transformed cases.
        reference_ids: Optionally list ids to be used for reference SOM generation.
        reference: Optionally use pretrained model.
        modelargs: Optionally give specific options for reference SOM generation.
        transargs: Optionally give specific options for transforming individual SOMs.
        mode: Whether to fit or to transform. Default both.
    """
    dataset = io_functions.load_case_collection(data, meta)

    if reference is None:
        reference_ids = io_functions.load_json(reference_ids)
        reference_dataset = dataset.filter(labels=reference_ids)
        print("Training reference SOM on", reference_dataset)
        reference = train_model(reference_dataset, modelargs=modelargs)
        reference_output = output / "reference"
        io_functions.save_casesom(reference, reference_output)
        reference = reference_output

    if mode == "fit":
        return

    if transargs is None:
        transargs = {
            "max_epochs": 4,
            "batch_size": 50000,
            "initial_radius": 4,
            "end_radius": 1,
        }

    model = io_functions.load_casesom(reference,
                                      tensorboard_dir=tensorboard_dir,
                                      **transargs)

    som_output = output / "som"
    transform_cases(dataset, model, som_output)
Example #8
def main(
    fcsdata: utils.URLPath,
    fcsmeta: utils.URLPath,
    somdata: utils.URLPath,
    output: utils.URLPath,
):

    fcs_dataset = io_functions.load_case_collection(fcsdata, fcsmeta)
    try:
        som_config = io_functions.load_json(somdata + "_config.json")
    except FileNotFoundError:
        som_config = None

    if som_config is None:
        selected_markers = fcs_dataset.selected_markers
    else:
        selected_markers = {t: d["channels"] for t, d in som_config.items()}

    tubes = ("1", "2", "3")

    model = quantization_error_model()
    sess = tf.Session()
    results = []
    for fcscase in fcs_dataset:
        print(fcscase)
        for tube in tubes:
            fcssample = fcscase.get_tube(tube, kind="fcs").get_data()
            somsample = get_som_data(fcscase.id, tube, somdata,
                                     selected_markers[tube])
            error = sample_quantization_error(fcssample, somsample, model,
                                              sess)
            results.append((fcscase.id, tube, error))

    # aggregate per tube; divide by the per-tube count so each mean/variance
    # only reflects the results of that tube
    errors_per_tube = {t: [r[-1] for r in results if r[1] == t] for t in tubes}
    stats = {}
    stats["mean"] = {
        t: sum(errs) / len(errs) for t, errs in errors_per_tube.items()
    }
    stats["variance"] = {
        t: sum(np.power(e - stats["mean"][t], 2) for e in errs) / len(errs)
        for t, errs in errors_per_tube.items()
    }
    print("Mean quantization error", stats)

    io_functions.save_json(results, output / "quantization_error.json")
    io_functions.save_json(stats, output / "quantization_error_mean.json")
Example #9
def main(
        data: utils.URLPath = None,
        model: utils.URLPath = None,
        preds: utils.URLPath = None,
        output: utils.URLPath = None,
):
    data = utils.URLPath("/data/flowcat-data/paper-cytometry/som/unused")
    dataset = io_functions.load_case_collection(data, data + ".json.gz")
    # output = utils.URLPath("/data/flowcat-data/paper-cytometry/tsne")
    output = utils.URLPath("teststuff_unused_style")
    output.mkdir()

    # predictions = io_functions.load_json(utils.URLPath("/data/flowcat-data/paper-cytometry/tsne/prediction.json"))
    model = SOMClassifier.load(utils.URLPath("/data/flowcat-data/paper-cytometry/classifier"))

    som_tsne(dataset, model, output)
Example #10
def main(args):
    """Load case ids from json file to filter cases and train and save the created model."""
    output_dir = args.output

    dataset = io_functions.load_case_collection(args.data, args.meta)

    selected_labels = io_functions.load_json(args.cases)
    selected, _ = dataset.filter_reasons(labels=selected_labels)

    if args.tensorboard:
        tensorboard_dir = output_dir / "tensorboard"
    else:
        tensorboard_dir = None

    model = train_model(selected,
                        markers=args.markers,
                        tensorboard=tensorboard_dir,
                        marker_name_only=args.marker_name_only)

    io_functions.save_casesom(model, output_dir)
Example #11
def filter(
    data: utils.URLPath,
    filters: json.loads,
    output: utils.URLPath = None,
    meta: utils.URLPath = None,
    sample: int = 0,
    move_samples: bool = False,
):
    """Filter data on the given filters and output resulting dataset metadata
    to destination.

    Args:
        data: Path to fcs data.
        meta: Path to dataset metadata.
        output: Path to output for metadata.
        filters: Filters for individual cases.
        sample: Number of cases per group.
        move_samples: Destination will also include sample data.
    """
    print(f"Loading existing dataset from {data} with metadata in {meta}")
    try:
        dataset = io_functions.load_case_collection(data, meta)
    except TypeError:
        dataset = io_functions.load_case_collection_from_caseinfo(data, meta)

    dataset = dataset.filter(**filters)
    if sample:
        dataset = dataset.sample(sample)

    print(f"Filtering down to {dataset}")
    print(dataset.group_count)

    if output:
        print("Saving", dataset, f"to {output}")
        if move_samples:
            io_functions.save_case_collection_with_data(dataset, output)
        else:
            io_functions.save_case_collection(dataset, output)
Example #12
def main(args):
    """Load a model with given transforming arguments and transform individual
    cases."""
    cases = io_functions.load_case_collection(args.data, args.meta)
    # cases = cases.sample(1, groups=["CLL", "normal"])
    selected_markers = cases.selected_markers
    marker_name_only = False

    if args.tensorboard:
        tensorboard_dir = args.output / "tensorboard"
    else:
        tensorboard_dir = None

    # scaler = "RefitMinMaxScaler"
    scaler = args.scaler
    # Training parameters for the model can be respecified; the only difference
    # between transform and normal training is that after a transformation is
    # completed, the original weights are restored to the model.
    model = casesom.CaseSom(
        tubes=selected_markers,
        tensorboard_dir=tensorboard_dir,
        modelargs={
            "marker_name_only": marker_name_only,
            "max_epochs": 5,
            "batch_size": 50000,
            "initial_radius": int(args.size / 2),
            "end_radius": 1,
            "radius_cooling": "linear",
            # "marker_images": sommodels.fcssom.MARKER_IMAGES_NAME_ONLY,
            "map_type": "toroid",
            "dims": (args.size, args.size, -1),
            "scaler": scaler,
        }
    )

    transform_cases(cases, model, args.output)
Example #13
def load_matching_som_dataset(
        fcs_dataset: "CaseCollection",
        som_dataset_path: utils.URLPath) -> "CaseCollection":
    """Check whether the given som path contains a complete SOM dataset matching the given FCS dataset.

    Otherwise return None.
    """
    try:
        som_dataset = io_functions.load_case_collection(som_dataset_path)
    except Exception as e:
        LOGGER.warning("Loading existing dataset at %s produced error: %s",
                       som_dataset_path, e)
        return None

    same_case_number = len(fcs_dataset) == len(som_dataset)
    same_sample_count = len([1 for c in fcs_dataset
                             for s in c.samples]) == len(
                                 [1 for c in som_dataset for s in c.samples])
    if not (same_case_number and same_sample_count):
        LOGGER.warning(
            "Existing som dataset at %s does not match number of samples or cases of given FCS dataset",
            som_dataset_path)
        return None
    return som_dataset
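
A sketch of the reuse pattern this helper supports: only regenerate the SOM dataset when no matching one exists on disk. Here fcs_dataset, som_path and model stand in for the caller's FCS collection, the SOM output directory and a trained CaseSom; the regeneration call is a placeholder for whatever transform step produced the SOMs (compare Example #2):

som_dataset = load_matching_som_dataset(fcs_dataset, som_path)
if som_dataset is None:
    # nothing usable on disk: regenerate and reload (placeholder transform step)
    transform_dataset_to_som(model, fcs_dataset, som_path)
    som_dataset = io_functions.load_case_collection(som_path)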
Example #14
def load_case_collection(data: str, meta: str = None):
    data = utils.URLPath(data)
    if meta is not None:
        meta = utils.URLPath(meta)

    return io_functions.load_case_collection(data, meta)
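
Because this wrapper converts plain strings to URLPath before delegating, it can be called with ordinary string paths; the paths here are placeholders:

dataset = load_case_collection("output/test-dataset", "output/test-dataset.json")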
Example #15
def main(data: utils.URLPath, reference: utils.URLPath, output: utils.URLPath):
    """
    """
    cases = io_functions.load_case_collection(data, data / data.name + ".json")
    default_settings = {
        "max_epochs": 4,
        "initial_learning_rate": 0.05,
        "end_learning_rate": 0.01,
        "batch_size": 50000,
        "initial_radius": 4,
        "end_radius": 1,
    }
    # settings = [
    #     ("learning_rate_001_0001", {"initial_learning_rate": 0.01, "end_learning_rate": 0.001}),
    #     ("learning_rate_001_001", {"initial_learning_rate": 0.01, "end_learning_rate": 0.01}),
    #     ("learning_rate_005_0001", {"initial_learning_rate": 0.05, "end_learning_rate": 0.001}),
    #     ("learning_rate_005_001", {"initial_learning_rate": 0.05, "end_learning_rate": 0.01}),
    #     ("learning_rate_005_005", {"initial_learning_rate": 0.05, "end_learning_rate": 0.05}),
    #     ("learning_rate_05_0001", {"initial_learning_rate": 0.5, "end_learning_rate": 0.001}),
    #     ("learning_rate_05_001", {"initial_learning_rate": 0.5, "end_learning_rate": 0.01}),
    #     ("learning_rate_05_01", {"initial_learning_rate": 0.5, "end_learning_rate": 0.1}),
    #     ("learning_rate_05_05", {"initial_learning_rate": 0.5, "end_learning_rate": 0.5}),
    # ]
    settings = [
        ("radius_24_1", {
            "initial_radius": 24,
            "end_radius": 1
        }),
        ("radius_24_2", {
            "initial_radius": 24,
            "end_radius": 2
        }),
        ("radius_24_1", {
            "initial_radius": 16,
            "end_radius": 1
        }),
        ("radius_16_2", {
            "initial_radius": 16,
            "end_radius": 2
        }),
        ("radius_8_1", {
            "initial_radius": 8,
            "end_radius": 1
        }),
        ("radius_8_2", {
            "initial_radius": 8,
            "end_radius": 2
        }),
        ("radius_4_1", {
            "initial_radius": 4,
            "end_radius": 1
        }),
        ("radius_4_2", {
            "initial_radius": 4,
            "end_radius": 2
        }),
    ]
    for name, setting in settings:
        model = io_functions.load_casesom(
            reference,
            **{
                **default_settings,
                **setting
            },
        )
        transform_data(cases, model, output / name)
Example #16
def lenient_load_collection(data, meta):
    try:
        dataset = io_functions.load_case_collection(data, meta)
    except TypeError:
        dataset = io_functions.load_case_collection_from_caseinfo(data, meta)
    return dataset
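
The try/except covers the two metadata flavours seen throughout these examples: current metadata loads via load_case_collection, while older case_info-style metadata needs load_case_collection_from_caseinfo. A small usage sketch with placeholder paths:

dataset = lenient_load_collection(
    utils.URLPath("/data/some-dataset"),
    utils.URLPath("/data/some-dataset/meta.json"))
print(dataset.group_count)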
Example #17
"""
Acquire FCS information needed for the MIFlowCyt document.

Also roughly check whether we have strongly diverging data in our dataset.
"""
from flowcat import dataset as fc_dataset, io_functions, utils
import fcsparser


def section(text, level=4, deco="#"):
    deco_text = deco * level
    section_text = f"{deco_text} {text} {deco_text}"
    print(section_text)


train_dataset = io_functions.load_case_collection(utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"), utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/train.json.gz"))
test_dataset = io_functions.load_case_collection(utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"), utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/test.json.gz"))

print("Loading all data used in paper analysis.")
dataset = train_dataset + test_dataset
print(dataset)

section("Get info for case 0")
case = dataset[0]
print(case)

sample = case.samples[0]
meta, data = fcsparser.parse(sample.complete_path)
for i in range(1, 13):
    name = f"$P{i}S"
    voltage = f"$P{i}V"
    # report stain name and detector voltage keywords for each parameter
    print(name, meta.get(name), voltage, meta.get(voltage))
Example #18
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """
    Args:
        data: Path to fcs dataset data
        meta: Path to fcs dataset metainformation
        output: Output path
    """
    tubes = ("1", "2")
    sample_size = 512
    # group_mapping = mappings.GROUP_MAPS["6class"]
    # mapping = group_mapping["map"]
    mapping = None
    groups = mappings.GROUPS
    # groups = group_mapping["groups"]

    dataset = io_functions.load_case_collection(data, meta)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    validate, train = dataset.create_split(50)
    print(train.group_count)
    # train = train.balance(1000).shuffle()
    train = train.sample(100).shuffle()
    print(train.group_count)

    group_count = train.group_count
    group_weights = classification_utils.calculate_group_weights(group_count)
    group_weights = {
        i: group_weights.get(g, 1.0)
        for i, g in enumerate(groups)
    }

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    train_seq = FCSSequence(train,
                            binarizer,
                            tubes=tubes,
                            sample_size=sample_size,
                            batch_size=64)
    validate_seq = FCSSequence(validate,
                               binarizer,
                               tubes=tubes,
                               sample_size=sample_size,
                               batch_size=128)

    config = {
        "tubes": tubes,
        "groups": groups,
    }
    io_functions.save_json(config, output / "config.json")

    # for tube in tubes:
    #     x, y, z = selected_tubes[tube]["dims"]
    #     selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 2,
        ("MBL", "normal"): 2,
        ("MCL", "normal"): 2,
        ("PL", "normal"): 2,
        ("LPL", "normal"): 2,
        ("MZL", "normal"): 2,
        ("FL", "normal"): 2,
        ("HCL", "normal"): 2,
    }
    cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)

    model = create_fcs_model(train_seq.xshape,
                             train_seq.yshape,
                             global_decay=5e-5)
    model.compile(
        # loss="categorical_crossentropy",
        # loss=keras.losses.CategoricalCrossentropy(),
        loss=classification_utils.WeightedCategoricalCrossentropy(cost_matrix),
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            "acc",
            # keras.metrics.CategoricalAccuracy(),
            # keras.metrics.TopKCategoricalAccuracy(k=2),
            # top2_acc,
        ])
    model.summary()

    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=20,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=train_seq,
        validation_data=validate_seq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    preds = []
    for pred in model.predict_generator(validate_seq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validate_seq.true_labels

    generate_all_metrics(true_labels, pred_labels, {
        "groups": groups,
        "map": {}
    }, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
Example #19
def load_datasets():
    berlin_dataset = io_functions.load_case_collection(**BERLIN_DATA)
    munich_dataset = io_functions.load_case_collection(**MUNICH_DATA)
    return berlin_dataset, munich_dataset
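
BERLIN_DATA and MUNICH_DATA are not shown on this page; given how load_case_collection is invoked elsewhere, they are presumably keyword dicts of the following shape (paths hypothetical):

BERLIN_DATA = {
    "data": utils.URLPath("/data/berlin-dataset"),
    "meta": utils.URLPath("/data/berlin-dataset/meta.json"),
}
MUNICH_DATA = {
    "data": utils.URLPath("/data/munich-dataset"),
    "meta": utils.URLPath("/data/munich-dataset/meta.json"),
}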
Example #20
# pylint: skip-file
# flake8: noqa
import numpy as np
import pandas as pd

from flowcat.utils import URLPath
from flowcat.io_functions import load_case_collection
from flowcat.dataset import case_dataset

datapath = URLPath("output/test-2019-08/som")
metapath = URLPath("output/test-2019-08/som.json")
cases = load_case_collection(datapath, metapath)
print(cases)

nppath = URLPath("output/test-2019-08/somnp2")
nppath.mkdir()
all_num = len(cases)
for i, case in enumerate(cases):
    print(f"Converting {i}/{all_num}")
    for somsample in case.samples:
        somsample.path = datapath / f"{case.id}_t{somsample.tube}.csv"
        somdata = pd.read_csv(str(somsample.path), index_col=0)
        somarray = somdata.values
        somarray = somarray.reshape((32, 32, -1))
        newpath = nppath / f"{case.id}_t{somsample.tube}.npy"
        np.save(str(newpath), somarray)
Example #21
def main(data: utils.URLPath, model: utils.URLPath, output: utils.URLPath):
    dataset = io_functions.load_case_collection(data, data + ".json")
    dataset.set_data_path(utils.URLPath(""))

    model = SOMClassifier.load(model)
    validate = model.get_validation_data(dataset)
    val_seq = model.create_sequence(validate)

    trues = np.concatenate([val_seq[i][1] for i in range(len(val_seq))])
    preds = np.array([p for p in model.model.predict_generator(val_seq)])

    create_roc_results(trues, preds, output / "roc", model)
    create_threshold_results(trues, preds, output / "threshold", model)

    # tsne of result vectors
    embedding_path = output / "embedding-preds"
    embedding_path.mkdir()

    pred_labels = val_seq.true_labels
    groups = model.config["groups"]
    groups.remove("normal")
    groups = ["normal", *groups]
    all_groups = groups + ["AML", "MM", "HCLv"]
    colors = sns.cubehelix_palette(len(all_groups), rot=4, dark=0.30)
    perplexity = 50

    # tsne of intermediate layers
    intermediate_model = keras.Model(
        inputs=model.model.input,
        outputs=model.model.get_layer("concatenate_1").output)
    intermed_preds = np.array(
        [p for p in intermediate_model.predict_generator(val_seq)])

    # unknown data
    udata = utils.URLPath("output/unknown-cohorts-processing/som/som")
    udataset = io_functions.load_case_collection(udata, udata + ".json")
    udataset.set_data_path(utils.URLPath(""))
    un_seq = model.create_sequence(udataset)
    intermed_upreds = np.array(
        [p for p in intermediate_model.predict_generator(un_seq)])

    all_intermed = np.concatenate((intermed_preds, intermed_upreds))
    all_labels = pred_labels + un_seq.true_labels

    umap_inter_all = UMAP(n_neighbors=30).fit_transform(all_intermed)
    plot_embedded(umap_inter_all, all_labels, all_groups,
                  colors=colors).savefig(str(embedding_path /
                                             f"umap_intermediate_all.png"),
                                         dpi=300)

    tsne_inter_all = manifold.TSNE(
        perplexity=perplexity).fit_transform(all_intermed)
    plot_embedded(
        tsne_inter_all, all_labels, all_groups, colors=colors).savefig(str(
            embedding_path / f"tsne_intermediate_all_p{perplexity}.png"),
                                                                       dpi=300)

    # create som tsne for known and unknown data
    all_cases = validate.cases + udataset.cases

    case_data = []
    for case in all_cases:
        somdata = np.concatenate([
            case.get_tube(tube, kind="som").get_data().data
            for tube in model.config["tubes"]
        ],
                                 axis=2).flatten()
        case_data.append(somdata)
    case_data = np.array(case_data)

    perplexity = 50
    umap_som_all = UMAP(n_neighbors=30).fit_transform(case_data)
    plot_embedded(umap_som_all, all_labels, all_groups, colors=colors).savefig(
        str(embedding_path / f"umap_som_all.png"), dpi=300)

    tsne_som_all = manifold.TSNE(
        perplexity=perplexity).fit_transform(case_data)
    plot_embedded(tsne_som_all, all_labels, all_groups, colors=colors).savefig(
        str(embedding_path / f"tsne_som_all_p{perplexity}.png"), dpi=300)

    # plot legend
    fig = plt.figure()
    patches = [
        mpl.patches.Patch(color=color, label=group)
        for group, color in zip(all_groups, colors)
    ]
    fig.legend(patches, all_groups, loc='center', frameon=False)
    fig.savefig(str(embedding_path / "legend.png"), dpi=300)
Example #22
def read_sel_markers(selected_markers) -> "Dict[str, List[Marker]]":
    """Convert selected marker names into Marker objects, keyed by tube."""
    markers = list(selected_markers.values())[0]
    marker_names = []
    for marker in markers:
        marker_names.append(
            Marker(antibody=Marker.name_to_marker(marker).antibody,
                   color=None))
    selected_markers = {"1": marker_names}
    return selected_markers


dataset = io_functions.load_case_collection(
    utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F"),
    utils.URLPath(
        "/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/train.json.gz"
    ))

references = io_functions.load_json(
    utils.URLPath(
        "/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/references.json"
    ))

OUTPUT = utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_SOM/MLL9F")

setup_logging(None, "generate ref SOM for merged FCS")

ref_dataset = dataset.filter(labels=references)

tensorboard_dir = None
Example #23
from flowcat import io_functions, utils

from fcg_logging import create_logging_handlers, setup_logging

LOGPATH = utils.URLPath("logs/assess_quality_{utils.create_stamp()}.log")
LOGGER = setup_logging(LOGPATH, "assess_quality")

ungated_samples = list(utils.URLPath("output/ungated/data").glob("**/*.LMD"))
ungated_sample_count = len(ungated_samples)
gated_samples = list(
    utils.URLPath("output/gated_single/data").glob("**/*.LMD"))
gated_sample_count = len(gated_samples)

LOGGER.info("Gated/Ungated successful FCS count: %d/%d (%s %%)",
            gated_sample_count, ungated_sample_count,
            gated_sample_count / ungated_sample_count)

sample_dataset = io_functions.load_case_collection(
    utils.URLPath("output/samples"))

LOGGER.info(sample_dataset)


def foldername(path):
    return str(utils.URLPath(path.parent.name, path.name))


def ppp(v):
    LOGGER.info(v)
    return v


gated_samples_names = list(map(lambda p: foldername(p), gated_samples))
missing_paths = list(
Example #24
from flowcat import io_functions, utils
from flowcat.plots import som as fc_somplot

LOGPATH = utils.URLPath("logs/visualize_datasets_{utils.create_stamp()}.log")

LOGGER = utils.logs.setup_logging(LOGPATH, "visualize_datasets")
OUTPUT = utils.URLPath("output/visualization/soms-ungated")

# OUTPUT.mkdir()
# 
# som_dataset = io_functions.load_case_collection(utils.URLPath("output/classifier_ungated/som"))
# 
# # testsample = som_dataset[0].samples[0]
# 
# for case in som_dataset.filter(groups=["CLL"]):
#     testsample = case.get_tube("1", kind="som")
#     LOGGER.info(testsample)
#     somdata = testsample.get_data()
#     fig = fc_somplot.plot_som_grid(somdata, channels=["SS INT LIN", "CD45-KrOr", None])
#     fig.savefig(str(OUTPUT / f"test_{case.id}.png"))

OUTPUT = utils.URLPath("output/visualization/soms-original")
som_dataset = io_functions.load_case_collection(utils.URLPath("/data/flowcat-data/paper-cytometry/som/train"), utils.URLPath("/data/flowcat-data/paper-cytometry/som/train.json.gz"))
OUTPUT.mkdir()
for case in som_dataset.filter(groups=["CLL"]):
    testsample = case.get_tube("1", kind="som")
    LOGGER.info(testsample)
    somdata = testsample.get_data()
    fig = fc_somplot.plot_som_grid(somdata, channels=["SS INT LIN", "CD45-KrOr", None])
    fig.savefig(str(OUTPUT / f"test_{case.id}.png"))
Example #25
from collections import defaultdict

from flowcat import io_functions, utils, seed as fc_seed

INPUT = {
    "data":
    utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"),
    "meta":
    utils.URLPath(
        "/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/train.json.gz"
    ),
    "meta_test":
    utils.URLPath(
        "/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/test.json.gz"
    ),
}

train_dataset = io_functions.load_case_collection(INPUT["data"],
                                                  INPUT["meta_test"])
sorted_cases = sorted(train_dataset,
                      key=lambda c: c.infiltration
                      if c.infiltration > 0.0 else 1000)

perc01_count = 0
group_count = defaultdict(int)
for case in sorted_cases[:100]:
    print("Minimal infiltration sample:", case, case.infiltration)
    if case.infiltration == 0.1:
        perc01_count += 1
        group_count[case.group] += 1

print(perc01_count)
print(group_count)
Example #26
def main(data: utils.URLPath, meta: utils.URLPath, reference: utils.URLPath, model: utils.URLPath):
    data, meta, soms, model = map(utils.URLPath, [
        "/data/flowcat-data/mll-flowdata/decCLL-9F",
        "output/0-final-dataset/train.json.gz",
        "output/som-fix-test/soms-test/som_r4_1",
        "output/0-final/classifier-minmax-new",
    ])
    sommodel = utils.URLPath("output/som-fix-test/unjoined-ref")
    sommodel = io_functions.load_casesom(sommodel)

    output = utils.URLPath("output/0-final/model-analysis/saliency")
    output.mkdir()
    dataset = io_functions.load_case_collection(data, meta)
    soms = som_dataset.SOMDataset.from_path(soms)
    model = SaliencySOMClassifier.load(model)
    val_dataset = model.get_validation_data(dataset)
    val_seq = model.create_sequence(soms)

    selected_labels = [
        "c3a6098bd5216c7d1f958396dd31bd6ef1646c18",
        "df726c162ed728c2886107e665ad931e5bf0baae",
        "3eb03bea6651c302ac013f187b288ee990889b29",
        "e539b3ec66b1c9d7a0aae1fbd37c19c7ac86a18c",
        "762a2a19d1913383f41ead7b5ef74a8133d67847",
        "bbfafb3d9053e212279aaada5faf23eddf4a5926",
        "9503bfad60524615a06613cfbffa3861fb66ede3",
    ]
    sel_dataset = dataset.filter(labels=selected_labels)

    # annotate each fcs point with saliency info
    session = tf.Session()
    bmu_calc = calculate_bmu_indexes()

    normalize = mpl.colors.Normalize(vmin=0, vmax=1)

    case = sel_dataset[0]
    for case in sel_dataset:
        case_output = output / f"{case.id}_g{case.group}"
        case_output.mkdir()
        print("Plotting", case)

        # plot som and saliency activations
        result = model.calculate_saliency(val_seq, case, case.group, maximization=False)

        xdata, _ = val_seq.get_batch_by_label([case.id])
        xdata = [x[0, ...] for x in xdata]

        for tube in ("1", "2", "3"):
            fig = plot_saliency_som_map(model, xdata, result, tube, ("CD45-KrOr", "SS INT LIN", "CD19-APCA750"))
            fig.savefig(str(case_output / f"t{tube}_overlay.png"))

            fig = plot_saliency_scatterplot(model, bmu_calc, session, case, tube, xdata, result, norm=normalize)
            fig.savefig(str(case_output / f"t{tube}_scatter_saliency.png"))

    for case in sel_dataset:
        case_output = output / f"maxall_{case.id}_g{case.group}"
        case_output.mkdir()
        print("Plotting", case)

        # plot som and saliency activations
        result = model.calculate_saliency(val_seq, case, case.group, maximization=False)
        for r in result:
            print("Max", np.max(r))

        xdata, _ = val_seq.get_batch_by_label([case.id])
        xdata = [x[0, ...] for x in xdata]

        for tube in ("1", "2", "3"):
            fig = plot_saliency_som_map(model, xdata, result, tube, ("CD45-KrOr", "SS INT LIN", "CD19-APCA750"))
            fig.savefig(str(case_output / f"t{tube}_overlay.png"))

            fig = plot_saliency_scatterplot(model, bmu_calc, session, case, tube, xdata, result, norm=normalize)
            fig.savefig(str(case_output / f"t{tube}_scatter_saliency.png"))

    # case_som = soms.get_labels([case.id]).iloc[0]
    hcls = val_dataset.filter(groups=["HCL"])
    from collections import defaultdict
    max_vals = defaultdict(lambda: defaultdict(list))
    mean_vals = defaultdict(lambda: defaultdict(list))
    for case in hcls:
        print(case)
        gradient = model.calculate_saliency(val_seq, case, case.group, maximization=False)
        for i, (tube, markers) in enumerate(model.config["tubes"].items()):
            tgrad = gradient[i]
            for i, marker in enumerate(markers["channels"]):
                mgrad = tgrad[:, :, i]
                gmax = np.max(mgrad)
                max_vals[tube][marker].append(gmax)
                gmean = np.mean(mgrad)
                mean_vals[tube][marker].append(gmean)
    max_markers = defaultdict(list)
    for tube, markers in model.config["tubes"].items():
        for marker in markers["channels"]:
            print("Max", tube, marker, np.mean(max_vals[tube][marker]))
            print("Mean", tube, marker, np.mean(mean_vals[tube][marker]))
            max_markers[tube].append((marker, np.mean(max_vals[tube][marker])))
Example #27
            (
                fcs_data[tube],
                flowsom_data[tube],
                flowcat_data[tube],
            ),
            (
                {"s": 1, "marker": ".", "color": "grey", "label": "fcs"},
                {"s": 8, "marker": ".", "color": "blue", "label": "flowCat", "alpha": 0.5},
                {"s": 8, "marker": ".", "color": "red", "label": "flowSOM", "alpha": 0.5},
            ),
            tube,
            output / name)


cases = io_functions.load_case_collection(
    utils.URLPath("output/4-flowsom-cmp/samples"),
    utils.URLPath("output/4-flowsom-cmp/samples/samples.json"))

# Compare flowsom results with flowcat results. Do keep in mind that they are
# scaled differently

# flowsom_path = utils.URLPath("output/4-flowsom-cmp/flowsom-samples")
# flowcat_path = utils.URLPath("output/4-flowsom-cmp/flowcat-denovo")
# flowcat_ref_path = flowcat.utils.URLPath("output/4-flowsom-cmp/flowcat-refsom")

output = utils.URLPath("output/4-flowsom-cmp/figures-refit")
tubes = ("1", "2", "3")
groups = set(cases.groups)

from collections import defaultdict
Example #28
def main(data: utils.URLPath, meta: utils.URLPath, reference: utils.URLPath,
         model: utils.URLPath):
    data, meta, soms, model = map(utils.URLPath, [
        "/data/flowcat-data/mll-flowdata/decCLL-9F",
        "output/0-final-dataset/train.json.gz",
        "output/som-fix-test/soms-test/som_r4_1",
        "output/0-final/classifier-minmax-new",
    ])
    dataset = io_functions.load_case_collection(data, meta)
    soms = som_dataset.SOMDataset.from_path(soms)
    model = SaliencySOMClassifier.load(model)
    val_dataset = model.get_validation_data(dataset)
    val_seq = model.create_sequence(soms)

    # printing out weights and biases, unsure whether they actually contain
    # information
    # in theory we could extend that to attempt to describe them as gates
    tube = "3"
    weights, biases = model.model.layers[int(tube) + 2].get_weights()
    for j, chname in enumerate(model.config["tubes"][tube]["channels"]):
        ch_mean_weight = np.mean(weights[:, :, j, :])
        print(j, chname, ch_mean_weight)

    for i in range(weights.shape[-1]):
        mean_weight = np.mean(weights[:, :, :, i])
        print(i, mean_weight, biases[i])
        for j, chname in enumerate(model.config["tubes"]["1"]["channels"]):
            print(i, j, chname)
            print(weights[:, :, j, i])

    # zero out specific columns and see how that impacts performance
    output = utils.URLPath("output/0-final/model-analysis/occlusion")
    for group in model.config["groups"]:
        print(group)
        sel_cases = val_dataset.filter(groups=[group])
        avg_results = model.channel_occlusion(sel_cases, val_seq)
        print(sorted(avg_results, key=lambda t: t[2], reverse=True))
        io_functions.save_json(avg_results, output / f"{group}_avg_std.json")

    # case_som = soms.get_labels([case.id]).iloc[0]
    hcls = val_dataset.filter(groups=["HCL"])
    from collections import defaultdict
    max_vals = defaultdict(lambda: defaultdict(list))
    mean_vals = defaultdict(lambda: defaultdict(list))
    for case in hcls:
        print(case)
        gradient = model.calculate_saliency(val_seq,
                                            case,
                                            case.group,
                                            maximization=False)
        for i, (tube, markers) in enumerate(model.config["tubes"].items()):
            tgrad = gradient[i]
            for i, marker in enumerate(markers["channels"]):
                mgrad = tgrad[:, :, i]
                gmax = np.max(mgrad)
                max_vals[tube][marker].append(gmax)
                gmean = np.mean(mgrad)
                mean_vals[tube][marker].append(gmean)
    max_markers = defaultdict(list)
    for tube, markers in model.config["tubes"].items():
        for marker in markers["channels"]:
            print("Max", tube, marker, np.mean(max_vals[tube][marker]))
            print("Mean", tube, marker, np.mean(mean_vals[tube][marker]))
            max_markers[tube].append((marker, np.mean(max_vals[tube][marker])))

    for tube in model.config["tubes"]:
        print("Tube", tube)
        print("\n".join(": ".join((t[0], str(t[1]))) for t in sorted(
            max_markers[tube], key=lambda t: t[1], reverse=True)))

    c_model = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/model_0.h5"
    c_labels = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/test_labels.json"
    c_preds = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/predictions_0.csv"
    c_config = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/config.json"
    c_cases = MLLDATA / "mll-flowdata/CLL-9F"
    c_sommaps = MLLDATA / "mll-sommaps/sample_maps/selected1_toroid_s32"
    c_misclass = MLLDATA / "mll-sommaps/misclassifications/"
    c_tube = [1, 2]

    # load datasets
    somdataset = sd.SOMDataset.from_path(c_sommaps)
    cases = cc.CaseCollection.from_path(c_cases, how="case_info.json")

    # filter datasets
    test_labels = flowutils.load_json(c_labels)

    filtered_cases = cases.filter(labels=test_labels)
    somdataset.data[1] = somdataset.data[1].loc[test_labels, :]

    # get mapping
    config = flowutils.load_json(c_config)
    groupinfo = mappings.GROUP_MAPS[config["c_groupmap"]]

    dataset = cd.CombinedDataset(filtered_cases, {
        dd.Dataset.from_str('SOM'): somdataset,
        dd.Dataset.from_str('FCS'): filtered_cases
    },
                                 group_names=groupinfo['groups'])

    # modify mapping
    dataset.set_mapping(groupinfo)

    xoutputs = [
        loaders.loader_builder(
            loaders.Map2DLoader.create_inferred,
            tube=1,
            sel_count="counts",
            pad_width=1,
        ),
        loaders.loader_builder(
            loaders.Map2DLoader.create_inferred,
            tube=2,
            sel_count="counts",
            pad_width=1,
        )
    ]

    dataset = loaders.DatasetSequence.from_data(dataset,
                                                xoutputs,
                                                batch_size=1,
                                                draw_method="sequential")

    predictions = pd.read_csv(c_preds, index_col=0)

    predictions = add_correct_magnitude(predictions)
    predictions = add_infiltration(predictions, cases)

    misclass_labels = ['507777582649cbed8dfb3fe552a6f34f8b6c28e3']

    for label in misclass_labels:
        label_path = pathlib.Path(f"{c_misclass}/{label}")
        if not label_path.exists():
            label_path.mkdir()

        case = cases.get_label(label)

        # get the actual and the predicted class
        corr_group = predictions.loc[case.id, "correct"]
        pred_group = predictions.loc[case.id, "pred"]
        classes = [corr_group, pred_group]

        gradients = plotting.calc_saliency(dataset,
                                           case,
                                           c_model,
                                           classes=classes)

        for tube in c_tube:

            heatmaps = plotting.draw_saliency_heatmap(case, gradients, classes,
                                                      tube)
            for idx, heatmap in enumerate(heatmaps):
                plotting.save_figure(
                    heatmap,
                    f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_saliency_heatmap.png"
                )

            scatterplots = plotting.plot_tube(case,
                                              tube,
                                              gradients[tube - 1],
                                              classes=classes,
                                              sommappath=c_sommaps)
            for idx, scatterplot in enumerate(scatterplots):
                plotting.save_figure(
                    scatterplot,
                    f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_scatterplots.png"
                )