Example 1
def create_roc_results(trues, preds, output, model):
    """Create ROC and AUC metrics and save them to the given directory."""
    output.mkdir()
    curves = {}
    auc = {}
    for i, group in enumerate(model.config["groups"]):
        curves[group] = metrics.roc_curve(trues[:, i], preds[:, i])
        auc[group] = metrics.roc_auc_score(trues[:, i], preds[:, i])

    macro_auc = metrics.roc_auc_score(trues, preds, average="macro")
    micro_auc = metrics.roc_auc_score(trues, preds, average="micro")
    io_functions.save_json(
        {
            "one-vs-rest": auc,
            "macro": macro_auc,
            "micro": micro_auc,
        }, output / "auc.json")

    fig, ax = plt.subplots()
    for name, curve in curves.items():
        ax.plot(curve[0], curve[1], label=name)

    ax.plot((0, 1), (0, 1), "k--")
    ax.legend()
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.set_title("ROC one-vs-rest")

    fig.tight_layout()
    fig.savefig(str(output / "roc.png"), dpi=300)
    plt.close(fig)
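
Note: create_roc_results assumes trues and preds are already label-binarized, with one column per entry in model.config["groups"]. As a minimal, self-contained sketch of the same one-vs-rest pattern (all names below are synthetic stand-ins, not part of the original code):

import numpy as np
from sklearn import metrics

rng = np.random.default_rng(0)
groups = ["CLL", "MCL", "normal"]          # stand-in for model.config["groups"]
trues = np.eye(3)[rng.integers(0, 3, 50)]  # one-hot true labels, shape (50, 3)
preds = rng.random((50, 3))
preds /= preds.sum(axis=1, keepdims=True)  # rows sum to 1, like softmax output

for i, group in enumerate(groups):
    fpr, tpr, _ = metrics.roc_curve(trues[:, i], preds[:, i])
    print(group, metrics.roc_auc_score(trues[:, i], preds[:, i]))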
Example 2
def transform_data(dataset, model, output):
    output.mkdir()

    casesamples = defaultdict(list)
    for case, somsample in utils.time_generator_logger(
            model.transform_generator(dataset)):
        sompath = output / f"{case.id}_t{somsample.tube}.npy"
        io_functions.save_som(somsample.data, sompath, save_config=False)
        somsample.data = None
        somsample.path = sompath
        casesamples[case.id].append(somsample)

    somcases = []
    for case in dataset:
        somcases.append(case.copy(samples=casesamples[case.id]))

    somcollection = case_dataset.CaseCollection(somcases)
    io_functions.save_json(somcollection, output + ".json")
    io_functions.save_json(
        {
            tube: {
                "dims": m.model.dims,
                "channels": m.model.markers,
            }
            for tube, m in model.models.items()
        }, output + "_config.json")
Example 3
def run_transfer(options, train_dataset, validate_dataset):
    config = options["config"]

    base_model = models.load_model(options["base_model_path"])

    tl_model = create_tl_model(base_model, config)

    model = SOMClassifier(config, tl_model)
    train = model.create_sequence(train_dataset, config.train_batch_size)

    if validate_dataset is not None:
        validate = model.create_sequence(validate_dataset, config.valid_batch_size)
    else:
        validate = None

    model.train_generator(train, validate, epochs=config.train_epochs, class_weight=None)

    output = utils.URLPath(options["output_path"])

    if validate:
        pred_arr, pred_labels = model.predict_generator(validate)
        true_labels = validate.true_labels
        pred_df = pd.DataFrame(pred_arr, columns=validate.binarizer.classes_, index=validate.dataset.labels)
        io_functions.save_csv(pred_df, output / "preds.csv")
        io_functions.save_json({"true": list(true_labels), "pred": list(pred_labels)}, output / "preds_labels.json")
        generate_all_metrics(true_labels, pred_labels, config.mapping, output)

    model.save(output)
    model.save_information(output)

    keras.backend.clear_session()
    del model
Example 4
def create_roc_results(trues, preds, output, model):
    """Create ROC and AUC metrics and save them to the given directory."""
    output.mkdir()
    curves = {}
    groups = model.config.groups
    auc = {}

    try:
        for i, group in enumerate(groups):
            curves[group] = metrics.roc_curve(trues[:, i], preds[:, i])
            auc[group] = metrics.roc_auc_score(trues[:, i], preds[:, i])

        macro_auc = metrics.roc_auc_score(trues, preds, average="macro")
        micro_auc = metrics.roc_auc_score(trues, preds, average="micro")
        io_functions.save_json(
            {
                "one-vs-rest": auc,
                "macro": macro_auc,
                "micro": micro_auc,
            },
            output / "auc.json")
    except ValueError:
        # roc_auc_score raises ValueError when a class never occurs in trues;
        # save zeroed scores so downstream consumers still find auc.json
        io_functions.save_json(
            {
                "one-vs-rest": 0,
                "macro": 0,
                "micro": 0,
            }, output / "auc.json")

    return auc, curves
Example 5
def transform_cases(dataset, model, output):
    """Create individidual SOMs for all cases in the dataset.
    Args:
        dataset: CaseIterable with a number of cases, for which SOMs should be
                 generated.
        model: Model with initial weights, which should be used for generation
               of SOMs.
        output: Output directory for SOMs

    Returns:
        Nothing.
    """
    output.mkdir()
    casesamples = defaultdict(list)
    for case, somsample in utils.time_generator_logger(model.transform_generator(dataset)):
        sompath = output / f"{case.id}_t{somsample.tube}.npy"
        io_functions.save_som(somsample.data, sompath, save_config=False)
        somsample.data = None
        somsample.path = sompath
        casesamples[case.id].append(somsample)

    somcases = []
    for case in dataset:
        somcases.append(case.copy(samples=casesamples[case.id]))

    somcollection = case_dataset.CaseCollection(somcases)
    io_functions.save_json(somcollection, output + ".json")

    labels = [{"label": case.id, "randnum": 0, "group": case.group} for case in dataset]
Example 6
def create_threshold_results(trues, preds, output, model):
    """Create threshold results from true and predicted."""
    # calculate accuracy for a certain certainty
    # how about w score above 0.95?
    output.mkdir()
    threshold_results = []
    for threshold in np.arange(0.25, 1.0, 0.05):
        # flatnonzero keeps a 1-d index array even for a single match,
        # where argwhere(...).squeeze() would collapse to a 0-d scalar
        index_above = np.flatnonzero(np.any(preds > threshold, axis=1))
        sel_preds = preds[index_above, :]
        sel_trues = trues[index_above, :]
        pred_labels = model.binarizer.inverse_transform(sel_preds)
        true_labels = model.binarizer.inverse_transform(sel_trues)
        included = len(index_above) / len(preds)
        acc = metrics.accuracy_score(true_labels, pred_labels)
        print(threshold, included, acc)
        threshold_results.append((threshold, included, acc))
    io_functions.save_json(threshold_results, output / "thresholds.json")

    tarr = np.array(threshold_results)
    fig, ax = plt.subplots()
    ax.plot(tarr[:, 0], tarr[:, 1], label="included")
    ax.plot(tarr[:, 0], tarr[:, 2], label="acc")
    ax.legend()
    ax.set_xlabel("Score threshold")
    ax.set_ylabel("Classification accuracy / Included cases ratio")
    fig.savefig(str(output / "threshold.png"), dpi=300)
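
To see the inclusion/accuracy trade-off mechanics in isolation, here is a hypothetical standalone sweep over synthetic scores (no binarizer or model involved):

import numpy as np

rng = np.random.default_rng(1)
preds = rng.random((100, 4))  # fake prediction scores for 4 groups
for threshold in np.arange(0.25, 1.0, 0.05):
    index_above = np.flatnonzero(np.any(preds > threshold, axis=1))
    included = len(index_above) / len(preds)
    print(f"threshold={threshold:.2f} included={included:.2f}")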
Example 7
    def save(self, path: utils.URLPath):
        """Save the given classifier model to the given path."""
        save_somclassifier_config(self.config, path / "config.json")
        self.model.save(str(path / "model.h5"))
        io_functions.save_joblib(self.binarizer, path / "binarizer.joblib")

        io_functions.save_json(self.data_ids["validation"],
                               path / "ids_validate.json")
        io_functions.save_json(self.data_ids["train"], path / "ids_train.json")
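
A matching load step is implied elsewhere (SOMClassifier.load is used in other examples). A rough sketch of what it has to read back, assuming the directory layout written by save above; the binarizer is loaded with plain joblib rather than guessing at io_functions' reader API:

import joblib
import keras

def load_sketch(path):
    """Read back the artifacts written by save(); the layout is assumed."""
    config = io_functions.load_json(path / "config.json")
    model = keras.models.load_model(str(path / "model.h5"))
    binarizer = joblib.load(str(path / "binarizer.joblib"))
    return config, model, binarizer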
Example 8
def create_roc_results(trues, preds, model, output):
    """Create ROC and AUC metrics and save them to the given directory."""
    groups = model.config.groups
    roc = fc_roc.calculate_roc(trues, preds, groups)
    io_functions.save_json(roc, output / "auc.json")

    fig, ax = plt.subplots()
    fc_roc.plot_roc_curves(ax, fc_roc.calculate_roc_curve(trues, preds, groups))

    fig.tight_layout()
    fig.savefig(str(output / "roc.png"), dpi=300)
Example 9
def generate_metrics(true_labels, pred_labels, groups, output):
    """Generate numeric metrics."""
    metrics_results = {
        "balanced": metrics.balanced_accuracy_score(true_labels, pred_labels),
        "f1_micro": metrics.f1_score(true_labels, pred_labels, average="micro"),
        "f1_macro": metrics.f1_score(true_labels, pred_labels, average="macro"),
        "mcc": metrics.matthews_corrcoef(true_labels, pred_labels),
    }
    print(metrics_results)
    io_functions.save_json(metrics_results, output / "validation_metrics.json")
    return metrics_results
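
For reference, the same metric calls on toy labels, using only sklearn (values invented for illustration):

from sklearn import metrics

true_labels = ["CLL", "MCL", "normal", "CLL", "normal"]
pred_labels = ["CLL", "MCL", "CLL", "CLL", "normal"]
print(metrics.balanced_accuracy_score(true_labels, pred_labels))
print(metrics.f1_score(true_labels, pred_labels, average="macro"))
print(metrics.matthews_corrcoef(true_labels, pred_labels))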
Example 10
def main():
    bonn_config = io_functions.load_json("output/00-dataset-test/bonn_config.json")
    munich_config = io_functions.load_json("output/00-dataset-test/train_config.json")

    selected = {}
    for tube, markers in bonn_config["selected_markers"].items():
        selected[tube] = []
        munich_tube = [remove_stem(m) for m in munich_config["selected_markers"][tube]]
        for marker in markers:
            marker_stem = remove_stem(marker)
            if marker_stem in munich_tube:
                selected[tube].append(marker_stem)
    print(selected)

    io_functions.save_json(selected, "output/00-dataset-test/munich_bonn_tubes.json")
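
remove_stem is not defined in this excerpt; from its use it presumably normalizes a marker name by dropping the fluorochrome suffix. A hypothetical version consistent with that reading:

def remove_stem(marker: str) -> str:
    """Hypothetical: keep only the antigen part of a marker name."""
    return marker.split("-")[0]  # e.g. "CD19-PECy7" -> "CD19"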
Example 11
def main(
    fcsdata: utils.URLPath,
    fcsmeta: utils.URLPath,
    somdata: utils.URLPath,
    output: utils.URLPath,
):

    fcs_dataset = io_functions.load_case_collection(fcsdata, fcsmeta)
    try:
        som_config = io_functions.load_json(somdata + "_config.json")
    except FileNotFoundError:
        som_config = None

    if som_config is None:
        selected_markers = fcs_dataset.selected_markers
    else:
        selected_markers = {t: d["channels"] for t, d in som_config.items()}

    tubes = ("1", "2", "3")

    model = quantization_error_model()
    sess = tf.Session()
    results = []
    for fcscase in fcs_dataset:
        print(fcscase)
        for tube in tubes:
            fcssample = fcscase.get_tube(tube, kind="fcs").get_data()
            somsample = get_som_data(fcscase.id, tube, somdata,
                                     selected_markers[tube])
            error = sample_quantization_error(fcssample, somsample, model,
                                              sess)
            results.append((fcscase.id, tube, error))

    stats = {}
    # divide by the number of results for each tube, not the total count
    counts = {t: sum(1 for r in results if r[1] == t) for t in tubes}
    stats["mean"] = {
        t: sum(r[-1] for r in results if r[1] == t) / counts[t]
        for t in tubes
    }
    stats["variance"] = {
        t: sum(
            np.power(r[-1] - stats["mean"][t], 2)
            for r in results if r[1] == t) / counts[t]
        for t in tubes
    }
    print("Mean quantization error", stats)

    io_functions.save_json(results, output / "quantization_error.json")
    io_functions.save_json(stats, output / "quantization_error_mean.json")
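
The statistics block reduces to a per-tube mean and variance over (case_id, tube, error) tuples; a toy recap with made-up numbers:

results = [("a", "1", 0.10), ("b", "1", 0.14), ("a", "2", 0.30)]
tubes = ("1", "2")
counts = {t: sum(1 for r in results if r[1] == t) for t in tubes}
mean = {t: sum(r[-1] for r in results if r[1] == t) / counts[t] for t in tubes}
variance = {
    t: sum((r[-1] - mean[t]) ** 2 for r in results if r[1] == t) / counts[t]
    for t in tubes
}
print(mean, variance)  # mean: {'1': 0.12, '2': 0.3}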
Example 12
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Split test and train dataset, remove duplicates and create a list of
    ids used for creating the reference SOM.

    Args:
        data: Path to fcs data.
        meta: Path to case metadata using case_info format.
        output: Path to output split dataset information.
    """
    cases = io_functions.load_case_collection_from_caseinfo(data, meta)
    train, test = preprocess_cases(cases)
    reference = filter_reference(train)
    output.mkdir()
    io_functions.save_case_collection(train, output / "train.json.gz")
    io_functions.save_case_collection(test, output / "test.json.gz")
    io_functions.save_json(reference.labels, output / "references.json")
Example 13
def predict(
    data: utils.URLPath,
    model: utils.URLPath,
    output: utils.URLPath,
    labels: utils.URLPath = None,
    metrics: bool = True,
):
    """Generate predictions and plots for a single case.

    Args:
        data: SOM dataset.
        model: Path to model containing CNN and SOMs.
        output: Destination for plotting.
        labels: List of case ids to be filtered for generating predictions.
        metrics: Whether to also compute metrics across group mappings.
    """
    print(f"Loaded cases from {data}")
    dataset = som_dataset.SOMDataset.from_path(data)
    if labels:
        labels = io_functions.load_json(labels)
        dataset = dataset.filter(labels=labels)

    model = classifier.SOMClassifier.load(model)
    data_sequence = model.create_sequence(dataset, 128)

    values, pred_labels = model.predict_generator(data_sequence)

    pred_json = {
        id: dict(zip(model.config.groups, value.tolist()))
        for id, value in zip(dataset.labels, values)
    }

    io_functions.save_json(pred_json, output / "prediction.json")

    if metrics:
        true_labels = data_sequence.true_labels
        map_config = [("unmapped", {
            "groups": model.config.groups,
            "map": {}
        }), *GROUP_MAPS.items()]
        for map_name, mapping in map_config:
            print(f"--- MAPPING: {map_name} ---")
            if len(mapping["groups"]) > len(model.config.groups):
                continue
            fc_predictions.generate_all_metrics(true_labels, pred_labels,
                                                mapping, output / map_name)
Example 14
def run_kfold(*, output_path, base_model_path, som_dataset_path, k_number=5, panel="MLL", rerun=False, stratified=False,):
    if not rerun and output_path.exists():
        LOGGER.info("Existing results exist at %s skipping", output_path)
        return

    args = locals()
    io_functions.save_json(args, output_path / "params.json")

    # set the groups according to the panel
    if panel == "MLL":
        groups = GROUPS
    elif panel == "ERLANGEN":
         groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    # tubes to be processed for merged samples
    tubes = ("1")

    mapping = {"groups": groups, "map": None}

    dataset = som_dataset.SOMDataset.from_path(som_dataset_path)
    LOGGER.info("Full dataset %s", dataset.group_count)

    splits = create_kfold_split(dataset, k_number=k_number, stratified=stratified)

    for n, (train_dataset, validate_dataset) in enumerate(splits):
        LOGGER.info(f"SPLIT n={n}")
        LOGGER.info("Train dataset %s", train_dataset.group_count)
        LOGGER.info("Validation dataset %s", validate_dataset.group_count)

        # change epochs to suit each dataset
        options = {
            "base_model_path": str(base_model_path / "model.h5"),
            "output_path": output_path / f"kfold_n{n}",
            "config": classifier.SOMClassifierConfig(**{
                "tubes": {tube: dataset.config[tube] for tube in tubes},
                "groups": groups,
                "pad_width": 2,
                "mapping": mapping,
                "cost_matrix": None,
                "train_epochs": 15,
            })
        }
        run_transfer(options, train_dataset, validate_dataset)
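
create_kfold_split itself is not shown here; a stratified variant could be built on sklearn, roughly as in this sketch (an assumption, not the original implementation):

from sklearn.model_selection import KFold, StratifiedKFold

def kfold_label_splits(labels, groups, k=5, stratified=True, seed=42):
    """Yield (train_labels, test_labels) pairs, optionally stratified by group."""
    splitter_cls = StratifiedKFold if stratified else KFold
    splitter = splitter_cls(n_splits=k, shuffle=True, random_state=seed)
    labels = list(labels)
    for train_idx, test_idx in splitter.split(labels, groups):
        yield [labels[i] for i in train_idx], [labels[i] for i in test_idx]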
Example 15
    def save_information(self, path: utils.URLPath):
        """Save additional plots and information."""
        # Text summary of model
        with (path / "model_summary.txt").open("w") as summary_file:

            def print_file(*args, **kwargs):
                print(*args, **kwargs, file=summary_file)

            self.model.summary(print_fn=print_file)

        # Image plotting structure of model
        keras.utils.plot_model(self.model,
                               to_file=str(path / "model_plot.png"))

        # plot all training history
        for i, (meta, history) in enumerate(self.training_history):
            training_output = path / f"train_{i}"
            io_functions.save_json(meta, training_output / "info.json")
            plot_training_history(history, training_output / "training.png")
Example 16
def run_denovo(options, train_dataset, validate_dataset):
    config = options["config"]

    model = train_som_classifier(train_dataset, validate_dataset, config)

    output = utils.URLPath(options["output_path"])

    if validate_dataset:
        validate = model.create_sequence(validate_dataset, config.valid_batch_size)

        pred_arr, pred_labels = model.predict_generator(validate)
        true_labels = validate.true_labels
        pred_df = pd.DataFrame(pred_arr, columns=validate.binarizer.classes_, index=validate.dataset.labels)
        io_functions.save_csv(pred_df, output / "preds.csv")
        io_functions.save_json({"true": list(true_labels), "pred": list(pred_labels)}, output / "preds_labels.json")
        generate_all_metrics(true_labels, pred_labels, config.mapping, output)

    model.save(output)
    model.save_information(output)

    keras.backend.clear_session()
    del model
Example 17
def transform_dataset_to_som(som_reference: CaseSom, dataset: "CaseCollection",
                             output: utils.URLPath):
    """Transform dataset into som dataste using the given reference SOM model.
    """
    print(f"Trainsforming individual samples")
    data_output = output / "data"
    meta_output = output / "meta.json.gz"
    config_output = output / "config.json"

    data_output.mkdir()

    casesamples = defaultdict(list)
    count_samples = len(dataset) * len(som_reference.models)
    countlen = len(str(count_samples))
    for i, (case, somsample) in enumerate(
            utils.time_generator_logger(
                som_reference.transform_generator(dataset))):
        sompath = data_output / f"{case.id}_t{somsample.tube}.npy"
        io_functions.save_som(somsample.data, sompath, save_config=False)
        somsample.data = None
        somsample.path = sompath.relative_to(data_output)
        casesamples[case.id].append(somsample)
        print(
            f"[{str(i + 1).rjust(countlen, ' ')}/{count_samples}] Created tube {somsample.tube} for {case.id}"
        )

    print(f"Saving result to new collection at {output}")
    som_dataset = case_dataset.CaseCollection(
        [case.copy(samples=casesamples[case.id]) for case in dataset],
        data_path=data_output)
    som_dataset.selected_markers = {
        m.tube: m.model.markers
        for m in som_reference.models.values()
    }
    io_functions.save_case_collection(som_dataset, meta_output)
    io_functions.save_json(som_reference.som_config, config_output)
    return som_dataset
Example 18
OUTPUT = utils.URLPath(f"output/{NAME}")
LOGDIR = utils.URLPath(f"logs/{NAME}_{utils.create_stamp()}")
INPUT = {
    "data": utils.URLPath("output/ungated/data"),
    "meta": utils.URLPath("output/samples/meta.json.gz"),
}

LOGGER = utils.setup_logging(LOGDIR, NAME)

set_seed(SEED)
dataset = io_functions.load_case_collection(INPUT["data"], INPUT["meta"])

check_dataset(dataset)

train, test = dataset.create_split(0.9)
io_functions.save_json(train.labels, OUTPUT / "train_ids.json")
io_functions.save_json(test.labels, OUTPUT / "test_ids.json")

reference = train.sample(1)
LOGGER.info("Reference dataset: %s", reference)
LOGGER.info("Reference labels: %s", reference.labels)

model = flowcat.FlowCat()
args = constants.DEFAULT_TRAIN_ARGS
args["classifier"]["balance"] = None
args["classifier"]["split_ratio"] = 1.0
args["classifier"]["config"].tubes = ["1", "2"]
args["classifier"]["config"].train_epochs = 150
som_train, som_test = model.train(train,
                                  reference,
                                  OUTPUT,
Example 19
def main(data: utils.URLPath, output: utils.URLPath, model_name: str, modelargs: json.loads, epochs: int = 30):
    """
    Args:
        data: Path to som dataset
        output: Output path
    """
    tubes = ("1", "2", "3")
    pad_width = 0

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}

    if set(groups) != dataset_groups:
        raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    # train = train.balance(20)
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": None,
    }
    io_functions.save_json(config, output / "config.json")

    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # binarizer, model = get_model(selected_tubes, groups=groups, n_neighbors=1)
    binarizer, model = get_model(selected_tubes, groups=groups, model_name="RandomForest", **modelargs)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train,
        binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate,
        binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    xdata, ydata = sequence_to_array(trainseq)

    model.fit(xdata, ydata)

    xtest, ytest = sequence_to_array(validseq)
    pred_arr = model.predict(xtest)

    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}}, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
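
sequence_to_array is used above but not defined in this excerpt; for a keras-style Sequence yielding (x_batch, y_batch) pairs, a flattening helper suitable for sklearn models could look like this sketch:

import numpy as np

def sequence_to_array_sketch(seq):
    """Flatten a Sequence of batches into 2-d arrays for sklearn (assumed shapes)."""
    xs, ys = [], []
    for i in range(len(seq)):
        x_batch, y_batch = seq[i]
        # multi-tube inputs arrive as a list of arrays; flatten each per sample
        if isinstance(x_batch, list):
            x_batch = np.concatenate(
                [b.reshape(len(b), -1) for b in x_batch], axis=1)
        else:
            x_batch = x_batch.reshape(len(x_batch), -1)
        xs.append(x_batch)
        ys.append(y_batch)
    return np.concatenate(xs), np.concatenate(ys)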
Example 20
def main(data: utils.URLPath, meta: utils.URLPath, reference: utils.URLPath,
         model: utils.URLPath):
    data, meta, soms, model = map(utils.URLPath, [
        "/data/flowcat-data/mll-flowdata/decCLL-9F",
        "output/0-final-dataset/train.json.gz",
        "output/som-fix-test/soms-test/som_r4_1",
        "output/0-final/classifier-minmax-new",
    ])
    dataset = io_functions.load_case_collection(data, meta)
    soms = som_dataset.SOMDataset.from_path(soms)
    model = SaliencySOMClassifier.load(model)
    val_dataset = model.get_validation_data(dataset)
    val_seq = model.create_sequence(soms)

    # print out weights and biases; unsure whether they actually contain
    # information. In theory we could extend this to describe them as gates.
    tube = "3"
    weights, biases = model.model.layers[int(tube) + 2].get_weights()
    for j, chname in enumerate(model.config["tubes"][tube]["channels"]):
        ch_mean_weight = np.mean(weights[:, :, j, :])
        print(j, chname, ch_mean_weight)

    for i in range(weights.shape[-1]):
        mean_weight = np.mean(weights[:, :, :, i])
        print(i, mean_weight, biases[i])
        for j, chname in enumerate(model.config["tubes"]["1"]["channels"]):
            print(i, j, chname)
            print(weights[:, :, j, i])

    # zero out specific columns and see how that impacts performance
    output = utils.URLPath("output/0-final/model-analysis/occlusion")
    for group in model.config["groups"]:
        print(group)
        sel_cases = val_dataset.filter(groups=[group])
        avg_results = model.channel_occlusion(sel_cases, val_seq)
        print(sorted(avg_results, key=lambda t: t[2], reverse=True))
        io_functions.save_json(avg_results, output / f"{group}_avg_std.json")

    # case_som = soms.get_labels([case.id]).iloc[0]
    hcls = val_dataset.filter(groups=["HCL"])
    from collections import defaultdict
    max_vals = defaultdict(lambda: defaultdict(list))
    mean_vals = defaultdict(lambda: defaultdict(list))
    for case in hcls:
        print(case)
        gradient = model.calculate_saliency(val_seq,
                                            case,
                                            case.group,
                                            maximization=False)
        for i, (tube, markers) in enumerate(model.config["tubes"].items()):
            tgrad = gradient[i]
            for j, marker in enumerate(markers["channels"]):
                mgrad = tgrad[:, :, j]
                gmax = np.max(mgrad)
                max_vals[tube][marker].append(gmax)
                gmean = np.mean(mgrad)
                mean_vals[tube][marker].append(gmean)
    max_markers = defaultdict(list)
    for tube, markers in model.config["tubes"].items():
        for marker in markers["channels"]:
            print("Max", tube, marker, np.mean(max_vals[tube][marker]))
            print("Mean", tube, marker, np.mean(mean_vals[tube][marker]))
            max_markers[tube].append((marker, np.mean(max_vals[tube][marker])))

    for tube in model.config["tubes"]:
        print("Tube", tube)
        print("\n".join(": ".join((t[0], str(t[1]))) for t in sorted(
            max_markers[tube], key=lambda t: t[1], reverse=True)))

    c_model = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/model_0.h5"
    c_labels = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/test_labels.json"
    c_preds = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/predictions_0.csv"
    c_config = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/config.json"
    c_cases = MLLDATA / "mll-flowdata/CLL-9F"
    c_sommaps = MLLDATA / "mll-sommaps/sample_maps/selected1_toroid_s32"
    c_misclass = MLLDATA / "mll-sommaps/misclassifications/"
    c_tube = [1, 2]

    # load datasets
    somdataset = sd.SOMDataset.from_path(c_sommaps)
    cases = cc.CaseCollection.from_path(c_cases, how="case_info.json")

    # filter datasets
    test_labels = flowutils.load_json(c_labels)

    filtered_cases = cases.filter(labels=test_labels)
    somdataset.data[1] = somdataset.data[1].loc[test_labels, :]

    # get mapping
    config = flowutils.load_json(c_config)
    groupinfo = mappings.GROUP_MAPS[config["c_groupmap"]]

    dataset = cd.CombinedDataset(filtered_cases, {
        dd.Dataset.from_str('SOM'): somdataset,
        dd.Dataset.from_str('FCS'): filtered_cases
    },
                                 group_names=groupinfo['groups'])

    # modify mapping
    dataset.set_mapping(groupinfo)

    xoutputs = [
        loaders.loader_builder(
            loaders.Map2DLoader.create_inferred,
            tube=1,
            sel_count="counts",
            pad_width=1,
        ),
        loaders.loader_builder(
            loaders.Map2DLoader.create_inferred,
            tube=2,
            sel_count="counts",
            pad_width=1,
        )
    ]

    dataset = loaders.DatasetSequence.from_data(dataset,
                                                xoutputs,
                                                batch_size=1,
                                                draw_method="sequential")

    predictions = pd.read_csv(c_preds, index_col=0)

    predictions = add_correct_magnitude(predictions)
    predictions = add_infiltration(predictions, cases)

    misclass_labels = ['507777582649cbed8dfb3fe552a6f34f8b6c28e3']

    for label in misclass_labels:
        label_path = pathlib.Path(f"{c_misclass}/{label}")
        if not label_path.exists():
            label_path.mkdir()

        case = cases.get_label(label)

        # get the actual and the predicted class
        corr_group = predictions.loc[case.id, "correct"]
        pred_group = predictions.loc[case.id, "pred"]
        classes = [corr_group, pred_group]

        gradients = plotting.calc_saliency(dataset,
                                           case,
                                           c_model,
                                           classes=classes)

        for tube in c_tube:

            heatmaps = plotting.draw_saliency_heatmap(case, gradients, classes,
                                                      tube)
            for idx, heatmap in enumerate(heatmaps):
                plotting.save_figure(
                    heatmap,
                    f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_saliency_heatmap.png"
                )

            scatterplots = plotting.plot_tube(case,
                                              tube,
                                              gradients[tube - 1],
                                              classes=classes,
                                              sommappath=c_sommaps)
            for idx, scatterplot in enumerate(scatterplots):
                plotting.save_figure(
                    scatterplot,
                    f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_scatterplots.png"
                )
Example 21
print(metadata[0])

data = [meta_to_case(meta, fcs_data_path) for meta in metadata]

dataset = CaseCollection(data, data_path=fcs_data_path)

outpath = URLPath("output/5-berlin-data-test/dataset")

save_case_collection(dataset, outpath / "casecollection.json")

# only use groups we already have for now
group_dataset, reasons = dataset.filter_reasons(groups=GROUPS)
save_case_collection(group_dataset, outpath / "valid_groups.json")

invalid_labels = [l for l, _ in reasons]
invalid_dataset, _ = dataset.filter_reasons(labels=invalid_labels)
save_case_collection(invalid_dataset, outpath / "invalid_groups.json")

selected = flowcat.marker_selection.get_selected_markers(group_dataset,
                                                         ("1", "2", "3", "4"),
                                                         marker_threshold=0.9)
save_case_collection(selected, outpath / "known_groups.json")

references = selected.sample(num=1)
save_json(references, outpath / "references.json")

selected_invalid, _ = invalid_dataset.filter_reasons(
    selected_markers=selected.selected_markers)
save_case_collection(selected_invalid, outpath / "unknown_groups.json")
Example 22
    somcases = []
    for case in dataset:
        somcases.append(case.copy(samples=casesamples[case.id]))

    somcollection = case_dataset.CaseCollection(somcases)
    io_functions.save_json(somcollection, output + ".json")

    labels = [{"label": case.id, "randnum": 0, "group": case.group} for case in dataset]
    # Save metadata into an additional csv file with the same name
    metadata = pd.DataFrame(labels)
    io_functions.save_csv(metadata, output + ".csv")
    io_functions.save_json(
        {
            tube: {
                "dims": m.model.dims,
                "channels": m.model.markers,
            } for tube, m in model.models.items()
        }, output + "_config.json")


def main(args):
    """Load a model with given transforming arguments and transform individual
    cases."""
    cases = io_functions.load_case_collection(args.data, args.meta)
    # cases = cases.sample(1, groups=["CLL", "normal"])
    selected_markers = cases.selected_markers
    marker_name_only = False

    if args.tensorboard:
        tensorboard_dir = args.output / "tensorboard"
Example 23
def main(args):
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output

    groups = ["MCL", "PL"]
    tubes = ("1")
    mapping = None
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube]
                      for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
        })
    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)

    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    binarizer, model = load_model(args.model)

    trainseq = som_dataset.SOMSequence(train_dataset,
                                       binarizer,
                                       tube=config.tubes,
                                       pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(validate_dataset,
                                       binarizer,
                                       tube=config.tubes,
                                       pad_width=config.pad_width)

    model.fit_generator(generator=trainseq,
                        epochs=10,
                        validation_data=validseq)

    args.output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(args.output / "model.h5"))

    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels,
                           OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")
Example 24
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """
    Args:
        data: Path to fcs dataset data
        meta: Path to fcs dataset metainformation
        output: Output path
    """
    tubes = ("1", "2")
    sample_size = 512
    # group_mapping = mappings.GROUP_MAPS["6class"]
    # mapping = group_mapping["map"]
    mapping = None
    groups = mappings.GROUPS
    # groups = group_mapping["groups"]

    dataset = io_functions.load_case_collection(data, meta)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    validate, train = dataset.create_split(50)
    print(train.group_count)
    # train = train.balance(1000).shuffle()
    train = train.sample(100).shuffle()
    print(train.group_count)

    group_count = train.group_count
    group_weights = classification_utils.calculate_group_weights(group_count)
    group_weights = {
        i: group_weights.get(g, 1.0)
        for i, g in enumerate(groups)
    }

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    train_seq = FCSSequence(train,
                            binarizer,
                            tubes=tubes,
                            sample_size=sample_size,
                            batch_size=64)
    validate_seq = FCSSequence(validate,
                               binarizer,
                               tubes=tubes,
                               sample_size=sample_size,
                               batch_size=128)

    config = {
        "tubes": tubes,
        "groups": groups,
    }
    io_functions.save_json(config, output / "config.json")

    # for tube in tubes:
    #     x, y, z = selected_tubes[tube]["dims"]
    #     selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 2,
        ("MBL", "normal"): 2,
        ("MCL", "normal"): 2,
        ("PL", "normal"): 2,
        ("LPL", "normal"): 2,
        ("MZL", "normal"): 2,
        ("FL", "normal"): 2,
        ("HCL", "normal"): 2,
    }
    cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)

    model = create_fcs_model(train_seq.xshape,
                             train_seq.yshape,
                             global_decay=5e-5)
    model.compile(
        # loss="categorical_crossentropy",
        # loss=keras.losses.CategoricalCrossentropy(),
        loss=classification_utils.WeightedCategoricalCrossentropy(cost_matrix),
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            "acc",
            # keras.metrics.CategoricalAccuracy(),
            # keras.metrics.TopKCategoricalAccuracy(k=2),
            # top2_acc,
        ])
    model.summary()

    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=20,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=train_seq,
        validation_data=validate_seq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    preds = []
    for pred in model.predict_generator(validate_seq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validate_seq.true_labels

    generate_all_metrics(true_labels, pred_labels, {
        "groups": groups,
        "map": {}
    }, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
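
classification_utils.build_cost_matrix is not included here; a minimal sketch consistent with its usage above, (true, pred) keys and a default weight of 1.0 everywhere else, might be:

import numpy as np

def build_cost_matrix_sketch(cost_mapping, groups):
    """Rows index the true group, columns the predicted group (assumed)."""
    matrix = np.ones((len(groups), len(groups)))
    for (true_group, pred_group), weight in cost_mapping.items():
        if true_group in groups and pred_group in groups:
            matrix[groups.index(true_group), groups.index(pred_group)] = weight
    return matrix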
Example 25
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """
    Args:
        data: Path to som dataset
        output: Output path
    """
    tubes = ("2", "3", "4")
    pad_width = 1

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=[g for g in groups if g not in ("LPL", "MZL")])

    dataset_groups = {d.group for d in dataset}

    # if set(groups) != dataset_groups:
    #     raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    validate, train = dataset.create_split(10, stratify=True)

    group_count = train.group_count
    num_cases = sum(group_count.values())
    balanced_nums = num_cases / len(dataset_groups)
    balanced_loss_weights = [balanced_nums / group_count.get(g, balanced_nums) for g in groups]
    min_ratio = min(balanced_loss_weights)
    balanced_loss_weights = {i: v / min_ratio for i, v in enumerate(balanced_loss_weights)}
    print(balanced_loss_weights)

    # train = train.balance(2000)
    # train = train.balance_per_group({
    #     "CM": 6000,
    #     # "CLL": 4000,
    #     # "MBL": 2000,
    #     "MCL": 1000,
    #     "PL": 1000,
    #     "LPL": 1000,
    #     "MZL": 1000,
    #     "FL": 1000,
    #     "HCL": 1000,
    #     "normal": 6000,
    # })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
    }
    io_functions.save_json(config, output / "config.json")

    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-7)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=15, shuffle=True,
        callbacks=[tensorboard_callback, nan_callback],
        class_weight=balanced_loss_weights,
        generator=trainseq, validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    confusion = metrics.confusion_matrix(true_labels, pred_labels, labels=groups)
    print(groups)
    print(confusion)
    balanced = metrics.balanced_accuracy_score(true_labels, pred_labels)
    print(balanced)
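
The balanced-loss-weight block above weights classes by inverse frequency and then rescales so the most frequent class gets weight 1.0. A toy recap with invented counts:

group_count = {"CLL": 60, "MCL": 20, "normal": 120}
groups = ["CLL", "MCL", "normal"]
num_cases = sum(group_count.values())         # 200
balanced_nums = num_cases / len(group_count)  # ~66.7 cases per class
weights = [balanced_nums / group_count.get(g, balanced_nums) for g in groups]
min_ratio = min(weights)
weights = {i: w / min_ratio for i, w in enumerate(weights)}
print(weights)  # {0: 2.0, 1: 6.0, 2: 1.0}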
Example 26
def save_somclassifier_config(config: "SOMClassifierConfig",
                              path: utils.URLPath):
    """Save configuration to the given path."""
    io_functions.save_json(config.to_json(), path)
Example 27
def main(data: utils.URLPath,
         meta: utils.URLPath,
         output: utils.URLPath,
         epochs: int = 30):
    """
    Args:
        data: Path to som dataset
        output: Output path
    """
    tubes = ("1", "2", "3")
    pad_width = 2

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}

    if set(groups) != dataset_groups:
        raise RuntimeError(
            f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    group_weights = None
    # group_count = train.group_count
    # group_weights = classification_utils.calculate_group_weights(group_count)
    # group_weights = {
    #     i: group_weights.get(g, 1.0) for i, g in enumerate(groups)
    # }

    # train = train.balance(2000)
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # always (true, pred)
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 8,
        ("MBL", "normal"): 8,
        ("MCL", "normal"): 8,
        ("PL", "normal"): 8,
        ("LPL", "normal"): 8,
        ("MZL", "normal"): 8,
        ("FL", "normal"): 8,
        ("HCL", "normal"): 8,
    }
    if mapping:
        cost_mapping = {(mapping.get(a, a), mapping.get(b, b)): v
                        for (a, b), v in cost_mapping.items()}
    # cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)
    # np.save(str(output / "cost_matrix.npy"), cost_matrix)
    cost_matrix = None

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": "cost_matrix.npy" if cost_matrix is not None else None,
    }
    io_functions.save_json(config, output / "config.json")

    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width,
                                        z)

    binarizer, model = get_model(selected_tubes,
                                 groups=groups,
                                 global_decay=5e-5)

    if cost_matrix is not None:
        loss = classification_utils.WeightedCategoricalCrossentropy(
            cost_matrix)
    else:
        loss = "categorical_crossentropy"

    model.compile(
        loss=loss,
        # loss="categorical_crossentropy",
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            keras.metrics.CategoricalAccuracy(),
        ])
    with (output / "model_summary.txt").open("w") as summary_file:

        def print_file(*args, **kwargs):
            print(*args, **kwargs, file=summary_file)

        model.summary(print_fn=print_file)

    keras.utils.plot_model(model, to_file=str(output / "model_plot.png"))

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(train,
                           binarizer,
                           tube=tubes,
                           get_array_fun=getter_fun,
                           batch_size=32,
                           pad_width=pad_width)
    validseq = SOMSequence(validate,
                           binarizer,
                           tube=tubes,
                           get_array_fun=getter_fun,
                           batch_size=128,
                           pad_width=pad_width)

    # tensorboard_dir = str(output / "tensorboard")
    # tensorboard_callback = keras.callbacks.TensorBoard(
    #     log_dir=str(tensorboard_dir),
    #     histogram_freq=5,
    #     write_grads=True,
    #     write_images=True,
    # )
    nan_callback = keras.callbacks.TerminateOnNaN()

    history = model.fit_generator(
        epochs=epochs,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(true_labels, pred_labels, {
        "groups": groups,
        "map": {}
    }, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)

    plot_training_history(history, output / "training.png")
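
classification_utils.WeightedCategoricalCrossentropy is referenced but its implementation is not shown. One common construction, which may or may not match flowcat's, scales the usual cross-entropy by the expected misclassification cost taken from the (true, pred) cost matrix:

import tensorflow as tf

def weighted_cce_sketch(cost_matrix):
    """Sketch of a cost-matrix-weighted categorical crossentropy loss."""
    cost = tf.constant(cost_matrix, dtype=tf.float32)

    def loss(y_true, y_pred):
        # cost row of the true class, averaged over the predicted distribution
        sample_cost = tf.reduce_sum(tf.matmul(y_true, cost) * y_pred, axis=-1)
        cce = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
        return sample_cost * cce

    return loss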
Example 28
    utils.URLPath(
        "/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/references.json"
    ))

OUTPUT = utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_SOM/MLL9F")

setup_logging(None, "generate ref SOM for merged FCS")

ref_dataset = dataset.filter(labels=references)

tensorboard_dir = None

# Discover channels in the given dataset
markers = get_tube_marker(ref_dataset)
# markers = read_sel_markers(sel_markers)
print(markers)

io_functions.save_json(markers, OUTPUT / "markers.json")

# set marker_name_only to True
ref_config = DEFAULT_REFERENCE_SOM_ARGS.copy()
ref_config.marker_name_only = True

model = fc_som.CaseSom(
    tubes=markers,
    tensorboard_dir=tensorboard_dir,
    modelargs=ref_config,
)
model.train(ref_dataset)
io_functions.save_casesom(model, OUTPUT / "sommodel")
Example 29
def main(args):
    MLL5F = som_dataset.SOMDataset.from_path(args.input)
    OUTPUT = args.output
    #val_labels = args.val
    #train_labels = args.train
    #labels = args.labels
    LOGGER = utils.logs.setup_logging(None, "classify")

    groups = ["MCL", "PL"]
    tubes = ("1")
    mapping = None
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    #vallabels = io_functions.load_json(val_labels)
    #validate_dataset = MLL5F.filter(labels=vallabels)

    #labels = io_functions.load_json(train_labels)
    #train_dataset = MLL5F.filter(labels=labels)

    #labels = io_functions.load_json(labels)
    #train_dataset = MLL5F.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        MLL5F,
        split_ratio=0.90,
        groups=groups,
        mapping=mapping,
        balance=None)  # , val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    config = classifier.SOMClassifierConfig(**{"tubes": {tube: MLL5F.config[tube] for tube in tubes},
                                               "groups": groups,
                                               "pad_width": 2,
                                               "mapping": mapping,
                                               "cost_matrix": None,
                                               })

    model = create_model(config.inputs, 1, global_decay=5e-3)

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[
            "acc",
        ]
    )

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    trainseq = som_dataset.SOMSequence(train_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(validate_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(generator=trainseq, validation_data=validseq,
                                epochs=20, shuffle=True, class_weight=None)

    args.output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(args.output / "model.h5"))

    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels, OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")