def create_roc_results(trues, preds, output, model):
    """Create ROC and AUC metrics and save them to the given directory.

    Writes ``auc.json`` (per-group one-vs-rest, macro and micro AUC) and
    ``roc.png`` (one-vs-rest ROC curves) into the newly created directory.
    """
    output.mkdir()
    groups = model.config["groups"]
    # One entry per group; each group corresponds to one prediction column.
    curves = {
        group: metrics.roc_curve(trues[:, i], preds[:, i])
        for i, group in enumerate(groups)
    }
    auc = {
        group: metrics.roc_auc_score(trues[:, i], preds[:, i])
        for i, group in enumerate(groups)
    }
    macro_auc = metrics.roc_auc_score(trues, preds, average="macro")
    micro_auc = metrics.roc_auc_score(trues, preds, average="micro")
    io_functions.save_json(
        {
            "one-vs-rest": auc,
            "macro": macro_auc,
            "micro": micro_auc,
        }, output / "auc.json")

    fig, ax = plt.subplots()
    for name, (fpr, tpr, _thresholds) in curves.items():
        ax.plot(fpr, tpr, label=name)
    # Chance-level diagonal for reference.
    ax.plot((0, 1), (0, 1), "k--")
    ax.legend()
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.set_title("ROC one-vs-rest")
    fig.tight_layout()
    fig.savefig(str(output / "roc.png"), dpi=300)
    plt.close()
def transform_data(dataset, model, output):
    """Generate SOM samples for all cases and persist them under *output*.

    Saves one ``.npy`` file per case/tube, a JSON case collection at
    ``<output>.json`` and the per-tube SOM configuration at
    ``<output>_config.json``.
    """
    output.mkdir()
    # Map of case id -> SOM samples generated for that case.
    casesamples = defaultdict(list)
    generator = utils.time_generator_logger(model.transform_generator(dataset))
    for case, somsample in generator:
        sompath = output / f"{case.id}_t{somsample.tube}.npy"
        io_functions.save_som(somsample.data, sompath, save_config=False)
        # Drop the in-memory array; the sample now references the file on disk.
        somsample.data = None
        somsample.path = sompath
        casesamples[case.id].append(somsample)

    somcases = [case.copy(samples=casesamples[case.id]) for case in dataset]
    somcollection = case_dataset.CaseCollection(somcases)
    io_functions.save_json(somcollection, output + ".json")

    # Per-tube SOM model configuration: grid dimensions and marker channels.
    som_config = {
        tube: {
            "dims": m.model.dims,
            "channels": m.model.markers,
        }
        for tube, m in model.models.items()
    }
    io_functions.save_json(som_config, output + "_config.json")
def run_transfer(options, train_dataset, validate_dataset):
    """Fine-tune a transfer-learning SOM classifier and save model + metrics.

    Args:
        options: Dict with "config", "base_model_path" and "output_path".
        train_dataset: Dataset used for training.
        validate_dataset: Optional dataset used for validation; when None,
            no predictions or metrics are generated.
    """
    config = options["config"]
    base_model = models.load_model(options["base_model_path"])
    tl_model = create_tl_model(base_model, config)
    model = SOMClassifier(config, tl_model)

    train = model.create_sequence(train_dataset, config.train_batch_size)
    validate = None
    if validate_dataset is not None:
        validate = model.create_sequence(validate_dataset, config.valid_batch_size)

    model.train_generator(train, validate, epochs=config.train_epochs, class_weight=None)

    output = utils.URLPath(options["output_path"])
    if validate:
        # Persist raw prediction scores plus true/predicted labels.
        pred_arr, pred_labels = model.predict_generator(validate)
        true_labels = validate.true_labels
        pred_df = pd.DataFrame(
            pred_arr,
            columns=validate.binarizer.classes_,
            index=validate.dataset.labels)
        io_functions.save_csv(pred_df, output / "preds.csv")
        io_functions.save_json(
            {"true": list(true_labels), "pred": list(pred_labels)},
            output / "preds_labels.json")
        generate_all_metrics(true_labels, pred_labels, config.mapping, output)

    model.save(output)
    model.save_information(output)
    # Release the keras session so successive runs do not accumulate state.
    keras.backend.clear_session()
    del model
def create_roc_results(trues, preds, output, model):
    """Create ROC and AUC metrics and save them to the given directory.

    Args:
        trues: Binary indicator matrix of true labels (cases x groups).
        preds: Prediction score matrix (cases x groups).
        output: Output directory (created here).
        model: Classifier whose config provides the group order.

    Returns:
        Tuple (auc, curves) of per-group AUC scores and ROC curves; both may
        be partially filled when AUC computation fails.
    """
    output.mkdir()
    groups = model.config.groups
    curves = {}
    auc = {}
    try:
        for i, group in enumerate(groups):
            curves[group] = metrics.roc_curve(trues[:, i], preds[:, i])
        for i, group in enumerate(groups):
            auc[group] = metrics.roc_auc_score(trues[:, i], preds[:, i])
        macro_auc = metrics.roc_auc_score(trues, preds, average="macro")
        micro_auc = metrics.roc_auc_score(trues, preds, average="micro")
        io_functions.save_json(
            {
                "one-vs-rest": auc,
                "macro": macro_auc,
                "micro": micro_auc,
            }, output / "auc.json")
    except ValueError:
        # AUC cannot be computed (e.g. a group has only one class in the true
        # labels); save zeroed placeholders instead. The previous version
        # re-ran the failing roc_curve call here, which re-raised the same
        # ValueError straight out of the handler.
        io_functions.save_json(
            {
                "one-vs-rest": 0,
                "macro": 0,
                "micro": 0,
            }, output / "auc.json")
    return auc, curves
def transform_cases(dataset, model, output):
    """Create individual SOMs for all cases in the dataset.

    Args:
        dataset: CaseIterable with a number of cases, for which SOMs should
            be generated.
        model: Model with initial weights, which should be used for
            generation of SOMs.
        output: Output directory for SOMs

    Returns:
        Nothing.
    """
    output.mkdir()
    # Map of case id -> SOM samples generated for that case.
    casesamples = defaultdict(list)
    for case, somsample in utils.time_generator_logger(model.transform_generator(dataset)):
        sompath = output / f"{case.id}_t{somsample.tube}.npy"
        io_functions.save_som(somsample.data, sompath, save_config=False)
        # Release the in-memory array; the sample now points at the .npy file.
        somsample.data = None
        somsample.path = sompath
        casesamples[case.id].append(somsample)
    somcases = []
    for case in dataset:
        somcases.append(case.copy(samples=casesamples[case.id]))
    somcollection = case_dataset.CaseCollection(somcases)
    io_functions.save_json(somcollection, output + ".json")
    # NOTE(review): `labels` is built but never used in the visible code —
    # this function may be truncated here; confirm against the original file.
    labels = [{"label": case.id, "randnum": 0, "group": case.group} for case in dataset]
def create_threshold_results(trues, preds, output, model):
    """Create threshold results from true and predicted.

    For score thresholds in [0.25, 1.0) keep only cases whose maximum
    prediction score exceeds the threshold and record the fraction of
    included cases together with the accuracy on that subset. Results are
    saved to ``thresholds.json`` and plotted to ``threshold.png``.
    """
    output.mkdir()
    threshold_results = []
    for threshold in np.arange(0.25, 1.0, 0.05):
        # np.where keeps a 1-d index array even for a single match; the
        # previous argwhere(...).squeeze() collapsed to a 0-d array in that
        # case, breaking len() and the 2-d row selection below.
        index_above = np.where(np.any(preds > threshold, axis=1))[0]
        sel_preds = preds[index_above, :]
        sel_trues = trues[index_above, :]
        pred_labels = model.binarizer.inverse_transform(sel_preds)
        true_labels = model.binarizer.inverse_transform(sel_trues)
        included = len(index_above) / len(preds)
        acc = metrics.accuracy_score(true_labels, pred_labels)
        print(threshold, included, acc)
        threshold_results.append((threshold, included, acc))
    io_functions.save_json(threshold_results, output / "thresholds.json")

    tarr = np.array(threshold_results)
    fig, ax = plt.subplots()
    ax.plot(tarr[:, 0], tarr[:, 1], label="included")
    ax.plot(tarr[:, 0], tarr[:, 2], label="acc")
    ax.legend()
    ax.set_xlabel("Score threshold")
    ax.set_ylabel("Classification accuracy / Included cases ratio")
    fig.savefig(str(output / "threshold.png"), dpi=300)
def save(self, path: utils.URLPath):
    """Save the given classifier model to the given path."""
    save_somclassifier_config(self.config, path / "config.json")
    self.model.save(str(path / "model.h5"))
    io_functions.save_joblib(self.binarizer, path / "binarizer.joblib")
    # Persist the train/validation case ids so the split is reproducible.
    for split, filename in (("validation", "ids_validate.json"),
                            ("train", "ids_train.json")):
        io_functions.save_json(self.data_ids[split], path / filename)
def create_roc_results(trues, preds, model, output):
    """Create ROC and AUC metrics and save them to the given directory.

    Saves ``auc.json`` with one-vs-rest AUC values and ``roc.png`` with the
    corresponding ROC curves.
    """
    groups = model.config.groups
    roc = fc_roc.calculate_roc(trues, preds, groups)
    io_functions.save_json(roc, output / "auc.json")

    fig, ax = plt.subplots()
    fc_roc.plot_roc_curves(ax, fc_roc.calculate_roc_curve(trues, preds, groups))
    fig.tight_layout()
    fig.savefig(str(output / "roc.png"), dpi=300)
    # Close the figure explicitly so repeated calls do not accumulate open
    # matplotlib figures (the sibling ROC helper already closes its figure).
    plt.close(fig)
def generate_metrics(true_labels, pred_labels, groups, output):
    """Generate numeric metrics.

    Computes balanced accuracy, micro/macro F1 and Matthews correlation,
    prints them, saves them to ``validation_metrics.json`` and returns them.
    """
    balanced = metrics.balanced_accuracy_score(true_labels, pred_labels)
    f1_micro = metrics.f1_score(true_labels, pred_labels, average="micro")
    f1_macro = metrics.f1_score(true_labels, pred_labels, average="macro")
    mcc = metrics.matthews_corrcoef(true_labels, pred_labels)
    metrics_results = {
        "balanced": balanced,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "mcc": mcc,
    }
    print(metrics_results)
    io_functions.save_json(metrics_results, output / "validation_metrics.json")
    return metrics_results
def main():
    """Intersect Bonn and Munich selected markers per tube (stem names only)."""
    bonn_config = io_functions.load_json("output/00-dataset-test/bonn_config.json")
    munich_config = io_functions.load_json("output/00-dataset-test/train_config.json")
    selected = {}
    for tube, markers in bonn_config["selected_markers"].items():
        # Munich marker names stripped of their stems, for membership tests.
        munich_stems = {remove_stem(m) for m in munich_config["selected_markers"][tube]}
        selected[tube] = [
            stem
            for stem in (remove_stem(marker) for marker in markers)
            if stem in munich_stems
        ]
    print(selected)
    io_functions.save_json(selected, "output/00-dataset-test/munich_bonn_tubes.json")
def main(
        fcsdata: utils.URLPath,
        fcsmeta: utils.URLPath,
        somdata: utils.URLPath,
        output: utils.URLPath,
):
    """Compute per-sample SOM quantization errors and per-tube statistics.

    Args:
        fcsdata: Path to FCS data.
        fcsmeta: Path to FCS metadata.
        somdata: Path prefix of the SOM dataset (``<somdata>_config.json``).
        output: Output directory for the JSON result files.
    """
    fcs_dataset = io_functions.load_case_collection(fcsdata, fcsmeta)
    try:
        som_config = io_functions.load_json(somdata + "_config.json")
    except FileNotFoundError:
        som_config = None

    if som_config is None:
        selected_markers = fcs_dataset.selected_markers
    else:
        selected_markers = {t: d["channels"] for t, d in som_config.items()}

    tubes = ("1", "2", "3")
    model = quantization_error_model()
    sess = tf.Session()
    results = []
    for fcscase in fcs_dataset:
        print(fcscase)
        for tube in tubes:
            fcssample = fcscase.get_tube(tube, kind="fcs").get_data()
            somsample = get_som_data(fcscase.id, tube, somdata, selected_markers[tube])
            error = sample_quantization_error(fcssample, somsample, model, sess)
            results.append((fcscase.id, tube, error))

    # FIX: the previous version divided both mean and variance by
    # len(results) (the count over ALL tubes), understating each tube's
    # statistics by the number of tubes; divide by the per-tube count.
    errors_per_tube = {t: [r[-1] for r in results if r[1] == t] for t in tubes}
    stats = {}
    stats["mean"] = {
        t: (sum(errs) / len(errs)) if errs else 0.0
        for t, errs in errors_per_tube.items()
    }
    stats["variance"] = {
        t: (sum(np.power(e - stats["mean"][t], 2) for e in errs) / len(errs)) if errs else 0.0
        for t, errs in errors_per_tube.items()
    }
    print("Mean quantization error", stats)
    io_functions.save_json(results, output / "quantization_error.json")
    io_functions.save_json(stats, output / "quantization_error_mean.json")
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Split test and train dataset, remove duplicates and create a list of
    ids used for creating the reference SOM.

    Args:
        data: Path to fcs data.
        meta: Path to case metadata using case_info format.
        output: Path to output split dataset information.
    """
    cases = io_functions.load_case_collection_from_caseinfo(data, meta)
    train, test = preprocess_cases(cases)
    reference = filter_reference(train)

    output.mkdir()
    # Persist both dataset halves and the reference case ids.
    for collection, filename in ((train, "train.json.gz"), (test, "test.json.gz")):
        io_functions.save_case_collection(collection, output / filename)
    io_functions.save_json(reference.labels, output / "references.json")
def predict(
        data: utils.URLPath,
        model: utils.URLPath,
        output: utils.URLPath,
        labels: utils.URLPath = None,
        metrics: bool = True,
):
    """Generate predictions and plots for a single case.

    Args:
        data: SOM dataset.
        model: Path to model containing CNN and SOMs.
        output: Destination for plotting.
        labels: List of case ids to be filtered for generating predictions.
        metrics: Whether to also generate metric reports per group mapping.
    """
    print(f"Loaded cases from {data}")
    dataset = som_dataset.SOMDataset.from_path(data)
    if labels:
        labels = io_functions.load_json(labels)
        dataset = dataset.filter(labels=labels)

    model = classifier.SOMClassifier.load(model)
    data_sequence = model.create_sequence(dataset, 128)
    values, pred_labels = model.predict_generator(data_sequence)

    # Per-case mapping group -> score (renamed loop var: don't shadow `id`).
    pred_json = {
        case_id: dict(zip(model.config.groups, value.tolist()))
        for case_id, value in zip(dataset.labels, values)
    }
    io_functions.save_json(pred_json, output / "prediction.json")

    if metrics:
        true_labels = data_sequence.true_labels
        map_config = [
            ("unmapped", {"groups": model.config.groups, "map": {}}),
            *GROUP_MAPS.items(),
        ]
        for map_name, mapping in map_config:
            print(f"--- MAPPING: {map_name} ---")
            # Mappings with more groups than the model outputs are skipped.
            if len(mapping["groups"]) > len(model.config.groups):
                continue
            fc_predictions.generate_all_metrics(
                true_labels, pred_labels, mapping, output / map_name)
def run_kfold(*, output_path, base_model_path, som_dataset_path,
              k_number=5, panel="MLL", rerun=False, stratified=False):
    """Run a k-fold transfer-learning experiment on a SOM dataset.

    Args:
        output_path: Directory for parameters and per-fold results.
        base_model_path: Directory containing the base ``model.h5``.
        som_dataset_path: Path to the SOM dataset.
        k_number: Number of folds.
        panel: Group panel selecting the class list ("MLL", "ERLANGEN", other).
        rerun: When False, skip if results already exist at output_path.
        stratified: Whether to stratify the k-fold split by group.
    """
    if not rerun and output_path.exists():
        LOGGER.info("Existing results exist at %s skipping", output_path)
        return
    # Capture the call parameters for reproducibility.
    args = locals()
    io_functions.save_json(args, output_path / "params.json")

    # set the groups according to the panel
    if panel == "MLL":
        groups = GROUPS
    elif panel == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    # Tubes to be processed for merged samples.
    # FIX: ("1") is just the string "1", not a tuple — iterating it happened
    # to yield "1" anyway, so behavior is unchanged, but the intent is now
    # an explicit one-element tuple.
    tubes = ("1",)
    mapping = {"groups": groups, "map": None}

    dataset = som_dataset.SOMDataset.from_path(som_dataset_path)
    LOGGER.info("Full dataset %s", dataset.group_count)
    splits = create_kfold_split(dataset, k_number=k_number, stratified=stratified)
    for n, (train_dataset, validate_dataset) in enumerate(splits):
        LOGGER.info(f"SPLIT n={n}")
        LOGGER.info("Train dataset %s", train_dataset.group_count)
        LOGGER.info("Validation dataset %s", validate_dataset.group_count)
        # change epochs to suit each dataset
        options = {
            "base_model_path": str(base_model_path / "model.h5"),
            "output_path": output_path / f"kfold_n{n}",
            "config": classifier.SOMClassifierConfig(**{
                "tubes": {tube: dataset.config[tube] for tube in tubes},
                "groups": groups,
                "pad_width": 2,
                "mapping": mapping,
                "cost_matrix": None,
                "train_epochs": 15,
            })
        }
        run_transfer(options, train_dataset, validate_dataset)
def save_information(self, path: utils.URLPath):
    """Save additional plots and information."""
    # Text summary of the keras model layers.
    with (path / "model_summary.txt").open("w") as summary_file:
        self.model.summary(
            print_fn=lambda *args, **kwargs: print(*args, **kwargs, file=summary_file))

    # Image plotting structure of model
    keras.utils.plot_model(self.model, to_file=str(path / "model_plot.png"))

    # One subdirectory per training run: metadata JSON plus history plot.
    for i, (meta, history) in enumerate(self.training_history):
        training_output = path / f"train_{i}"
        io_functions.save_json(meta, training_output / "info.json")
        plot_training_history(history, training_output / "training.png")
def run_denovo(options, train_dataset, validate_dataset):
    """Train a de-novo SOM classifier and save model, predictions and metrics.

    Args:
        options: Dict with "config" and "output_path".
        train_dataset: Dataset used for training.
        validate_dataset: Optional validation dataset; when falsy, no
            predictions or metrics are generated.
    """
    config = options["config"]
    model = train_som_classifier(train_dataset, validate_dataset, config)

    output = utils.URLPath(options["output_path"])
    if validate_dataset:
        # Persist raw prediction scores plus true/predicted labels.
        validate = model.create_sequence(validate_dataset, config.valid_batch_size)
        pred_arr, pred_labels = model.predict_generator(validate)
        true_labels = validate.true_labels
        pred_df = pd.DataFrame(
            pred_arr,
            columns=validate.binarizer.classes_,
            index=validate.dataset.labels)
        io_functions.save_csv(pred_df, output / "preds.csv")
        io_functions.save_json(
            {"true": list(true_labels), "pred": list(pred_labels)},
            output / "preds_labels.json")
        generate_all_metrics(true_labels, pred_labels, config.mapping, output)

    model.save(output)
    model.save_information(output)
    # Release the keras session so successive runs do not accumulate state.
    keras.backend.clear_session()
    del model
def transform_dataset_to_som(som_reference: CaseSom, dataset: "CaseCollection", output: utils.URLPath):
    """Transform dataset into a SOM dataset using the given reference SOM model.

    Args:
        som_reference: Reference SOM used to transform each case.
        dataset: Cases to transform.
        output: Output directory; receives ``data/``, ``meta.json.gz`` and
            ``config.json``.

    Returns:
        The newly created SOM CaseCollection.
    """
    # FIX: message typo ("Trainsforming") and needless f-string prefix.
    print("Transforming individual samples")
    data_output = output / "data"
    meta_output = output / "meta.json.gz"
    config_output = output / "config.json"
    data_output.mkdir()

    casesamples = defaultdict(list)
    count_samples = len(dataset) * len(som_reference.models)
    countlen = len(str(count_samples))
    for i, (case, somsample) in enumerate(
            utils.time_generator_logger(
                som_reference.transform_generator(dataset))):
        sompath = data_output / f"{case.id}_t{somsample.tube}.npy"
        io_functions.save_som(somsample.data, sompath, save_config=False)
        # Drop the in-memory array; store the path relative to data_output.
        somsample.data = None
        somsample.path = sompath.relative_to(data_output)
        print(type(somsample.path), somsample.path)
        casesamples[case.id].append(somsample)
        print(
            f"[{str(i + 1).rjust(countlen, ' ')}/{count_samples}] Created tube {somsample.tube} for {case.id}"
        )

    print(f"Saving result to new collection at {output}")
    som_dataset = case_dataset.CaseCollection(
        [case.copy(samples=casesamples[case.id]) for case in dataset],
        data_path=data_output)
    som_dataset.selected_markers = {
        m.tube: m.model.markers for m in som_reference.models.values()
    }
    io_functions.save_case_collection(som_dataset, meta_output)
    io_functions.save_json(som_reference.som_config, config_output)
    return som_dataset
# NOTE(review): this chunk references NAME, SEED, set_seed, check_dataset,
# flowcat and constants defined earlier in the file (outside this view).
OUTPUT = utils.URLPath(f"output/{NAME}")
LOGDIR = utils.URLPath(f"logs/{NAME}_{utils.create_stamp()}")
# Input locations for the ungated dataset and its metadata.
INPUT = {
    "data": utils.URLPath("output/ungated/data"),
    "meta": utils.URLPath("output/samples/meta.json.gz"),
}
LOGGER = utils.setup_logging(LOGDIR, NAME)
set_seed(SEED)

dataset = io_functions.load_case_collection(INPUT["data"], INPUT["meta"])
check_dataset(dataset)
# 90/10 train/test split; persist the case ids of both halves.
train, test = dataset.create_split(0.9)
io_functions.save_json(train.labels, OUTPUT / "train_ids.json")
io_functions.save_json(test.labels, OUTPUT / "test_ids.json")
# NOTE(review): presumably sample(1) draws one case (per group?) as SOM
# reference data — confirm against the CaseCollection API.
reference = train.sample(1)
LOGGER.info("Reference dataset: %s", reference)
LOGGER.info("Reference labels: %s", reference.labels)

model = flowcat.FlowCat()
# Tweak the project default training arguments for this run.
args = constants.DEFAULT_TRAIN_ARGS
args["classifier"]["balance"] = None
args["classifier"]["split_ratio"] = 1.0
args["classifier"]["config"].tubes = ["1", "2"]
args["classifier"]["config"].train_epochs = 150
# NOTE(review): the call below is truncated in this view; the remaining
# arguments continue outside the visible chunk.
som_train, som_test = model.train(train, reference, OUTPUT,
def main(data: utils.URLPath, output: utils.URLPath, model_name: str, modelargs: json.loads, epochs: int = 30):
    """Train a non-neural classifier (RandomForest) on a SOM dataset.

    Args:
        data: Path to som dataset
        output: Output path
        model_name: NOTE(review): appears unused — get_model below hard-codes
            model_name="RandomForest"; confirm intent.
        modelargs: Extra keyword arguments forwarded to get_model (annotation
            json.loads is presumably consumed by the CLI argument parser).
        epochs: NOTE(review): unused in the visible body — confirm intent.
    """
    tubes = ("1", "2", "3")
    pad_width = 0
    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}
    if set(groups) != dataset_groups:
        raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)
    # train = train.balance(20)
    # Cap per-group case counts to reduce class imbalance.
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the stored SOM dimensions by the padding on each side.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # binarizer, model = get_model(selected_tubes, groups=groups, n_neighbors=1)
    binarizer, model = get_model(selected_tubes, groups=groups, model_name="RandomForest", **modelargs)

    def getter_fun(sample, tube):
        # Accessor handed to SOMSequence for fetching one tube of a sample.
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    xdata, ydata = sequence_to_array(trainseq)
    model.fit(xdata, ydata)

    # NOTE(review): ytest is unused — true labels are taken from
    # validseq.true_labels below instead.
    xtest, ytest = sequence_to_array(validseq)
    pred_arr = model.predict(xtest)

    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels
    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}}, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
def main(data: utils.URLPath, meta: utils.URLPath, reference: utils.URLPath, model: utils.URLPath):
    """Exploratory saliency/occlusion analysis of a trained SOM classifier.

    NOTE(review): the CLI arguments are immediately overwritten with
    hard-coded paths below — this looks like debugging/analysis code.
    """
    data, meta, soms, model = map(utils.URLPath, [
        "/data/flowcat-data/mll-flowdata/decCLL-9F",
        "output/0-final-dataset/train.json.gz",
        "output/som-fix-test/soms-test/som_r4_1",
        "output/0-final/classifier-minmax-new",
    ])
    dataset = io_functions.load_case_collection(data, meta)
    soms = som_dataset.SOMDataset.from_path(soms)
    model = SaliencySOMClassifier.load(model)
    val_dataset = model.get_validation_data(dataset)
    val_seq = model.create_sequence(soms)

    # printing out weights and biases, unsure whether they actually contain
    # information
    # in theory we could extend that to attempt to describe them as gates
    tube = "3"
    # NOTE(review): layer index int(tube) + 2 presumably maps tube number to
    # the per-tube conv layer — confirm against the model architecture.
    weights, biases = model.model.layers[int(tube) + 2].get_weights()
    for j, chname in enumerate(model.config["tubes"][tube]["channels"]):
        ch_mean_weight = np.mean(weights[:, :, j, :])
        print(j, chname, ch_mean_weight)
    for i in range(weights.shape[-1]):
        mean_weight = np.mean(weights[:, :, :, i])
        print(i, mean_weight, biases[i])
        for j, chname in enumerate(model.config["tubes"]["1"]["channels"]):
            print(i, j, chname)
            print(weights[:, :, j, i])

    # zero out specific columns and see how that impacts performance
    output = utils.URLPath("output/0-final/model-analysis/occlusion")
    for group in model.config["groups"]:
        print(group)
        sel_cases = val_dataset.filter(groups=[group])
        avg_results = model.channel_occlusion(sel_cases, val_seq)
        print(sorted(avg_results, key=lambda t: t[2], reverse=True))
        io_functions.save_json(avg_results, output / f"{group}_avg_std.json")

    # case_som = soms.get_labels([case.id]).iloc[0]
    hcls = val_dataset.filter(groups=["HCL"])
    from collections import defaultdict
    # Per-tube, per-marker lists of saliency maxima and means over HCL cases.
    max_vals = defaultdict(lambda: defaultdict(list))
    mean_vals = defaultdict(lambda: defaultdict(list))
    for case in hcls:
        print(case)
        gradient = model.calculate_saliency(val_seq, case, case.group, maximization=False)
        for i, (tube, markers) in enumerate(model.config["tubes"].items()):
            tgrad = gradient[i]
            # NOTE(review): inner loop reuses `i`, clobbering the outer tube
            # index; harmless here since the outer `i` is not used afterwards.
            for i, marker in enumerate(markers["channels"]):
                mgrad = tgrad[:, :, i]
                gmax = np.max(mgrad)
                max_vals[tube][marker].append(gmax)
                gmean = np.mean(mgrad)
                mean_vals[tube][marker].append(gmean)

    # Aggregate and print per-marker saliency statistics, sorted by max.
    max_markers = defaultdict(list)
    for tube, markers in model.config["tubes"].items():
        for marker in markers["channels"]:
            print("Max", tube, marker, np.mean(max_vals[tube][marker]))
            print("Mean", tube, marker, np.mean(mean_vals[tube][marker]))
            max_markers[tube].append((marker, np.mean(max_vals[tube][marker])))
    for tube in model.config["tubes"]:
        print("Tube", tube)
        print("\n".join(": ".join((t[0], str(t[1]))) for t in sorted(
            max_markers[tube], key=lambda t: t[1], reverse=True)))

    # Paths of a previously trained relunet model and its outputs.
    c_model = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/model_0.h5"
    c_labels = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/test_labels.json"
    c_preds = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/predictions_0.csv"
    c_config = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/config.json"
    c_cases = MLLDATA / "mll-flowdata/CLL-9F"
    c_sommaps = MLLDATA / "mll-sommaps/sample_maps/selected1_toroid_s32"
    c_misclass = MLLDATA / "mll-sommaps/misclassifications/"
    c_tube = [1, 2]

    # load datasets
    somdataset = sd.SOMDataset.from_path(c_sommaps)
    cases = cc.CaseCollection.from_path(c_cases, how="case_info.json")

    # filter datasets
    test_labels = flowutils.load_json(c_labels)
    filtered_cases = cases.filter(labels=test_labels)
    somdataset.data[1] = somdataset.data[1].loc[test_labels, :]

    # get mapping
    config = flowutils.load_json(c_config)
    groupinfo = mappings.GROUP_MAPS[config["c_groupmap"]]
    dataset = cd.CombinedDataset(filtered_cases, {
        dd.Dataset.from_str('SOM'): somdataset,
        dd.Dataset.from_str('FCS'): filtered_cases
    }, group_names=groupinfo['groups'])

    # modify mapping
    dataset.set_mapping(groupinfo)
    xoutputs = [
        loaders.loader_builder(
            loaders.Map2DLoader.create_inferred,
            tube=1,
            sel_count="counts",
            pad_width=1,
        ),
        loaders.loader_builder(
            loaders.Map2DLoader.create_inferred,
            tube=2,
            sel_count="counts",
            pad_width=1,
        )
    ]
    dataset = loaders.DatasetSequence.from_data(dataset, xoutputs, batch_size=1, draw_method="sequential")

    predictions = pd.read_csv(c_preds, index_col=0)
    predictions = add_correct_magnitude(predictions)
    predictions = add_infiltration(predictions, cases)

    # Saliency heatmaps and scatterplots for hand-picked misclassified cases.
    misclass_labels = ['507777582649cbed8dfb3fe552a6f34f8b6c28e3']
    for label in misclass_labels:
        label_path = pathlib.Path(f"{c_misclass}/{label}")
        if not label_path.exists():
            label_path.mkdir()
        case = cases.get_label(label)

        # get the actual and the predicted class
        corr_group = predictions.loc[case.id, "correct"]
        pred_group = predictions.loc[case.id, "pred"]
        classes = [corr_group, pred_group]
        gradients = plotting.calc_saliency(dataset, case, c_model, classes=classes)
        for tube in c_tube:
            heatmaps = plotting.draw_saliency_heatmap(case, gradients, classes, tube)
            for idx, heatmap in enumerate(heatmaps):
                plotting.save_figure(
                    heatmap,
                    f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_saliency_heatmap.png"
                )
            scatterplots = plotting.plot_tube(case, tube, gradients[tube - 1], classes=classes, sommappath=c_sommaps)
            for idx, scatterplot in enumerate(scatterplots):
                plotting.save_figure(
                    scatterplot,
                    f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_scatterplots.png"
                )
# NOTE(review): this chunk references metadata, fcs_data_path, dataset_path,
# output and GROUPS defined earlier in the file (outside this view). The mix
# of dataset_path / outpath / output save destinations looks inconsistent —
# confirm against the original file.
print(metadata[0])
data = [meta_to_case(meta, fcs_data_path) for meta in metadata]
dataset = CaseCollection(data, data_path=fcs_data_path)
outpath = URLPath("output/5-berlin-data-test/dataset")
save_case_collection(dataset, dataset_path / "casecollection.json")

# only use groups we already have for now
group_dataset, reasons = dataset.filter_reasons(groups=GROUPS)
save_case_collection(group_dataset, dataset_path / "valid_groups.json")
# Cases rejected by the group filter form the "invalid" dataset.
invalid_labels = [l for l, _ in reasons]
invalid_dataset, _ = dataset.filter_reasons(labels=invalid_labels)
save_case_collection(invalid_dataset, dataset_path / "invalid_groups.json")

# Keep markers present in at least 90% of cases across tubes 1-4.
selected = flowcat.marker_selection.get_selected_markers(group_dataset, ("1", "2", "3", "4"), marker_threshold=0.9)
save_case_collection(selected, outpath / "known_groups.json")
references = selected.sample(num=1)
save_json(references, output / "references.json")
# Restrict the unknown-group cases to the same selected markers.
selected_invalid, _ = invalid_dataset.filter_reasons(selected_markers=selected.selected_markers)
save_case_collection(selected_invalid, outpath / "unknown_groups.json")
# NOTE(review): the statements below appear to be the tail of a SOM
# transform function (mirrors transform_cases) whose definition lies outside
# this view — dataset, casesamples, model and output are not defined at this
# level; confirm scope against the original file.
somcases = []
for case in dataset:
    somcases.append(case.copy(samples=casesamples[case.id]))
somcollection = case_dataset.CaseCollection(somcases)
io_functions.save_json(somcollection, output + ".json")
labels = [{"label": case.id, "randnum": 0, "group": case.group} for case in dataset]
# Save metadata into an additional csv file with the same name
metadata = pd.DataFrame(labels)
io_functions.save_csv(metadata, output + ".csv")
# Per-tube SOM model configuration (dimensions and marker channels).
io_functions.save_json(
    {
        tube: {
            "dims": m.model.dims,
            "channels": m.model.markers,
        } for tube, m in model.models.items()
    }, output + "_config.json")


def main(args):
    """Load a model with given transforming arguments and transform individual cases."""
    cases = io_functions.load_case_collection(args.data, args.meta)
    # cases = cases.sample(1, groups=["CLL", "normal"])
    selected_markers = cases.selected_markers
    marker_name_only = False
    # NOTE(review): this function is truncated in the visible chunk; its
    # body continues outside this view.
    if args.tensorboard:
        tensorboard_dir = args.output / "tensorboard"
def main(args):
    """Fine-tune a trained SOM classifier on MCL/PL cases and save results.

    Args:
        args: Namespace with input (SOM dataset path), val/train (label list
            JSON paths), model (saved model path) and output (destination).
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output

    groups = ["MCL", "PL"]
    # FIX: ("1") is just the string "1", not a tuple — iterating it happened
    # to yield "1" anyway, so the resulting config is unchanged, but the
    # intent is now an explicit one-element tuple.
    tubes = ("1",)
    mapping = None
    balance = {
        "MCL": 20,
        "PL": 20,
    }
    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube] for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
        })

    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)
    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)
    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)
    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    binarizer, model = load_model(args.model)
    trainseq = som_dataset.SOMSequence(train_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(validate_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(generator=trainseq, epochs=10, validation_data=validseq)

    # Persist model, binarizer, config and the exact train/validation split.
    args.output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(args.output / "model.h5"))
    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels, OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Train an FCS-level neural classifier with a cost-weighted loss.

    Args:
        data: Path to fcs dataset data
        meta: Path to fcs dataset metainformation
        output: Output path
    """
    tubes = ("1", "2")
    sample_size = 512
    # group_mapping = mappings.GROUP_MAPS["6class"]
    # mapping = group_mapping["map"]
    mapping = None
    groups = mappings.GROUPS
    # groups = group_mapping["groups"]

    dataset = io_functions.load_case_collection(data, meta)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)
    # 50 cases go into the validation split, the rest into training.
    validate, train = dataset.create_split(50)
    print(train.group_count)
    # train = train.balance(1000).shuffle()
    train = train.sample(100).shuffle()
    print(train.group_count)

    # Per-group weights keyed by the group's index in `groups` (keras
    # class_weight expects integer class indices).
    group_count = train.group_count
    group_weights = classification_utils.calculate_group_weights(group_count)
    group_weights = {
        i: group_weights.get(g, 1.0) for i, g in enumerate(groups)
    }

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    train_seq = FCSSequence(
        train, binarizer, tubes=tubes, sample_size=sample_size, batch_size=64)
    validate_seq = FCSSequence(
        validate, binarizer, tubes=tubes, sample_size=sample_size, batch_size=128)

    config = {
        "tubes": tubes,
        "groups": groups,
    }
    io_functions.save_json(config, output / "config.json")

    # for tube in tubes:
    #     x, y, z = selected_tubes[tube]["dims"]
    #     selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # Misclassification costs: confusions inside related disease pairs are
    # cheap (0.5), confusing any disease with "normal" is expensive (2).
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 2,
        ("MBL", "normal"): 2,
        ("MCL", "normal"): 2,
        ("PL", "normal"): 2,
        ("LPL", "normal"): 2,
        ("MZL", "normal"): 2,
        ("FL", "normal"): 2,
        ("HCL", "normal"): 2,
    }
    cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)

    model = create_fcs_model(train_seq.xshape, train_seq.yshape, global_decay=5e-5)
    model.compile(
        # loss="categorical_crossentropy",
        # loss=keras.losses.CategoricalCrossentropy(),
        loss=classification_utils.WeightedCategoricalCrossentropy(cost_matrix),
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            "acc",
            # keras.metrics.CategoricalAccuracy(),
            # keras.metrics.TopKCategoricalAccuracy(k=2),
            # top2_acc,
        ])
    model.summary()

    tensorboard_dir = str(output / "tensorboard")
    # NOTE(review): the tensorboard callback is built but commented out of
    # the callbacks list below.
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=20,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=train_seq,
        validation_data=validate_seq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    preds = []
    for pred in model.predict_generator(validate_seq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validate_seq.true_labels

    generate_all_metrics(true_labels, pred_labels, {
        "groups": groups,
        "map": {}
    }, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Train an 8-class SOM-based classifier on tubes 2-4.

    Args:
        data: Path to som dataset
        meta: Path to dataset metainformation (currently unused here)
        output: Output path
    """
    tubes = ("2", "3", "4")
    # Zero-padding added around each SOM grid before feeding the network.
    pad_width = 1

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)
    # NOTE(review): LPL and MZL are excluded from the data but remain in
    # `groups`, so the binarizer/model keep output slots for classes that
    # never occur in training — confirm this is intentional.
    dataset = dataset.filter(
        groups=[g for g in groups if g not in ("LPL", "MZL")])

    dataset_groups = {d.group for d in dataset}
    # if set(groups) != dataset_groups:
    #     raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    validate, train = dataset.create_split(10, stratify=True)

    # Inverse-frequency class weights, normalized so the smallest weight is 1.
    # Groups absent from training default to the balanced average count.
    group_count = train.group_count
    num_cases = sum(group_count.values())
    balanced_nums = num_cases / len(dataset_groups)
    balanced_loss_weights = [
        balanced_nums / group_count.get(g, balanced_nums) for g in groups]
    min_ratio = min(balanced_loss_weights)
    # NOTE(review): keys follow enumerate(groups); keras class indices follow
    # binarizer.classes_ (sorted) — verify the two orderings agree.
    balanced_loss_weights = {
        i: v / min_ratio for i, v in enumerate(balanced_loss_weights)}
    print(balanced_loss_weights)

    # train = train.balance(2000)
    # train = train.balance_per_group({
    #     "CM": 6000,
    #     # "CLL": 4000,
    #     # "MBL": 2000,
    #     "MCL": 1000,
    #     "PL": 1000,
    #     "LPL": 1000,
    #     "MZL": 1000,
    #     "FL": 1000,
    #     "HCL": 1000,
    #     "normal": 6000,
    # })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # SOM dataset config sits next to the dataset dir as <data>_config.json.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the recorded dims to account for padding on both sides.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(
        selected_tubes, groups=groups, global_decay=5e-7)

    def getter_fun(sample, tube):
        # Extract the SOM array of the requested tube from a sample.
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=15,
        shuffle=True,
        callbacks=[tensorboard_callback, nan_callback],
        class_weight=balanced_loss_weights,
        generator=trainseq,
        validation_data=validseq)
    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Predict the validation set and report confusion/balanced accuracy.
    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    confusion = metrics.confusion_matrix(true_labels, pred_labels, labels=groups)
    print(groups)
    print(confusion)
    balanced = metrics.balanced_accuracy_score(true_labels, pred_labels)
    print(balanced)
def save_somclassifier_config(config: "SOMClassifierConfig", path: utils.URLPath):
    """Serialize the classifier configuration and write it as JSON to *path*."""
    serialized = config.to_json()
    io_functions.save_json(serialized, path)
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath,
         epochs: int = 30):
    """Train an 8-class SOM classifier on tubes 1-3 and evaluate it.

    Args:
        data: Path to som dataset
        meta: Path to dataset metainformation (currently unused here)
        output: Output path
        epochs: Number of training epochs.
    """
    tubes = ("1", "2", "3")
    # Zero-padding added around each SOM grid before feeding the network.
    pad_width = 2

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}
    if set(groups) != dataset_groups:
        raise RuntimeError(
            f"Group mismatch: {groups}, but got {dataset_groups}")

    # NOTE(review): split order here is (train, validate); the sibling mains
    # unpack (validate, train) — confirm create_split's return convention.
    train, validate = dataset.create_split(0.9, stratify=True)

    # Class weighting disabled: oversampling via balance_per_group below.
    group_weights = None
    # group_count = train.group_count
    # group_weights = classification_utils.calculate_group_weights(group_count)
    # group_weights = {
    #     i: group_weights.get(g, 1.0) for i, g in enumerate(groups)
    # }

    # train = train.balance(2000)
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # SOM dataset config sits next to the dataset dir as <data>_config.json.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # Misclassification costs, always keyed as (true, pred).
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 8,
        ("MBL", "normal"): 8,
        ("MCL", "normal"): 8,
        ("PL", "normal"): 8,
        ("LPL", "normal"): 8,
        ("MZL", "normal"): 8,
        ("FL", "normal"): 8,
        ("HCL", "normal"): 8,
    }
    # Translate cost keys into the mapped group names.
    if mapping:
        cost_mapping = {(mapping.get(a, a), mapping.get(b, b)): v
                        for (a, b), v in cost_mapping.items()}
    # cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)
    # np.save(str(output / "cost_matrix.npy"), cost_matrix)
    # NOTE(review): cost_matrix is hard-coded to None, so the weighted-loss
    # branch below is dead and cost_mapping is currently unused.
    cost_matrix = None

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": "cost_matrix.npy" if cost_matrix is not None else None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the recorded dims to account for padding on both sides.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(
        selected_tubes, groups=groups, global_decay=5e-5)

    if cost_matrix is not None:
        loss = classification_utils.WeightedCategoricalCrossentropy(
            cost_matrix)
    else:
        loss = "categorical_crossentropy"

    model.compile(
        loss=loss,
        # loss="categorical_crossentropy",
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            keras.metrics.CategoricalAccuracy(),
        ])

    # Write the textual model summary to a file instead of stdout.
    with (output / "model_summary.txt").open("w") as summary_file:
        def print_file(*args, **kwargs):
            print(*args, **kwargs, file=summary_file)
        model.summary(print_fn=print_file)
    keras.utils.plot_model(model, to_file=str(output / "model_plot.png"))

    def getter_fun(sample, tube):
        # Extract the SOM array of the requested tube from a sample.
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    # tensorboard_dir = str(output / "tensorboard")
    # tensorboard_callback = keras.callbacks.TensorBoard(
    #     log_dir=str(tensorboard_dir),
    #     histogram_freq=5,
    #     write_grads=True,
    #     write_images=True,
    # )
    nan_callback = keras.callbacks.TerminateOnNaN()

    history = model.fit_generator(
        epochs=epochs,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=trainseq,
        validation_data=validseq)
    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Collect per-sample prediction vectors for the validation set.
    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    # Metrics on the raw (unmapped) groups, then once per coarser mapping.
    generate_all_metrics(
        true_labels, pred_labels, {
            "groups": groups,
            "map": {}
        }, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)

    plot_training_history(history, output / "training.png")
utils.URLPath( "/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/references.json" )) OUTPUT = utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_SOM/MLL9F") setup_logging(None, "generate ref SOM for merged FCS") ref_dataset = dataset.filter(labels=references) tensorboard_dir = None # Discover channels in the given dataset markers = get_tube_marker(ref_dataset) # markers = read_sel_markers(sel_markers) print(markers) io_functions.save_json(markers, OUTPUT / "markers.json") # set marker_name_only to True ref_config = DEFAULT_REFERENCE_SOM_ARGS.copy() ref_config.marker_name_only = True model = fc_som.CaseSom( tubes=markers, tensorboard_dir=tensorboard_dir, modelargs=ref_config, ) model.train(ref_dataset) io_functions.save_casesom(model, OUTPUT / "sommodel")
def main(args):
    """Train a binary MCL-vs-PL SOM classifier and save its artifacts.

    Args:
        args: Parsed CLI namespace; uses `args.input` (SOM dataset path)
            and `args.output` (destination directory for model, binarizer,
            config and id lists).
    """
    MLL5F = som_dataset.SOMDataset.from_path(args.input)
    OUTPUT = args.output
    LOGGER = utils.logs.setup_logging(None, "classify")

    groups = ["MCL", "PL"]
    # BUG FIX: ("1") is just the string "1", not a tuple — a one-element
    # tuple needs a trailing comma. Iterating the bare string only worked
    # by accident because the tube name is a single character; any
    # multi-character tube name would have been split into characters.
    tubes = ("1",)
    mapping = None

    # 90/10 train/validation split restricted to the two target groups.
    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        MLL5F,
        split_ratio=0.90,
        groups=groups,
        mapping=mapping,
        balance=None)
    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: MLL5F.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
    })
    # Single output unit with binary crossentropy for the two-class task.
    model = create_model(config.inputs, 1, global_decay=5e-3)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[
            "acc",
        ]
    )

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    trainseq = som_dataset.SOMSequence(
        train_dataset, binarizer,
        tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(
        validate_dataset, binarizer,
        tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(
        generator=trainseq, validation_data=validseq,
        epochs=20, shuffle=True, class_weight=None)

    # Persist everything needed to reload and evaluate the classifier.
    args.output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(args.output / "model.h5"))
    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels, OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")