def save(self, path: utils.URLPath):
    """Persist this classifier model to *path*.

    Writes four artifacts side by side: the classifier config as JSON,
    the keras model weights, the fitted label binarizer, and the
    validation/train sample id lists.
    """
    save_somclassifier_config(self.config, path / "config.json")
    self.model.save(str(path / "model.h5"))
    io_functions.save_joblib(self.binarizer, path / "binarizer.joblib")
    io_functions.save_json(self.data_ids["validation"], path / "ids_validate.json")
    io_functions.save_json(self.data_ids["train"], path / "ids_train.json")
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath, epochs: int = 30):
    """Train a SOM CNN classifier on the 8-class group mapping.

    Args:
        data: Path to som dataset.
        meta: Path to dataset metadata (currently unused; kept for CLI compatibility).
        output: Output path for model, id lists and metric reports.
        epochs: Number of training epochs.

    Raises:
        RuntimeError: If the filtered dataset does not contain exactly the
            expected groups.
    """
    tubes = ("1", "2", "3")
    pad_width = 2

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}
    if set(groups) != dataset_groups:
        raise RuntimeError(
            f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    # No per-class loss weighting; balancing is done by resampling instead.
    group_weights = None
    train = train.balance_per_group({
        "CM": 6000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # NOTE(review): relies on URLPath supporting `path + str` concatenation —
    # confirm against utils.URLPath.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # Misclassification costs, always keyed as (true, pred). Currently only
    # kept for when a cost matrix is built again; cost_matrix stays None.
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 8,
        ("MBL", "normal"): 8,
        ("MCL", "normal"): 8,
        ("PL", "normal"): 8,
        ("LPL", "normal"): 8,
        ("MZL", "normal"): 8,
        ("FL", "normal"): 8,
        ("HCL", "normal"): 8,
    }
    if mapping:
        cost_mapping = {
            (mapping.get(a, a), mapping.get(b, b)): v
            for (a, b), v in cost_mapping.items()}
    cost_matrix = None

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": "cost_matrix.npy" if cost_matrix is not None else None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the SOM grid dims to account for padding on both sides.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-5)

    if cost_matrix is not None:
        loss = classification_utils.WeightedCategoricalCrossentropy(cost_matrix)
    else:
        loss = "categorical_crossentropy"
    model.compile(
        loss=loss,
        optimizer="adam",
        metrics=[
            keras.metrics.CategoricalAccuracy(),
        ])

    with (output / "model_summary.txt").open("w") as summary_file:
        def print_file(*args, **kwargs):
            print(*args, **kwargs, file=summary_file)
        model.summary(print_fn=print_file)
    keras.utils.plot_model(model, to_file=str(output / "model_plot.png"))

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    nan_callback = keras.callbacks.TerminateOnNaN()
    history = model.fit_generator(
        epochs=epochs,
        shuffle=True,
        callbacks=[nan_callback],
        class_weight=group_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # BUGFIX(style): build the prediction array directly instead of a manual
    # append loop.
    pred_arr = np.array(list(model.predict_generator(validseq)))
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}},
        output / "unmapped")
    # BUGFIX: loop variable renamed from `mapping` to avoid shadowing the
    # outer group map.
    for map_name, sub_mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        print(f"--- MAPPING: {map_name} ---")
        # Skip mappings with more groups than were trained on.
        if len(sub_mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, sub_mapping, output_path)

    plot_training_history(history, output / "training.png")
def main(data: utils.URLPath, output: utils.URLPath, model_name: str, modelargs: json.loads, epochs: int = 30):
    """Train a non-neural (sklearn-style) classifier on the SOM dataset.

    Args:
        data: Path to som dataset.
        output: Output path for model artifacts and metric reports.
        model_name: Name of the model type to instantiate via get_model.
        modelargs: Keyword arguments forwarded to the model constructor.
        epochs: Unused for sklearn-style models; kept for CLI compatibility.

    Raises:
        RuntimeError: If the filtered dataset does not contain exactly the
            expected groups.
    """
    tubes = ("1", "2", "3")
    pad_width = 0

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}
    if set(groups) != dataset_groups:
        raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)
    train = train.balance_per_group({
        "CM": 6000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": None,
    }
    io_functions.save_json(config, output / "config.json")

    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # BUGFIX: the model_name parameter was previously ignored and
    # "RandomForest" hard-coded; forward it so the CLI argument takes effect.
    binarizer, model = get_model(
        selected_tubes, groups=groups, model_name=model_name, **modelargs)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    xdata, ydata = sequence_to_array(trainseq)
    model.fit(xdata, ydata)

    xtest, ytest = sequence_to_array(validseq)
    pred_arr = model.predict(xtest)

    io_functions.save_joblib(binarizer, output / "binarizer.joblib")
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}},
        output / "unmapped")
    # BUGFIX: loop variable renamed from `mapping` to avoid shadowing the
    # outer group map.
    for map_name, sub_mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        print(f"--- MAPPING: {map_name} ---")
        # Skip mappings with more groups than were trained on.
        if len(sub_mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, sub_mapping, output_path)
def main(args):
    """Fine-tune a previously saved SOM classifier on MCL/PL data.

    Loads explicit train/validation id lists, retrains the loaded model for
    10 epochs, and saves model, binarizer, config and id lists to args.output.
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    output = args.output
    groups = ["MCL", "PL"]
    # BUGFIX: ("1") is a plain string, not a tuple — a one-element tuple
    # needs a trailing comma. (Worked before only because iterating the
    # string "1" also yields "1".)
    tubes = ("1",)
    mapping = None
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: dataset.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
    })

    val_labels = io_functions.load_json(args.val)
    validate_dataset = dataset.filter(labels=val_labels)
    train_labels = io_functions.load_json(args.train)
    train_dataset = dataset.filter(labels=train_labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)
    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    binarizer, model = load_model(args.model)

    trainseq = som_dataset.SOMSequence(
        train_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(
        validate_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(generator=trainseq, epochs=10, validation_data=validseq)

    output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")
    model.save(str(output / "model.h5"))
    io_functions.save_json(config.to_json(), output / "config.json")
    io_functions.save_json(validseq.dataset.labels, output / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, output / "ids_train.json")
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Train a CNN directly on FCS data with a weighted loss.

    Args:
        data: Path to fcs dataset data
        meta: Path to fcs dataset metainformation
        output: Output path
    """
    tubes = ("1", "2")
    sample_size = 512
    mapping = None
    groups = mappings.GROUPS

    dataset = io_functions.load_case_collection(data, meta)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    validate, train = dataset.create_split(50)
    print(train.group_count)
    train = train.sample(100).shuffle()
    print(train.group_count)

    # Per-class loss weights keyed by class index, defaulting to 1.0 for
    # classes without a computed weight.
    group_count = train.group_count
    group_weights = classification_utils.calculate_group_weights(group_count)
    group_weights = {
        index: group_weights.get(group, 1.0)
        for index, group in enumerate(groups)
    }

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    train_seq = FCSSequence(
        train, binarizer,
        tubes=tubes, sample_size=sample_size, batch_size=64)
    validate_seq = FCSSequence(
        validate, binarizer,
        tubes=tubes, sample_size=sample_size, batch_size=128)

    config = {"tubes": tubes, "groups": groups}
    io_functions.save_json(config, output / "config.json")

    # Misclassification costs, always keyed as (true, pred).
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 2,
        ("MBL", "normal"): 2,
        ("MCL", "normal"): 2,
        ("PL", "normal"): 2,
        ("LPL", "normal"): 2,
        ("MZL", "normal"): 2,
        ("FL", "normal"): 2,
        ("HCL", "normal"): 2,
    }
    cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)

    model = create_fcs_model(
        train_seq.xshape, train_seq.yshape, global_decay=5e-5)
    model.compile(
        loss=classification_utils.WeightedCategoricalCrossentropy(cost_matrix),
        optimizer="adam",
        metrics=["acc"])
    model.summary()

    # Tensorboard callback is constructed but deliberately not registered.
    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()
    model.fit_generator(
        epochs=20,
        shuffle=True,
        callbacks=[nan_callback],
        class_weight=group_weights,
        generator=train_seq,
        validation_data=validate_seq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_arr = np.array(list(model.predict_generator(validate_seq)))
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validate_seq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}},
        output / "unmapped")
    for map_name, group_map in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        print(f"--- MAPPING: {map_name} ---")
        # Skip mappings with more groups than were trained on.
        if len(group_map["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, group_map, output_path)
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Train a SOM CNN on tubes 2-4 using balanced per-class loss weights.

    Args:
        data: Path to som dataset
        output: Output path
    """
    tubes = ("2", "3", "4")
    pad_width = 1

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)
    # LPL and MZL are excluded from training here.
    dataset = dataset.filter(
        groups=[g for g in groups if g not in ("LPL", "MZL")])
    dataset_groups = {d.group for d in dataset}

    validate, train = dataset.create_split(10, stratify=True)

    # Loss weights scaled so a perfectly balanced class gets weight 1 and
    # rarer classes get proportionally larger weights; missing classes
    # fall back to the balanced count (weight 1).
    group_count = train.group_count
    num_cases = sum(group_count.values())
    balanced_nums = num_cases / len(dataset_groups)
    weight_list = [
        balanced_nums / group_count.get(group, balanced_nums)
        for group in groups]
    min_ratio = min(weight_list)
    balanced_loss_weights = {
        index: weight / min_ratio
        for index, weight in enumerate(weight_list)}
    print(balanced_loss_weights)

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # NOTE(review): relies on URLPath supporting `path + str` — confirm.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the SOM grid dims to account for padding on both sides.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-7)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()
    model.fit_generator(
        epochs=15,
        shuffle=True,
        callbacks=[tensorboard_callback, nan_callback],
        class_weight=balanced_loss_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_arr = np.array(list(model.predict_generator(validseq)))
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    confusion = metrics.confusion_matrix(true_labels, pred_labels, labels=groups)
    print(groups)
    print(confusion)
    balanced = metrics.balanced_accuracy_score(true_labels, pred_labels)
    print(balanced)
def main(args):
    """Train a small binary MCL-vs-PL classifier from scratch and save it.

    Splits the input SOM dataset 90/10, trains a single-output model with
    binary crossentropy, and writes model, binarizer, config and id lists
    to args.output.
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    output = args.output
    LOGGER = utils.logs.setup_logging(None, "classify")

    groups = ["MCL", "PL"]
    # BUGFIX: ("1") is a plain string, not a tuple — a one-element tuple
    # needs a trailing comma. (Worked before only because iterating the
    # string "1" also yields "1".)
    tubes = ("1",)
    mapping = None
    # NOTE(review): balance is defined but not used (balance=None is passed
    # below) — confirm whether balancing was meant to be enabled.
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        dataset,
        split_ratio=0.90,
        groups=groups,
        mapping=mapping,
        balance=None)
    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: dataset.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
    })

    # Single sigmoid output for the two-class problem.
    model = create_model(config.inputs, 1, global_decay=5e-3)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["acc"])

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    trainseq = som_dataset.SOMSequence(
        train_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(
        validate_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(
        generator=trainseq,
        validation_data=validseq,
        epochs=20,
        shuffle=True,
        class_weight=None)

    output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")
    model.save(str(output / "model.h5"))
    io_functions.save_json(config.to_json(), output / "config.json")
    io_functions.save_json(validseq.dataset.labels, output / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, output / "ids_train.json")