Beispiel #1
0
def train(data: utils.URLPath, output: utils.URLPath):
    """Train a SOM classifier from the dataset at ``data`` and store it.

    Args:
        data: Location handed to ``SOMDataset.from_path``.
        output: Destination handed to ``model.save`` and
            ``model.save_information``.
    """
    tube_ids = ("1", "2", "3")

    # Passed as ``balance`` to the dataset preparation step
    # (presumably target sample counts per group -- confirm in fc_api).
    group_balance = dict(
        CLL=4000,
        MBL=2000,
        MCL=1000,
        PL=1000,
        LPL=1000,
        MZL=1000,
        FL=1000,
        HCL=1000,
        normal=6000,
    )

    som_data = som_dataset.SOMDataset.from_path(data)
    train_part, validate_part = prepare_classifier_train_dataset(
        som_data,
        groups=GROUPS,
        mapping=None,
        balance=group_balance,
    )

    # Only the selected tubes are copied from the dataset configuration.
    tube_configs = {tube_id: som_data.config[tube_id] for tube_id in tube_ids}
    classifier_config = classifier.SOMClassifierConfig(
        tubes=tube_configs,
        groups=GROUPS,
        pad_width=2,
        mapping=None,
        cost_matrix=None,
    )

    trained = train_som_classifier(train_part, validate_part, classifier_config)
    trained.save(output)
    trained.save_information(output)
Beispiel #2
0
def main(args):
    """Train a SOM classifier from explicit train/validation label files.

    Attributes read from ``args``:
        input: Location handed to ``SOMDataset.from_path``.
        val: JSON file; its content filters the validation dataset.
        train: JSON file; its content filters the training dataset.
        output: Destination for ``model.save`` / ``model.save_information``.
        panel: Panel name selecting the list of groups.
        bal: Value assigned to every group in the ``balance`` mapping.
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val_path = args.val
    train_path = args.train
    output = args.output
    panel = args.panel
    bal = args.bal

    # set the groups according to the panel
    if panel == "MLL":
        groups = GROUPS
    elif panel == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    # The trailing comma makes this a real one-element tuple; ``("1")`` is
    # just the string "1" and would iterate per character for longer names.
    tubes = ("1",)
    mapping = None

    # Same balance value for every group.
    balance = {group: bal for group in groups}

    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube]
                      for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
            "train_epochs": 20,
        })

    val_labels = io_functions.load_json(val_path)
    validate_dataset = dataset.filter(labels=val_labels)

    train_labels = io_functions.load_json(train_path)
    train_dataset = dataset.filter(labels=train_labels)

    # The pre-filtered validation set is handed over via ``val_dataset``.
    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    model = fc_api.train_som_classifier(train_dataset, validate_dataset,
                                        config)

    model.save(output)
    model.save_information(output)
Beispiel #3
0
def run_kfold(
    *,
    output_path,
    base_model_path,
    som_dataset_path,
    k_number=5,
    panel="MLL",
    rerun=False,
    stratified=False,
):
    """Run transfer training on every split of a k-fold partition.

    Args:
        output_path: Directory-like path; ``params.json`` and one
            ``kfold_n<i>`` entry per split are placed below it.
        base_model_path: Path containing the base ``model.h5``.
        som_dataset_path: Location handed to ``SOMDataset.from_path``.
        k_number: Forwarded to ``create_kfold_split``.
        panel: Panel name selecting the list of groups.
        rerun: When false and ``output_path`` already exists, nothing is done.
        stratified: Forwarded to ``create_kfold_split``.
    """
    if not rerun and output_path.exists():
        LOGGER.info("Existing results exist at %s skipping", output_path)
        return

    # Must be captured before any other local is bound so that only the
    # call parameters are written to params.json.
    params = locals()
    io_functions.save_json(params, output_path / "params.json")

    # set the groups according to the panel
    if panel == "MLL":
        groups = GROUPS
    elif panel == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    # tubes to be processed for merged samples
    # The trailing comma makes this a real one-element tuple; ``("1")`` is
    # just the string "1" and would iterate per character for longer names.
    tubes = ("1",)

    mapping = {"groups": groups, "map": None}

    dataset = som_dataset.SOMDataset.from_path(som_dataset_path)
    LOGGER.info("Full dataset %s", dataset.group_count)

    splits = create_kfold_split(dataset, k_number=k_number, stratified=stratified)

    for n, (train_dataset, validate_dataset) in enumerate(splits):
        # Lazy %-style arguments, consistent with the other log calls here.
        LOGGER.info("SPLIT n=%s", n)
        LOGGER.info("Train dataset %s", train_dataset.group_count)
        LOGGER.info("Validation dataset %s", validate_dataset.group_count)

        # change epochs to suit each dataset
        # A fresh config is built per split so no state is shared between runs.
        options = {
            "base_model_path": str(base_model_path / "model.h5"),
            "output_path": output_path / f"kfold_n{n}",
            "config": classifier.SOMClassifierConfig(**{
                "tubes": {tube: dataset.config[tube] for tube in tubes},
                "groups": groups,
                "pad_width": 2,
                "mapping": mapping,
                "cost_matrix": None,
                "train_epochs": 15,
            })
        }
        run_transfer(options, train_dataset, validate_dataset)
# Script: train a SOM classifier on a split of SOM_DATASET using per-class
# weights derived from the training group counts, then save it to OUTPUT.
groups = GROUPS

# The trailing comma makes this a real one-element tuple; ``("1")`` is just
# the string "1" and would iterate per character for longer tube names.
tubes = ("1",)
# Alternative tube selection kept for reference: tubes = ("1", "2")

mapping = None
dataset = som_dataset.SOMDataset.from_path(SOM_DATASET)

train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
    dataset, split_ratio=0.3, groups=groups, mapping=mapping, balance=None)
labels_dict = train_dataset.group_count

config = classifier.SOMClassifierConfig(
    **{
        "tubes": {tube: dataset.config[tube]
                  for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
    })

# Weights keyed by group name, computed from the training group counts.
group_weights = create_class_weight(labels_dict)

# Re-key by the group's position in ``groups``; groups without a computed
# weight fall back to 1.0.
class_weight = {
    index: group_weights.get(group, 1.0)
    for index, group in enumerate(groups)
}
print(class_weight)
model = fc_api.train_som_classifier(train_dataset,
                                    validate_dataset,
                                    config,
                                    class_weights=class_weight)

model.save(OUTPUT)
Beispiel #5
0
def main(args):
    """Continue training a loaded model on the MCL and PL groups.

    Attributes read from ``args``:
        input: Location handed to ``SOMDataset.from_path``.
        val: JSON file; its content filters the validation dataset.
        train: JSON file; its content filters the training dataset.
        model: Argument handed to ``load_model``, which yields the
            binarizer and the model.
        output: Directory receiving ``binarizer.joblib``, ``model.h5``,
            ``config.json``, ``ids_validate.json`` and ``ids_train.json``.
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val_path = args.val
    train_path = args.train
    output = args.output

    groups = ["MCL", "PL"]
    # The trailing comma makes this a real one-element tuple; ``("1")`` is
    # just the string "1" and would iterate per character for longer names.
    tubes = ("1",)
    mapping = None
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube]
                      for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
        })

    val_labels = io_functions.load_json(val_path)
    validate_dataset = dataset.filter(labels=val_labels)

    train_labels = io_functions.load_json(train_path)
    train_dataset = dataset.filter(labels=train_labels)

    # The pre-filtered validation set is handed over via ``val_dataset``.
    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    binarizer, model = load_model(args.model)

    trainseq = som_dataset.SOMSequence(train_dataset,
                                       binarizer,
                                       tube=config.tubes,
                                       pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(validate_dataset,
                                       binarizer,
                                       tube=config.tubes,
                                       pad_width=config.pad_width)

    model.fit_generator(generator=trainseq,
                        epochs=10,
                        validation_data=validseq)

    # Persist everything under the single output directory.
    output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")
    model.save(str(output / "model.h5"))

    io_functions.save_json(config.to_json(), output / "config.json")
    io_functions.save_json(validseq.dataset.labels,
                           output / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, output / "ids_train.json")
Beispiel #6
0
def main(args):
    """Train a new binary MCL-vs-PL model on a split of the dataset.

    Attributes read from ``args``:
        input: Location handed to ``SOMDataset.from_path``.
        output: Directory receiving ``binarizer.joblib``, ``model.h5``,
            ``config.json``, ``ids_validate.json`` and ``ids_train.json``.
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    output = args.output

    # Called for its side effect only; the returned logger is not used here.
    utils.logs.setup_logging(None, "classify")

    groups = ["MCL", "PL"]
    # The trailing comma makes this a real one-element tuple; ``("1")`` is
    # just the string "1" and would iterate per character for longer names.
    tubes = ("1",)
    mapping = None

    # No balancing and no externally supplied validation set: the split is
    # left entirely to prepare_classifier_train_dataset.
    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        dataset,
        split_ratio=0.90,
        groups=groups,
        mapping=mapping,
        balance=None)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube] for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
        })

    # Single output unit, matched with the binary cross-entropy loss below.
    model = create_model(config.inputs, 1, global_decay=5e-3)

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[
            "acc",
        ]
    )

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    trainseq = som_dataset.SOMSequence(
        train_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(
        validate_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(generator=trainseq, validation_data=validseq,
                        epochs=20, shuffle=True, class_weight=None)

    # Persist everything under the single output directory.
    output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")
    model.save(str(output / "model.h5"))

    io_functions.save_json(config.to_json(), output / "config.json")
    io_functions.save_json(validseq.dataset.labels, output / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, output / "ids_train.json")
Beispiel #7
0
def main(args):
    """Train a SOM classifier initialised with the weights of a base model.

    Attributes read from ``args``:
        input: Location handed to ``SOMDataset.from_path``.
        val: JSON file; its content filters the validation dataset.
        train: JSON file; its content filters the training dataset.
        output: Destination for ``model.save`` / ``model.save_information``.
        panel: Panel name selecting the list of groups.
        basemodel: Path containing the base ``model.h5``.
        bal: Value assigned to every group in the ``balance`` mapping.
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val_path = args.val
    train_path = args.train
    output = args.output
    panel = args.panel
    basemodel = args.basemodel
    bal = args.bal

    # set the groups according to the panel
    if panel == "MLL":
        groups = GROUPS
    elif panel == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    # The trailing comma makes this a real one-element tuple; ``("1")`` is
    # just the string "1" and would iterate per character for longer names.
    tubes = ("1",)
    mapping = None

    # Same balance value for every group.
    balance = {group: bal for group in groups}

    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube]
                      for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
            "train_epochs": 15,
        })

    val_labels = io_functions.load_json(val_path)
    validate_dataset = dataset.filter(labels=val_labels)

    train_labels = io_functions.load_json(train_path)
    train_dataset = dataset.filter(labels=train_labels)

    # The pre-filtered validation set is handed over via ``val_dataset``.
    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    # load base model and get weights
    base_model = models.load_model(str(basemodel / "model.h5"))
    weights = base_model.get_weights()

    # create model
    keras_model = create_model(config.inputs, config.output)

    keras_model.set_weights(weights)

    # freeze 2 dense layers: check for each dataset
    keras_model.get_layer('dense_1').trainable = False
    keras_model.get_layer('dense_2').trainable = False

    keras_model.compile(loss=config.get_loss(modeldir=None),
                        optimizer="adam",
                        metrics=["accuracy"])

    # Wrap the compiled model together with its config in a SOMClassifier.
    model = SOMClassifier(config, keras_model)

    train_sequence = model.create_sequence(train_dataset,
                                           config.train_batch_size)

    if validate_dataset is not None:
        validate_sequence = model.create_sequence(validate_dataset,
                                                  config.valid_batch_size)
    else:
        validate_sequence = None

    model.train_generator(train_sequence,
                          validate_sequence,
                          epochs=config.train_epochs,
                          class_weight=None)

    model.save(output)
    model.save_information(output)