def main(options):

    expdir = Path(options["expdir"])
    recipe = Path(options["recipe"])
    backend = options["backend"]
    cuda = options["cuda"]
    njobs = options["njobs"]
    overwrite = options["overwrite"]
    resume = options["resume"]

    if resume:
        logger.warning("Resume cross validation")
        overwrite = False
    if expdir.exists() and not overwrite and not resume:
        raise FileExistsError(expdir)
    elif expdir.exists() and overwrite:
        logger.warning(f"Remove expdir {expdir}")
        shutil.rmtree(expdir)

    queue = prepare_cross_validation(expdir, recipe)
    run_cross_validation(expdir,
                         queue,
                         backend=backend,
                         njobs=njobs,
                         cuda=cuda)
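A minimal usage sketch for this entry point, assuming it receives a plain dict of CLI options; the key names mirror the lookups in the function above, while the paths and values are only illustrative.

options = {
    "expdir": "exp/cross_validation",   # hypothetical experiment directory
    "recipe": "recipes/my_recipe",      # hypothetical recipe directory
    "backend": "local",                 # or "condor"
    "cuda": False,
    "njobs": 1,
    "overwrite": False,
    "resume": True,
}
main(options)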
Example #2
def train(ctx, expdir, recipe, resume, no_eval):

    from assist.scripts import prepare_train, run_train


    if ctx.obj["n_jobs"] > 1:
        raise click.BadParameter("For more than one job, use `train_many`")

    logger = ctx.obj["logger"]
    expdir, recipe = map(Path, (expdir, recipe))

    if resume and ctx.obj["overwrite"]:
        logger.info("Setting overwrite to False")
        ctx.obj["overwrite"] = False

    if expdir.exists() and ctx.obj["overwrite"]:
        logger.warning(f"{expdir} and its contents will be deleted")
        shutil.rmtree(expdir)
    elif expdir.exists() and not resume:
        raise click.BadOptionUsage("expdir", f"{expdir} exists and neither the overwrite nor the resume flag is set")

    if not expdir.exists():
        prepare_train(expdir, recipe)

    run_train(
        expdir,
        backend=ctx.obj["backend"],
        cuda=ctx.obj["cuda"],
        njobs=ctx.obj["n_jobs"],
        do_eval=not no_eval
    )
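A sketch of how a click group might populate ctx.obj with the keys this command reads (backend, cuda, n_jobs, overwrite, logger); option names and defaults here are assumptions, not the project's actual CLI.

import logging
import click

@click.group()
@click.option("--backend", default="local")
@click.option("--cuda/--no-cuda", default=False)
@click.option("--n-jobs", default=1, type=int)
@click.option("--overwrite", is_flag=True)
@click.pass_context
def cli(ctx, backend, cuda, n_jobs, overwrite):
    # Shared state read by the subcommands through ctx.obj
    ctx.ensure_object(dict)
    ctx.obj.update(
        backend=backend,
        cuda=cuda,
        n_jobs=n_jobs,
        overwrite=overwrite,
        logger=logging.getLogger("assist"),
    )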
Example #3
def main(expdir, cuda):

    expdir = Path(expdir)

    #check if this experiment has been completed
    if (expdir / "model").exists():
        logger.warning(f"Found trained model in {expdir}.")
        return

    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / 'structure.xml')
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)

    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)

    trainconf = dict(
        read_config(
            expdir / "train.cfg",
            default=Path(__file__).parent / "defaults/train.cfg",
        ).items("train"))

    with open(expdir / "trainfeats") as trainfeats:
        features = {
            uttid: np.load(featsfile)
            for uttid, featsfile in map(parse_line, trainfeats.readlines())
        }

    with open(expdir / "traintasks") as traintasks:
        taskstrings = {
            uttid: task
            for uttid, task in map(parse_line, traintasks.readlines())
        }

    examples = {
        utt: (features[utt], taskstrings[utt])
        for utt in taskstrings if utt in features
    }
    model.train(examples)
    model.save(expdir / 'model')
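The trainfeats/traintasks files are read line by line through parse_line; a minimal sketch of that helper, under the assumption that each line is "<uttid> <value>" split on the first whitespace:

def parse_line(line):
    # "<uttid> <value>" -> ("<uttid>", "<value>"); the value is a .npy path
    # in *feats files and a task string in *tasks files.
    key, value = line.strip().split(maxsplit=1)
    return key, value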
Example #4
def prepare_database(ctx, expdir, recipe):

    from assist.scripts import run_prepare_database
    from assist.tools import logger

    expdir, recipe = map(Path, (expdir, recipe))

    if expdir.exists() and ctx.obj["overwrite"]:
        logger.warning(f"Deleting {expdir}")
        shutil.rmtree(expdir)
    elif expdir.exists():
        raise ValueError(f"{expdir} exists and flag overwrite not set")

    run_prepare_database(
        expdir,
        recipe,
        backend=ctx.obj["backend"],
        njobs=ctx.obj["n_jobs"],
        overwrite=ctx.obj["overwrite"]
    )
Example #5
def gridsearch(ctx, expdir, recipe, no_data_prep, resume, learning_curve):

    from assist.scripts.gridsearch import prepare_gridsearch, gs_learning_curve

    logger = ctx.obj["logger"]

    expdir, recipe = map(Path, (expdir, recipe))

    if resume and ctx.obj["overwrite"]:
        logger.warning("Setting overwrite flag to False to remain consistent")
        ctx.obj["overwrite"] = False

    if expdir.exists() and ctx.obj["overwrite"]:
        logger.warning(f"{expdir} and its contents will be deleted")
        shutil.rmtree(expdir)
    elif expdir.exists() and not resume:
        raise ValueError(f"{expdir} exists and overwrite/resume/no-data-prep flags not set")

    if not expdir.exists():
        prepare_gridsearch(expdir, recipe)

    # The plain grid search branch is commented out, so the learning_curve
    # flag currently has no effect; only the learning-curve search runs.
    gs_func = gs_learning_curve
    gs_func(expdir, recipe, cuda=ctx.obj["cuda"], n_jobs=ctx.obj["n_jobs"])
Example #6
def run(expdir, backend, cuda):
    if backend == 'condor':
        os.makedirs(expdir / "outputs", exist_ok=True)
        jobfile = "assist/condor/run_script{}.job".format(
            "_GPU" if cuda else "")
        in_queue = os.popen('if condor_q -nobatch -wide | grep -q %s; '
                            'then echo true; else echo false; fi' %
                            expdir).read().strip() == 'true'

        if not in_queue:
            condor_submit = f"condor_submit expdir={expdir} script=train {jobfile}"
            logger.warning(condor_submit)
            logger.warning(
                subprocess.check_output(condor_submit.split()).decode("utf-8"))
    else:
        logger.warning("Local training started")
        train.main(expdir, cuda)
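The condor queue check above shells out to an if/grep pipeline; an equivalent check can be done directly with subprocess (a sketch, assuming condor_q is on PATH):

import subprocess

def queued_in_condor(expdir):
    # True when the experiment directory already appears in the condor queue
    result = subprocess.run(
        ["condor_q", "-nobatch", "-wide"],
        capture_output=True, text=True, check=False)
    return str(expdir) in result.stdout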
Example #7
def cross_validation(ctx, expdir, recipe, resume=False, retrain=False, clean=False):

    from assist.scripts import prepare_cross_validation, run_cross_validation

    logger = ctx.obj["logger"]

    expdir, recipe = map(Path, (expdir, recipe))

    if (resume or retrain) and ctx.obj["overwrite"]:
        logger.warning("Setting overwrite flag to False to remain consistent")
        ctx.obj["overwrite"] = False

    if expdir.exists() and ctx.obj["overwrite"]:
        logger.warning(f"{expdir} and its contents will be deleted")
        shutil.rmtree(expdir)
    elif expdir.exists() and not (resume or retrain):
        raise click.BadParameter(
            f"{expdir} exists and overwrite/resume/retrain flags not set")

    if not expdir.exists():
        prepare_cross_validation(expdir, recipe)

    queue = list(expdir.rglob("*blocks_exp*"))

    if resume:
        queue = [exp for exp in queue if not (exp / "model").exists()]

    if not queue:
        logger.error("Empty queue. Was resume flag set? Is the filesystem ok?")
        return

    logger.warning(f"{len(queue)} experiments in the queue.")

    run_cross_validation(
        expdir,
        queue,
        backend=ctx.obj["backend"],
        cuda=ctx.obj["cuda"],
        njobs=ctx.obj["n_jobs"],
        clean=clean
    )
Example #8
def gs_learning_curve(expdir, recipe, cuda=True, n_jobs=1):
    logger.info(f"GridSearch {expdir}")

    with open(recipe / "param_grid.json") as jsonfile:
        param_grid = json.load(jsonfile)

    logger.debug(str(param_grid))
    total_params = np.prod(list(map(len, param_grid.values())))
    logger.warning(
        f"Searching {len(param_grid)} parameters, totalling {total_params} possible values."
    )

    gsconf = read_config(expdir / "gridsearch.cfg")
    default_config = dict(gsconf["acquisition"].items())
    default_config["device"] = "cuda" if cuda else "cpu"
    gsconf = dict(gsconf["gridsearch"].items())
    logger.debug(" ".join(f"{k}={v}" for k, v in gsconf.items()))
    train_sizes = np.linspace(float(gsconf["nmin"]), float(gsconf["nmax"]),
                              int(gsconf["num_trains"]))
    gs_params = {
        "train_sizes": train_sizes,
        "cv": int(gsconf["cv_splits"]),
        "scoring": (
            make_scorer(accuracy)
            if gsconf["scoring"] == "accuracy"
            else gsconf["scoring"]
        ),
        "n_jobs": n_jobs,
    }
    logger.debug(gs_params)

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / 'structure.xml')
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    default_config["output_dim"] = coder.numlabels

    features = FeatLoader(expdir / "gridsearchfeats").to_dict()
    with open(expdir / "gridsearchtasks") as traintasks:
        taskstrings = {
            uttid: task
            for uttid, task in map(parse_line, traintasks.readlines())
        }

    indices = sorted(set(features).intersection(set(taskstrings)))
    X = [features[idx] for idx in indices]
    y = [coder.encode(read_task(taskstrings[idx])) for idx in indices]

    gs_results = defaultdict(list)
    start = time()
    best_params, best_score, best_index = None, 0, 0
    for i, param_values in enumerate(product(*param_grid.values())):

        t0 = time()
        params = dict(zip(param_grid.keys(), param_values))
        config = deepcopy(default_config)
        config.update(params)
        logger.debug(config)

        model = RNNClassifier(**config)

        train_sizes, train_scores, valid_scores = learning_curve(
            model, X, y, **gs_params)

        train_score = auc(train_sizes, train_scores.mean(-1))
        test_score = auc(train_sizes, valid_scores.mean(-1))
        t1 = time()
        logger.info(
            f"model {i+1}/{total_params}: train={train_score:.3%} test={test_score:.3%} "
            f"time={t1 - t0:.1f}s elapsed={t1-start:.1f}s {params}")
        gs_results["auc_test_score"].append(test_score)
        gs_results["auc_train_score"].append(train_score)
        gs_results["params"].append(params)
        gs_results["train_sizes"].append(train_sizes)
        gs_results["train_scores"].append(train_scores)
        gs_results["test_scores"].append(valid_scores)

        if test_score > best_score:
            best_params, best_score, best_index = params, test_score, i

    logger.warning(
        f"Search completed in {time() - start:.2f}s. Best model: {best_params} ({best_score:.2%})"
    )
    logger.warning(
        f"Test scores: {gs_results['test_scores'][best_index].mean(-1)}")

    with open(expdir / "gs_results.json", "w") as result_file:
        json.dump(
            {
                "best_params": best_params,
                "best_score": best_score,
                "cv_results": serialise(gs_results)
            }, result_file)
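For reference, the shape assumed for recipe/param_grid.json: each key names a model keyword argument and each value lists the candidates to search; the parameter names below are illustrative only.

import json
from itertools import product

example_grid = {
    "hidden_dim": [64, 128, 256],     # hypothetical RNNClassifier parameters
    "learning_rate": [1e-3, 1e-4],
}
print(json.dumps(example_grid, indent=2))
# gs_learning_curve iterates product(*grid.values()):
n_configs = sum(1 for _ in product(*example_grid.values()))  # 6 for this toy grid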
Example #9
def evaluate(expdir, cuda=False, clean=False):
    logger.info(f"Evaluate {expdir}")

    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)

    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)
    model.display(logger.info)
    model.load(expdir / 'model')
    model.display(logger.info)

    features = FeatLoader(expdir / "testfeats").to_dict()

    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(parse_line, testtasks.readlines())
            if key in features
        }

    assert len(features) == len(references), set(features) - set(references)

    to_remove = []
    for uttid, feat in features.items():
        if not np.isfinite(feat).all():
            to_remove.append(uttid)

    if to_remove:
        logger.warning(f"Found {len(to_remove)} utterances with nan.")
        for uttid in to_remove:
            del features[uttid]
            del references[uttid]

    decoded = model.decode(features)

    assert not (set(decoded) - set(references))
    y_true = np.array([coder.encode(task) for task in references.values()])
    y_pred = np.array([coder.encode(task) for task in decoded.values()])

    TP = (y_pred == 1) & (y_true == 1)
    TN = (y_pred == 0) & (y_true == 0)
    FP = (y_pred == 1) & (y_true == 0)
    FN = (y_pred == 0) & (y_true == 1)
    error_rate = 1 - (TP | TN).all(-1).mean()
    precision = TP.sum() / (TP | FP).sum()
    recall = TP.sum() / (TP | FN).sum()
    f1_score = 2 * precision * recall / (precision + recall)
    logger.info(f"TPR={TP.sum()} TN={TN.sum()} FP={FP.sum()} FN={FN.sum()}")
    logger.info(
        f"P={precision:.2%} R={recall:.2%} F={f1_score:.2%} E={error_rate:.2%}"
    )

    for line in classification_report(y_true, y_pred).split("\n"):
        logger.info(line)

    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            [f"{name} {to_string(task)}\n" for name, task in decoded.items()])

    metrics, scores = score(decoded, references)
    for metric_name, metric in metrics.items():
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))

    write_scores(scores, expdir)

    if clean:
        logger.info(f"Remove {expdir}/model")
        os.remove(expdir / 'model')
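A toy illustration (not part of the script) of the bitwise metric arithmetic used above, with y_true/y_pred as binary (n_examples, n_labels) arrays; an utterance only counts as correct when every label matches, hence the .all(-1).

import numpy as np

y_true = np.array([[1, 0, 1],
                   [0, 1, 0]])
y_pred = np.array([[1, 0, 0],
                   [0, 1, 0]])
TP = (y_pred == 1) & (y_true == 1)
FP = (y_pred == 1) & (y_true == 0)
FN = (y_pred == 0) & (y_true == 1)
TN = (y_pred == 0) & (y_true == 0)
error_rate = 1 - (TP | TN).all(-1).mean()   # 0.5: the first utterance misses one label
precision = TP.sum() / (TP | FP).sum()      # 2 / 2 = 1.0
recall = TP.sum() / (TP | FN).sum()         # 2 / 3 ~= 0.67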
Example #10
def prepare_filesystem(expdir, speaker, coder, dataconf, expconf):

    speaker_dir = expdir / speaker
    os.makedirs(speaker_dir, exist_ok=True)

    feature_file = Path(dataconf.get(speaker, 'features'))
    with open(feature_file.with_suffix(".scp")) as featfile:
        features = dict(map(parse_line, featfile.readlines()))

    with open(dataconf.get(speaker, "tasks")) as taskfile:
        task_strings = {
            f"{speaker}_{uttid}": task
            for uttid, task in map(parse_line, taskfile.readlines())
        }
        for uttid in list(task_strings):
            if uttid not in features:
                logger.warning(f"Missing utterance speaker {speaker}: {uttid}")
                del task_strings[uttid]

        tasks = [
            coder.encode(read_task(task)) for task in task_strings.values()
        ]

    if not tasks:
        logger.error(f"Error with speaker {speaker}: no tasks")
        return []

    tasks = np.array(tasks)
    blocks_path = speaker_dir / "blocks.pkl"
    if blocks_path.exists():
        with open(blocks_path, "rb") as blockfile:
            blocks = pickle.load(blockfile)
    else:
        try:
            blocks = make_blocks(tasks, expconf, feature_file.parent)
        except Exception as err:
            logger.error(f"Error with speaker {speaker}: {err}")
            return []
        with open(blocks_path, "wb") as blockfile:
            pickle.dump(blocks, blockfile)

    num_exp = int(expconf["numexp"])

    train_ids, test_ids = [], []
    for block_id in range(len(blocks) - 1):
        train_ids.append([])
        test_ids.append([])
        for exp_id in range(num_exp):
            train_ids[-1].append(
                list(
                    itertools.chain.from_iterable(
                        random.sample(blocks, block_id + 1))))
            test_ids[-1].append(
                [i for i in range(len(tasks)) if i not in train_ids[-1][-1]])

    if not (train_ids and test_ids):
        logger.error(f"Error with speaker {speaker}: no utterances")
        return []

    uttids = list(task_strings)
    block_id = int(expconf['startblocks']) - 1
    while True:

        dirname = f"{block_id + 1}blocks_exp"
        num_exp = int(expconf['numexp'])
        for exp_id in range(num_exp):

            subexpdir = expdir / speaker / (dirname + str(exp_id))
            logger.debug(f"Experiment {subexpdir.name}")

            if (subexpdir / "f1").exists():
                logger.info(f"Skipping {subexpdir}")
                continue

            os.makedirs(subexpdir, exist_ok=True)

            for filename in ("acquisition.cfg", "coder.cfg", "structure.xml"):
                symlink(f"../../{filename}",
                        subexpdir / filename,
                        relative=True)

            if not (subexpdir / "trainfeats").exists():
                for subset, ids in [("train", train_ids), ("test", test_ids)]:
                    utts = [
                        uttids[idx] for idx in ids[block_id][exp_id]
                        if idx < len(uttids)
                    ]
                    if len(utts) != len(ids[block_id][exp_id]):
                        num_lost = len(ids[block_id][exp_id]) - len(utts)
                        logger.warning(f"Lost {num_lost} {subset} utterances")
                    logger.debug(f"Number of {subset} examples: {len(utts):,}")

                    writefile(subexpdir / f"{subset}feats",
                              {utt: features[utt]
                               for utt in utts})
                    writefile(subexpdir / f"{subset}tasks",
                              {utt: task_strings[utt]
                               for utt in utts})

        next_block_id = (block_id + 1) * int(expconf['scale']) + int(
            expconf['increment']) - 1
        next_block_id = min(next_block_id, len(blocks) - 2)
        if block_id == next_block_id:
            break
        else:
            block_id = next_block_id
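writefile is not shown in these snippets; from its usage it appears to be the inverse of parse_line, writing one "<uttid> <value>" pair per line. A minimal sketch under that assumption:

def writefile(path, mapping):
    # Write {uttid: value} as "<uttid> <value>" lines (inverse of parse_line)
    with open(path, "w") as f:
        for key, value in mapping.items():
            f.write(f"{key} {value}\n")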
Example #11
def main(expdir, cuda):

    expdir = Path(expdir)
    if (expdir / "f1").exists():
        logger.info(f"Results found at {expdir}")
        return

    logger.info(f"Evaluate {expdir}")

    acquisitionconf = tools.read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = tools.read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")

    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)
    logger.debug(f"Loading model at {expdir}/model")
    model.load(expdir / 'model')

    with open(expdir / "testfeats") as testfeats:
        features = {
            uttid: np.load(featsfile)
            for uttid, featsfile in map(tools.parse_line, testfeats.readlines())
        }

    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(tools.parse_line, testtasks.readlines())
            if key in features
        }

    assert len(features) == len(references)

    #decode the test utterances
    feats = deepcopy(features)
    errors, nans, too_small = 0, 0, 0
    for uttid, feat in feats.items():
        remove = False
        # if feat.shape[0] < 5:
        #     too_small += 1
        #     remove = True
        if not np.isfinite(feat).all():
            nans += 1
            remove = True
        if remove:
            logger.debug(f"Removing {uttid}")
            errors += 1
            del features[uttid]
            del references[uttid]

    if errors > 0:
        logger.warning(
            f"{errors}/{len(feats)} utts removed ({too_small} too small and {nans} contained NaN)"
        )

    decoded = model.decode(features)

    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            [f"{name}  {to_string(task)}\n" for name, task in decoded.items()])

    metric_names = [
        "precision", "recal", "f1", "macro precision", "macro recal",
        "macro f1"
    ]
    metrics, scores = score(decoded, references)

    for metric_name, metric in zip(metric_names, metrics):
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))

    write_scores(scores, expdir)
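The per-metric files written above each hold a single number, under names derived from metric_name.replace(" ", "") (e.g. precision, f1, macrof1); a small sketch for reading them back:

from pathlib import Path

def load_metrics(expdir, names=("precision", "recal", "f1",
                                "macro precision", "macro recal", "macro f1")):
    # Returns {metric name: value} for whichever metric files exist in expdir
    expdir = Path(expdir)
    return {
        name: float((expdir / name.replace(" ", "")).read_text())
        for name in names
        if (expdir / name.replace(" ", "")).exists()
    }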