def main(options):
    expdir = Path(options["expdir"])
    recipe = Path(options["recipe"])
    backend = options["backend"]
    cuda = options["cuda"]
    njobs = options["njobs"]
    overwrite = options["overwrite"]
    resume = options["resume"]

    if resume:
        logger.warning("Resume cross validation")
        overwrite = False

    if expdir.exists() and not overwrite and not resume:
        raise FileExistsError(expdir)
    elif expdir.exists() and overwrite:
        logger.warning(f"Remove expdir {expdir}")
        shutil.rmtree(expdir)

    queue = prepare_cross_validation(expdir, recipe)
    run_cross_validation(expdir, queue, backend=backend, njobs=njobs, cuda=cuda)

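# Illustrative call of main() above: the dictionary keys match the ones the function
# reads, but the values here are placeholders, not taken from any real recipe.
#
# main({
#     "expdir": "exp/my_cross_validation",
#     "recipe": "path/to/recipe",
#     "backend": "local",
#     "cuda": False,
#     "njobs": 1,
#     "overwrite": False,
#     "resume": False,
# })
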
def train(ctx, expdir, recipe, resume, no_eval):
    from assist.scripts import prepare_train, run_train

    if ctx.obj["n_jobs"] > 1:
        raise click.BadParameter("For more than one job, use `train_many`")

    logger = ctx.obj["logger"]
    expdir, recipe = map(Path, (expdir, recipe))

    if resume and ctx.obj["overwrite"]:
        logger.info("Setting overwrite to False")
        ctx.obj["overwrite"] = False

    if expdir.exists() and ctx.obj["overwrite"]:
        logger.warning(f"{expdir} and its contents will be deleted")
        shutil.rmtree(expdir)
    elif expdir.exists() and not resume:
        raise click.BadOptionUsage(
            "expdir",
            f"{expdir} exists and neither the overwrite nor the resume flag is set")

    if not expdir.exists():
        prepare_train(expdir, recipe)

    run_train(
        expdir,
        backend=ctx.obj["backend"],
        cuda=ctx.obj["cuda"],
        njobs=ctx.obj["n_jobs"],
        do_eval=not no_eval
    )

def main(expdir, cuda):
    expdir = Path(expdir)

    # Check whether this experiment has already been completed
    if (expdir / "model").exists():
        logger.warning(f"Found trained model in {expdir}.")
        return

    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(os.path.join(expdir, 'structure.xml'))
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)
    trainconf = dict(
        read_config(
            expdir / "train.cfg",
            default=Path(__file__).parent / "defaults/train.cfg"
        ).items("train"))

    # Load the training features and task strings listed in the experiment directory
    with open(expdir / "trainfeats") as trainfeats:
        features = {
            uttid: np.load(featsfile)
            for uttid, featsfile in map(parse_line, trainfeats.readlines())
        }

    with open(expdir / "traintasks") as traintasks:
        taskstrings = {
            uttid: task
            for uttid, task in map(parse_line, traintasks.readlines())
        }

    # Keep only utterances that have both features and a task string
    examples = {
        utt: (features[utt], taskstrings[utt])
        for utt in taskstrings if utt in features
    }

    model.train(examples)
    model.save(expdir / 'model')

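# `parse_line` comes from assist.tools and is not shown in this section. Based on how
# it is used above (turning "<uttid> <value>" lines into key/value pairs), a minimal
# hypothetical sketch could look like this; the real helper may behave differently:
def example_parse_line(line):
    """Split an '<uttid> <value>' line on the first whitespace."""
    key, value = line.strip().split(maxsplit=1)
    return key, value
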
def prepare_database(ctx, expdir, recipe):
    from assist.scripts import run_prepare_database
    from assist.tools import logger

    expdir, recipe = map(Path, (expdir, recipe))

    if expdir.exists() and ctx.obj["overwrite"]:
        logger.warning(f"Deleting {expdir}")
        shutil.rmtree(expdir)
    elif expdir.exists():
        raise ValueError(f"{expdir} exists and the overwrite flag is not set")

    run_prepare_database(
        expdir,
        recipe,
        backend=ctx.obj["backend"],
        njobs=ctx.obj["n_jobs"],
        overwrite=ctx.obj["overwrite"]
    )

def gridsearch(ctx, expdir, recipe, no_data_prep, resume, learning_curve):
    from assist.scripts.gridsearch import prepare_gridsearch, gs_learning_curve

    logger = ctx.obj["logger"]
    expdir, recipe = map(Path, (expdir, recipe))

    if resume and ctx.obj["overwrite"]:
        logger.warning("Setting overwrite flag to False to remain consistent")
        ctx.obj["overwrite"] = False

    if expdir.exists() and ctx.obj["overwrite"]:
        logger.warning(f"{expdir} and its contents will be deleted")
        shutil.rmtree(expdir)
    elif expdir.exists() and not resume:
        raise ValueError(
            f"{expdir} exists and the overwrite/resume/no-data-prep flags are not set")

    if not expdir.exists():
        prepare_gridsearch(expdir, recipe)

    gs_func = gs_learning_curve  # if learning_curve else gridsearch
    gs_func(expdir, recipe, cuda=ctx.obj["cuda"], n_jobs=ctx.obj["n_jobs"])

def run(expdir, backend, cuda):
    if backend == 'condor':
        os.makedirs(expdir / "outputs", exist_ok=True)
        jobfile = "assist/condor/run_script{}.job".format("_GPU" if cuda else "")

        # Check whether a job for this expdir is already in the condor queue
        in_queue = os.popen(
            'if condor_q -nobatch -wide | grep -q %s; '
            'then echo true; else echo false; fi' % expdir
        ).read().strip() == 'true'

        if not in_queue:
            condor_submit = f"condor_submit expdir={expdir} script=train {jobfile}"
            logger.warning(condor_submit)
            logger.warning(
                subprocess.check_output(condor_submit.split()).decode("utf-8"))
    else:
        logger.warning("Local training started")
        train.main(expdir, cuda)

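# The in_queue check above shells out via os.popen and grep. For clarity, a
# hypothetical equivalent using subprocess (not how assist implements it):
def example_job_in_condor_queue(expdir):
    """Return True if `expdir` already appears in the condor_q listing."""
    import subprocess
    result = subprocess.run(
        ["condor_q", "-nobatch", "-wide"],
        capture_output=True, text=True, check=False)
    return str(expdir) in result.stdout
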
def cross_validation(ctx, expdir, recipe, resume=False, retrain=False, clean=False):
    from assist.scripts import prepare_cross_validation, run_cross_validation

    logger = ctx.obj["logger"]
    expdir, recipe = map(Path, (expdir, recipe))

    if (resume or retrain) and ctx.obj["overwrite"]:
        logger.warning("Setting overwrite flag to False to remain consistent")
        ctx.obj["overwrite"] = False

    if expdir.exists() and ctx.obj["overwrite"]:
        logger.warning(f"{expdir} and its contents will be deleted")
        shutil.rmtree(expdir)
    elif expdir.exists() and not (resume or retrain):
        raise click.BadParameter(
            f"{expdir} exists and the overwrite/resume/retrain flags are not set")

    if not expdir.exists():
        prepare_cross_validation(expdir, recipe)

    queue = list(expdir.rglob("*blocks_exp*"))
    if resume:
        # Only keep experiments that have not been trained yet
        queue = list(filter(lambda exp: not (exp / "model").exists(), queue))

    if not queue:
        logger.error("Empty queue. Was the resume flag set? Is the filesystem ok?")
        return

    logger.warning(f"{len(queue)} experiments in the queue.")
    run_cross_validation(
        expdir,
        queue,
        backend=ctx.obj["backend"],
        cuda=ctx.obj["cuda"],
        njobs=ctx.obj["n_jobs"],
        clean=clean
    )

def gs_learning_curve(expdir, recipe, cuda=True, n_jobs=1):
    logger.info(f"GridSearch {expdir}")

    with open(recipe / "param_grid.json") as jsonfile:
        param_grid = json.load(jsonfile)

    logger.debug(str(param_grid))
    total_params = np.prod(list(map(len, param_grid.values())))
    logger.warning(
        f"Searching {len(param_grid)} parameters, totalling {total_params} possible combinations."
    )

    gsconf = read_config(expdir / "gridsearch.cfg")
    default_config = dict(gsconf["acquisition"].items())
    default_config["device"] = "cuda" if cuda else "cpu"
    gsconf = dict(gsconf["gridsearch"].items())
    logger.debug(" ".join(f"{k}={v}" for k, v in gsconf.items()))

    train_sizes = np.linspace(
        float(gsconf["nmin"]), float(gsconf["nmax"]), int(gsconf["num_trains"]))
    gs_params = {
        "train_sizes": train_sizes,
        "cv": int(gsconf["cv_splits"]),
        "scoring": make_scorer(accuracy) if gsconf["scoring"] == "accuracy" else gsconf["scoring"],
        "n_jobs": n_jobs
    }
    logger.debug(gs_params)

    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / 'structure.xml')
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    default_config["output_dim"] = coder.numlabels

    # Build the dataset from utterances that have both features and a task string
    features = FeatLoader(expdir / "gridsearchfeats").to_dict()
    with open(expdir / "gridsearchtasks") as traintasks:
        taskstrings = {
            uttid: task
            for uttid, task in map(parse_line, traintasks.readlines())
        }

    indices = sorted(set(features).intersection(set(taskstrings)))
    X = list(map(features.__getitem__, indices))
    y = list(map(coder.encode, map(read_task, map(taskstrings.__getitem__, indices))))

    gs_results = defaultdict(list)
    start = time()
    best_score = 0
    for i, param_values in enumerate(product(*param_grid.values())):
        t0 = time()
        params = dict(zip(param_grid.keys(), param_values))
        config = deepcopy(default_config)
        config.update(params)
        logger.debug(config)

        model = RNNClassifier(**config)
        train_sizes, train_scores, valid_scores = learning_curve(model, X, y, **gs_params)

        # Summarise each learning curve by its area under the curve
        train_score = auc(train_sizes, train_scores.mean(-1))
        test_score = auc(train_sizes, valid_scores.mean(-1))
        t1 = time()
        logger.info(
            f"model {i+1}/{total_params}: train={train_score:.3%} test={test_score:.3%} "
            f"time={t1 - t0:.1f}s elapsed={t1 - start:.1f}s {params}")

        gs_results["auc_test_score"].append(test_score)
        gs_results["auc_train_score"].append(train_score)
        gs_results["params"].append(params)
        gs_results["train_sizes"].append(train_sizes)
        gs_results["train_scores"].append(train_scores)
        gs_results["test_scores"].append(valid_scores)

        if test_score > best_score:
            best_params, best_score, best_index = params, test_score, i

    logger.warning(
        f"Search completed in {time() - start:.2f}s. Best model: {best_params} ({best_score:.2%})"
    )
    logger.warning(
        f"Test scores: {gs_results['test_scores'][best_index].mean(-1)}")

    with open(expdir / "gs_results.json", "w") as result_file:
        json.dump(
            {
                "best_params": best_params,
                "best_score": best_score,
                "cv_results": serialise(gs_results)
            },
            result_file)

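# `serialise` is not defined in this section. Since gs_results stores numpy arrays
# (train_sizes, train_scores, test_scores) that json.dump cannot encode, it
# presumably converts them to plain lists. A hypothetical sketch, assuming that is
# all it needs to do:
def example_serialise(results):
    """Convert numpy arrays inside a dict of result lists to JSON-serialisable lists."""
    import numpy as np
    return {
        key: [value.tolist() if isinstance(value, np.ndarray) else value
              for value in values]
        for key, values in results.items()
    }
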
def evaluate(expdir, cuda=False, clean=False):
    logger.info(f"Evaluate {expdir}")

    acquisitionconf = read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)
    model.display(logger.info)
    model.load(expdir / 'model')
    model.display(logger.info)

    features = FeatLoader(expdir / "testfeats").to_dict()
    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(parse_line, testtasks.readlines())
            if key in features
        }

    assert len(features) == len(references), set(features) - set(references)

    # Drop utterances whose features contain NaN or infinite values
    to_remove = []
    for uttid, feat in features.items():
        if not np.isfinite(feat).all():
            to_remove.append(uttid)

    if to_remove:
        logger.warning(f"Found {len(to_remove)} utterances with nan.")
        for uttid in to_remove:
            del features[uttid]
            del references[uttid]

    decoded = model.decode(features)
    assert not (set(decoded) - set(references))

    # Micro-averaged metrics over all binary label positions
    y_true = np.array([coder.encode(task) for task in references.values()])
    y_pred = np.array([coder.encode(task) for task in decoded.values()])
    TP = (y_pred == 1) & (y_true == 1)
    TN = (y_pred == 0) & (y_true == 0)
    FP = (y_pred == 1) & (y_true == 0)
    FN = (y_pred == 0) & (y_true == 1)
    error_rate = 1 - (TP | TN).all(-1).mean()
    precision = TP.sum() / (TP | FP).sum()
    recall = TP.sum() / (TP | FN).sum()
    f1_score = 2 * precision * recall / (precision + recall)

    logger.info(f"TP={TP.sum()} TN={TN.sum()} FP={FP.sum()} FN={FN.sum()}")
    logger.info(
        f"P={precision:.2%} R={recall:.2%} F={f1_score:.2%} E={error_rate:.2%}"
    )
    for line in classification_report(y_true, y_pred).split("\n"):
        logger.info(line)

    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            [f"{name} {to_string(task)}\n" for name, task in decoded.items()])

    metrics, scores = score(decoded, references)
    for metric_name, metric in metrics.items():
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))

    write_scores(scores, expdir)

    if clean:
        logger.info(f"Remove {expdir}/model")
        os.remove(expdir / 'model')

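# Toy check of the micro-averaged metric arithmetic in evaluate(), with made-up
# labels (two utterances, three binary slots each). Each utterance contains one
# error, so precision = recall = f1 = 2/3 while the utterance-level error rate is 1.0:
def example_micro_metrics():
    import numpy as np
    y_true = np.array([[1, 0, 1], [0, 1, 0]])
    y_pred = np.array([[1, 0, 0], [0, 1, 1]])
    TP = (y_pred == 1) & (y_true == 1)      # 2 correct positives
    FP = (y_pred == 1) & (y_true == 0)      # 1 false alarm
    FN = (y_pred == 0) & (y_true == 1)      # 1 miss
    precision = TP.sum() / (TP | FP).sum()  # 2/3
    recall = TP.sum() / (TP | FN).sum()     # 2/3
    return precision, recall
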
def prepare_filesystem(expdir, speaker, coder, dataconf, expconf):
    speaker_dir = expdir / speaker
    os.makedirs(speaker_dir, exist_ok=True)

    # Load the feature scp and task files for this speaker
    feature_file = Path(dataconf.get(speaker, 'features'))
    with open(str(feature_file).replace(feature_file.suffix, ".scp")) as featfile:
        features = dict(map(parse_line, featfile.readlines()))

    with open(dataconf.get(speaker, "tasks")) as taskfile:
        task_strings = {
            f"{speaker}_{uttid}": task
            for uttid, task in map(parse_line, taskfile.readlines())
        }

    for uttid in list(task_strings):
        if uttid not in features:
            logger.warning(f"Missing utterance speaker {speaker}: {uttid}")
            del task_strings[uttid]

    tasks = [coder.encode(read_task(task)) for task in task_strings.values()]
    if not tasks:
        logger.error(f"Error with speaker {speaker}: no tasks")
        return []
    tasks = np.array(tasks)

    # Load cached blocks if they exist, otherwise create and cache them
    blocks_path = speaker_dir / "blocks.pkl"
    if blocks_path.exists():
        with open(blocks_path, "rb") as blockfile:
            blocks = pickle.load(blockfile)
    else:
        try:
            blocks = make_blocks(tasks, expconf, feature_file.parent)
        except Exception as err:
            logger.error(f"Error with speaker {speaker}: {err}")
            return []
        with open(blocks_path, "wb") as blockfile:
            pickle.dump(blocks, blockfile)

    # For every block count, sample `numexp` random train/test splits
    num_exp = int(expconf["numexp"])
    train_ids, test_ids = [], []
    for block_id in range(len(blocks) - 1):
        train_ids.append([])
        test_ids.append([])
        for exp_id in range(num_exp):
            train_ids[-1].append(
                list(itertools.chain.from_iterable(random.sample(blocks, block_id + 1))))
            test_ids[-1].append(
                [i for i in range(len(tasks)) if i not in train_ids[-1][-1]])

    if not (train_ids and test_ids):
        logger.error(f"Error with speaker {speaker}: no utterances")
        return []

    uttids = list(task_strings)
    block_id = int(expconf['startblocks']) - 1
    while True:
        dirname = f"{block_id + 1}blocks_exp"
        num_exp = int(expconf['numexp'])
        for exp_id in range(num_exp):
            subexpdir = expdir / speaker / (dirname + str(exp_id))
            logger.debug(f"Experiment {subexpdir.name}")

            if (subexpdir / "f1").exists():
                logger.info(f"Skipping {subexpdir}")
                continue

            os.makedirs(subexpdir, exist_ok=True)
            for filename in ("acquisition.cfg", "coder.cfg", "structure.xml"):
                symlink(f"../../{filename}", subexpdir / filename, relative=True)

            if not (subexpdir / "trainfeats").exists():
                for subset, ids in [("train", train_ids), ("test", test_ids)]:
                    utts = [
                        uttids[idx] for idx in ids[block_id][exp_id]
                        if idx < len(uttids)
                    ]
                    if len(utts) != len(ids[block_id][exp_id]):
                        num_lost = len(ids[block_id][exp_id]) - len(utts)
                        logger.warning(f"Lost {num_lost} {subset} utterances")
                    logger.debug(f"Number of {subset} examples: {len(utts):,}")
                    writefile(subexpdir / f"{subset}feats",
                              {utt: features[utt] for utt in utts})
                    writefile(subexpdir / f"{subset}tasks",
                              {utt: task_strings[utt] for utt in utts})

        # Grow the number of training blocks for the next round of experiments
        next_block_id = (block_id + 1) * int(expconf['scale']) + int(expconf['increment']) - 1
        next_block_id = min(next_block_id, len(blocks) - 2)
        if block_id == next_block_id:
            break
        else:
            block_id = next_block_id

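# Resulting layout per speaker (illustrative; directory and file names follow the
# code above, starting at `startblocks` blocks and growing with `scale`/`increment`):
#
#   <expdir>/<speaker>/blocks.pkl
#   <expdir>/<speaker>/1blocks_exp0/
#       acquisition.cfg  coder.cfg  structure.xml   (symlinked from the top-level expdir)
#       trainfeats  traintasks  testfeats  testtasks
#   <expdir>/<speaker>/1blocks_exp1/ ...
#   <expdir>/<speaker>/2blocks_exp0/ ...
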
def main(expdir, cuda):
    expdir = Path(expdir)

    if (expdir / "f1").exists():
        logger.info(f"Results found at {expdir}")
        return

    logger.info(f"Evaluate {expdir}")

    acquisitionconf = tools.read_config(expdir / "acquisition.cfg")
    acquisitionconf.set("acquisition", "device", "cuda" if cuda else "cpu")
    coderconf = tools.read_config(expdir / "coder.cfg")
    structure = Structure(expdir / "structure.xml")
    Coder = coder_factory(coderconf.get('coder', 'name'))
    coder = Coder(structure, coderconf)
    Model = model_factory(acquisitionconf.get('acquisition', 'name'))
    model = Model(acquisitionconf, coder, expdir)
    logger.debug(f"Loading model at {expdir}/model")
    model.load(expdir / 'model')

    with open(expdir / "testfeats") as testfeats:
        features = {
            line[0]: np.load(line[1])
            for line in map(tools.parse_line, testfeats.readlines())
        }

    with open(expdir / "testtasks") as testtasks:
        references = {
            key: read_task(value)
            for key, value in map(tools.parse_line, testtasks.readlines())
            if key in features
        }

    assert len(features) == len(references)

    # decode the test utterances
    feats = deepcopy(features)
    errors, nans, too_small = 0, 0, 0
    for uttid, feat in feats.items():
        remove = False
        # if feat.shape[0] < 5:
        #     too_small += 1
        #     remove = True
        if not np.isfinite(feat).all():
            nans += 1
            remove = True
        if remove:
            logger.debug(f"Removing {uttid}")
            errors += 1
            del features[uttid]
            del references[uttid]

    if errors > 0:
        logger.warning(
            f"{errors}/{len(feats)} utts removed ({too_small} too small and {nans} contained NaN)"
        )

    decoded = model.decode(features)

    with open(expdir / "dectasks", "w") as dectasks_file:
        dectasks_file.writelines(
            [f"{name} {to_string(task)}\n" for name, task in decoded.items()])

    metric_names = [
        "precision", "recal", "f1", "macro precision", "macro recal", "macro f1"
    ]
    metrics, scores = score(decoded, references)
    for metric_name, metric in zip(metric_names, metrics):
        logger.info(f"{metric_name}: {metric:.4f}")
        with open(expdir / metric_name.replace(" ", ""), "w") as f:
            f.write(str(metric))

    write_scores(scores, expdir)