def _init_corpus(hconf):
    """Prepare the data needed for an evaluation run, creating folds on demand

    What happens depends on ``hconf.runcfg.stage``:

    * ``None`` (standalone): the multipack is always loaded; folds are
      read from ``hconf.fold_file`` if it exists, otherwise created
    * ``ClusterStage.start``: folds are created only if the fold file
      is missing; nothing is returned
    * anything else: both the multipack and the existing fold file
      are loaded

    :rtype: DataConfig or None
    """
    folds_on_disk = fp.exists(hconf.fold_file)
    skip_notice = ('Skipping generation of fold files '
                   '(must have been jumpstarted)')
    stage = hconf.runcfg.stage

    if stage is None:
        # standalone: the datapack has to be loaded no matter what,
        # since the later stages will need it
        multipack = _load_harness_multipack(hconf)
        if folds_on_disk:
            print(skip_notice, file=sys.stderr)
            folds = load_fold_dict(hconf.fold_file)
        else:
            folds = hconf.create_folds(multipack)
        return DataConfig(pack=multipack, folds=folds)

    if stage == ClusterStage.start:
        if not folds_on_disk:
            multipack = _load_harness_multipack(hconf)
            hconf.create_folds(multipack)
        else:
            # only running --start with a fold file already in place:
            # fold generation was the sole reason to read the
            # datapacks, so bail out before loading them
            print(skip_notice, file=sys.stderr)
        return None

    # every remaining stage: the fold file was written earlier,
    # so it only needs to be read back in
    return DataConfig(pack=_load_harness_multipack(hconf),
                      folds=load_fold_dict(hconf.fold_file))
def _do_corpus(lconf):
    """Run evaluation on a corpus

    Loads the data pack for the corpus, then runs each phase (fold
    generation, per-fold work, combined models, global report) for
    which `_is_standalone_or` returns true with the matching stage.
    """
    def wants(stage):
        "True if the phase tied to `stage` should run for this config"
        return _is_standalone_or(lconf, stage)

    print(_corpus_banner(lconf), file=sys.stderr)

    edu_path = edu_input_path(lconf)
    if not os.path.exists(edu_path):
        exit_ungathered()

    # the stripped features file is only considered for the start
    # and end stages, and only if it is actually present
    use_stripped = False
    if lconf.stage in (ClusterStage.end, ClusterStage.start):
        use_stripped = fp.exists(features_path(lconf, stripped=True))

    pairings_file = pairings_path(lconf)
    features_file = features_path(lconf, stripped=use_stripped)
    datapack = load_data_pack(edu_path,
                              pairings_file,
                              features_file,
                              verbose=True)

    if wants(ClusterStage.start):
        _generate_fold_file(lconf, datapack)

    data_cfg = DataConfig(pack=datapack,
                          folds=load_fold_dict(lconf.fold_file))

    if wants(ClusterStage.main):
        # explicit fold selection wins; otherwise do every fold
        # mentioned in the fold dictionary
        if lconf.folds is not None:
            selected = lconf.folds
        else:
            selected = frozenset(data_cfg.folds.values())
        for fold in selected:
            _do_fold(lconf, data_cfg, fold)

    if wants(ClusterStage.combined_models):
        mk_combined_models(lconf, data_cfg)

    if wants(ClusterStage.end):
        mk_global_report(lconf, data_cfg)
def create_folds(self, mpack):
    """
    Produce the fold dictionary, write it to `self.fold_file`,
    and hand it back to the caller

    If `FIXED_FOLD_FILE` is set, the folds are read from that file;
    otherwise a 10-fold split of `mpack` is made with a generator
    obtained from `mk_rng`.
    """
    if FIXED_FOLD_FILE is not None:
        folds = load_fold_dict(FIXED_FOLD_FILE)
    else:
        generator = mk_rng()
        folds = make_n_fold(mpack, 10, generator)
    save_fold_dict(folds, self.fold_file)
    return folds