def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    # reuse the latest feature dir when training extraction is skipped,
    # otherwise start a fresh one and extract training features into it
    if args.skip_training:
        tdir = latest_tmp()
    else:
        tdir = current_tmp()
        extract_features(TRAINING_CORPUS, tdir, strip_mode=args.strip_mode)
    if TEST_CORPUS is not None:
        # test extraction must reuse the vocabulary computed on the
        # training corpus so both live in the same feature space
        vocab_path = fp.join(
            tdir,
            fp.basename(TRAINING_CORPUS) + '.relations.sparse.vocab')
        extract_features(TEST_CORPUS, tdir,
                         vocab_path=vocab_path,
                         strip_mode=args.strip_mode)
    # record the exact package versions used for this gather
    with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)
    if not args.skip_training:
        # point the 'latest' symlink at the freshly created dir
        force_symlink(fp.basename(tdir), latest_tmp())
def extract_features(corpus, output_dir, vocab_path=None, strip_mode=None):
    """Extract features for a corpus, dump the instances.

    Run feature extraction on the given corpus and store the results
    in the output directory; the output file name is derived from the
    corpus file name.  Two extraction passes are launched: one for
    pairs of EDUs, then one for single EDUs.

    Parameters
    ----------
    corpus: filepath
        Selected corpus
    output_dir: filepath
        Folder where instances will be dumped
    vocab_path: filepath
        Vocabulary to load for feature extraction (needed if extracting
        test data; must ensure we have the same vocab in test as we'd
        have in training)
    strip_mode: one of {'head', 'broadcast', 'custom'}
        Method to strip CDUs
    """
    # TODO: perhaps we could just directly invoke the appropriate
    # educe module here instead of going through the command line?
    argv = ["stac-learning", "extract",
            corpus, LEX_DIR, output_dir,
            "--anno", ANNOTATORS]
    if vocab_path is not None:
        argv += ['--vocabulary', vocab_path]
    if strip_mode is not None:
        argv += ['--strip-mode', strip_mode]
    call(argv)                   # pairs of EDUs
    call(argv + ["--single"])    # single EDUs
def _graph(lconf, log):
    """Visualise the parses."""
    # graphs are written next to the parse results themselves
    corpus_dir = minicorpus_path(lconf, result=True)
    call(["stac-util", "graph", corpus_dir,
          "--output", corpus_dir],
         stderr=log)
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    if args.skip_training:
        tdir = latest_tmp()
    else:
        tdir = current_tmp()
        extract_features(TRAINING_CORPUS, tdir,
                         args.coarse, args.fix_pseudo_rels)
    if TEST_CORPUS is not None:
        # reuse the vocabulary and label set computed on the training
        # corpus so train and test share the same feature space
        train_prefix = fp.join(tdir, fp.basename(TRAINING_CORPUS))
        label_path = train_prefix + '.relations.sparse'
        extract_features(TEST_CORPUS, tdir,
                         args.coarse, args.fix_pseudo_rels,
                         vocab_path=label_path + '.vocab',
                         label_path=label_path)
    # record the exact package versions used for this gather
    with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)
    if not args.skip_training:
        # point the 'latest' symlink at the freshly created dir
        force_symlink(fp.basename(tdir), latest_tmp())
def extract_features(corpus, output_dir, vocab_path=None, label_path=None):
    """Extract features for a corpus, dump the instances.

    Run feature extraction for a particular corpus and store the
    results in the output directory.  Output file name will be
    computed from the corpus file name.

    Parameters
    ----------
    corpus: filepath
        Selected corpus.
    output_dir: filepath
        Folder where instances will be dumped.
    vocab_path: filepath, optional
        Vocabulary to load for feature extraction (needed if extracting
        test data; must ensure we have the same vocab in test as we'd
        have in training).
    label_path: filepath, optional
        List of labels to load for feature extraction.
    """
    # TODO: perhaps we could just directly invoke the appropriate
    # educe module here instead of going through the command line?
    cmd = ["rst-dt-learning", "extract",
           corpus,
           PTB_DIR,
           output_dir,
           '--feature_set', FEATURE_SET]
    if vocab_path is not None:
        cmd.extend(['--vocabulary', vocab_path])
    if label_path is not None:
        cmd.extend(['--labels', label_path])
    call(cmd)
def extract_features(corpus, output_dir, coarse, fix_pseudo_rels,
                     vocab_path=None, label_path=None):
    """Extract instances from a corpus, store them in files.

    Run feature extraction for a particular corpus and store the
    results in the output directory; the output file name is derived
    from the corpus file name.

    Parameters
    ----------
    corpus: filepath
        Path to the corpus.
    output_dir: filepath
        Path to the output folder.
    coarse: boolean, False by default
        Use coarse-grained relation labels.
    fix_pseudo_rels: boolean, False by default
        Rewrite pseudo-relations to improve consistency (WIP).
    vocab_path: filepath
        Path to a fixed vocabulary mapping, for feature extraction
        (needed if extracting test data: the same vocabulary should be
        used in train and test).
    label_path: filepath
        Path to a list of labels.
    """
    # TODO: perhaps we could just directly invoke the appropriate
    # educe module here instead of going through the command line?
    cmd = [
        "rst-dt-learning", "extract",
        corpus,
        PTB_DIR,  # TODO make this optional and exclusive from CoreNLP
        output_dir,
        '--feature_set', FEATURE_SET,
    ]
    # NEW 2016-05-19 rewrite pseudo-relations
    if fix_pseudo_rels:
        cmd.append('--fix_pseudo_rels')
    # NEW 2016-05-03 use coarse- or fine-grained relation labels
    # NB "coarse" was the previous default
    if coarse:
        cmd.append('--coarse')
    if CORENLP_OUT_DIR is not None:
        cmd += ['--corenlp_out_dir', CORENLP_OUT_DIR]
    if LECSIE_DATA_DIR is not None:
        cmd += ['--lecsie_data_dir', LECSIE_DATA_DIR]
    if vocab_path is not None:
        cmd += ['--vocabulary', vocab_path]
    if label_path is not None:
        cmd += ['--labels', label_path]
    call(cmd)
def main(_):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    # delegate entirely to the external tool
    call(["rst-dt-learning", "features"])
def _feature_extraction(lconf, log):
    """Extract features from our input glozz file."""
    # the vocabulary fixed at training time; parsing-time extraction
    # must emit vectors in the same feature space
    vocab_path = lconf.mpack_paths(test_data=False)[3]
    call(["stac-learning", "extract", "--parsing",
          "--vocab", vocab_path,
          minicorpus_path(lconf),
          lconf.abspath(LEX_DIR),
          lconf.tmp_dir],
         stderr=log)
def _resource_extraction(lconf, log):
    """
    Using a previously predicted dialogue act model, guess
    dialogue acts for all the EDUs
    """
    argv = ["stac-learning", "resource-nps",
            minicorpus_path(lconf),
            lconf.abspath(LEX_DIR),
            "--output", resource_np_path(lconf)]
    call(argv, stderr=log)
def _resource_extraction(lconf, log):
    """
    Using a previously predicted dialogue act model, guess
    dialogue acts for all the EDUs
    """
    corpus_dir = minicorpus_path(lconf)
    out_path = resource_np_path(lconf)
    call(["stac-learning", "resource-nps",
          corpus_dir,
          lconf.abspath(LEX_DIR),
          "--output", out_path],
         stderr=log)
def _feature_extraction(lconf, log):
    """Extract features from our input glozz file."""
    corpus_dir = minicorpus_path(lconf)
    # index 3 of mpack_paths is the training vocabulary file;
    # parsing-time vectors must be built against it
    vocab_path = lconf.mpack_paths(test_data=False)[3]
    cmd = ["stac-learning", "extract", "--parsing",
           "--vocab", vocab_path,
           corpus_dir,
           lconf.abspath(LEX_DIR),
           lconf.tmp_dir]
    call(cmd, stderr=log)
def main(_):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    tdir = current_tmp()
    call(["rst-dt-learning", "extract",
          TRAINING_CORPUS, PTB_DIR, tdir,
          '--feature_set', FEATURE_SET])
    # record the exact package versions used for this gather
    with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)
    # point the 'latest' symlink at the freshly created dir
    force_symlink(os.path.basename(tdir), latest_tmp())
def _create_snapshot_dir(data_dir):
    """Instantiate a snapshot dir and return its path."""
    # the snapshot is named after whatever the data dir symlink
    # currently points to
    basename = fp.basename(os.readlink(data_dir))
    snap_dir = fp.join(SNAPSHOTS, basename)
    if not fp.exists(snap_dir):
        os.makedirs(snap_dir)
        link_files(data_dir, snap_dir)
        force_symlink(basename, latest_snap())
    # record the exact package versions behind this snapshot
    with open(fp.join(snap_dir, "versions-model.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)
    return snap_dir
def _mk_report(parent_dir, lconf, idx_file):
    """Generate reports for scores."""
    prefix = _score_file_path_prefix(parent_dir, lconf)
    json_file = prefix + ".json"
    pretty_file = prefix + ".txt"
    # the human-readable summary goes to the .txt file, the
    # machine-readable one to the .json file
    with open(pretty_file, "w") as pretty_stream:
        call(["attelo", "report", idx_file, "--json", json_file],
             stdout=pretty_stream)
    print("Scores summarised in %s" % pretty_file, file=sys.stderr)
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    odir = get_output_dir(args)
    # one count report per corpus (currently just the training corpus)
    for corpus in [TRAINING_CORPUS]:
        ofilename = fp.join(odir, fp.basename(corpus) + ".txt")
        with open(ofilename, 'w') as ofile:
            call(["stac-util", "count", corpus,
                  "--annotator", ANNOTATORS],
                 stdout=ofile)
    announce_output_dir(odir)
def prepare_dirs(runcfg, data_dir):
    """
    Return eval and scratch directory paths

    Depending on the run configuration, either reuse the currently
    running evaluation/scratch directories (resume mode and later
    cluster stages) or create a fresh timestamped pair and link the
    gathered data files into it.
    """
    eval_prefix = fp.join(data_dir, "eval")
    scratch_prefix = fp.join(data_dir, "scratch")
    # the '-current' symlinks point at the evaluation in progress
    eval_current = eval_prefix + '-current'
    scratch_current = scratch_prefix + '-current'
    stage = runcfg.stage
    if (runcfg.mode == 'resume' or
            stage in [ClusterStage.main,
                      ClusterStage.combined_models,
                      ClusterStage.end]):
        # resuming: both dirs must already exist
        if not fp.exists(eval_current) or not fp.exists(scratch_current):
            sys.exit("No currently running evaluation to resume!")
        else:
            eval_dir = fp.realpath(eval_current)
            scratch_dir = fp.realpath(scratch_current)
            # in case there are any new data files to link
            _link_data_files(data_dir, eval_dir)
            return eval_dir, scratch_dir
    else:
        # remember where the previous evaluation lived so a jumpstart
        # can reuse its fold assignments and models
        eval_actual_old = fp.realpath(eval_current)
        scratch_actual_old = fp.realpath(scratch_current)
        tstamp = timestamp()
        # presumably False means a dir with this timestamp already
        # exists (a concurrent run) — TODO confirm
        if _create_tstamped_dir(eval_prefix, tstamp):
            eval_dir = eval_prefix + '-' + tstamp
            scratch_dir = scratch_prefix + '-' + tstamp
            _create_tstamped_dir(scratch_prefix, tstamp)
            _link_data_files(data_dir, eval_dir)
            # NOTE(review): string comparison here, while the branch
            # above compares against ClusterStage values — confirm that
            # 'jumpstart' really is a string-valued stage
            if runcfg.stage == 'jumpstart':
                _link_fold_files(eval_actual_old, eval_dir)
                _link_model_files(scratch_actual_old, scratch_dir)
        else:
            sys.exit("Try again in one minute")
        # record the exact package versions used for this evaluation
        with open(fp.join(eval_dir, "versions-evaluate.txt"), "w") as stream:
            call(["pip", "freeze"], stdout=stream)
        return eval_dir, scratch_dir
def prepare_dirs(runcfg, data_dir):
    """
    Return eval and scratch directory paths

    Reuses the currently running evaluation/scratch directories when
    resuming (or when in a later cluster stage); otherwise creates a
    fresh timestamped pair and links the gathered data files into it.
    """
    eval_prefix = fp.join(data_dir, "eval")
    scratch_prefix = fp.join(data_dir, "scratch")
    # '-current' symlinks track the evaluation in progress
    eval_current = eval_prefix + '-current'
    scratch_current = scratch_prefix + '-current'
    stage = runcfg.stage
    if (runcfg.mode == 'resume' or
            stage in [
                ClusterStage.main,
                ClusterStage.combined_models,
                ClusterStage.end
            ]):
        # resuming: both dirs must already exist
        if not fp.exists(eval_current) or not fp.exists(scratch_current):
            sys.exit("No currently running evaluation to resume!")
        else:
            eval_dir = fp.realpath(eval_current)
            scratch_dir = fp.realpath(scratch_current)
            # in case there are any new data files to link
            _link_data_files(data_dir, eval_dir)
            return eval_dir, scratch_dir
    else:
        # keep track of the previous evaluation so a jumpstart can
        # reuse its fold assignments and models
        eval_actual_old = fp.realpath(eval_current)
        scratch_actual_old = fp.realpath(scratch_current)
        tstamp = timestamp()
        # presumably returns False when a dir with this timestamp
        # already exists (concurrent run) — TODO confirm
        if _create_tstamped_dir(eval_prefix, tstamp):
            eval_dir = eval_prefix + '-' + tstamp
            scratch_dir = scratch_prefix + '-' + tstamp
            _create_tstamped_dir(scratch_prefix, tstamp)
            _link_data_files(data_dir, eval_dir)
            # NOTE(review): string comparison, unlike the ClusterStage
            # comparison above — confirm 'jumpstart' is string-valued
            if runcfg.stage == 'jumpstart':
                _link_fold_files(eval_actual_old, eval_dir)
                _link_model_files(scratch_actual_old, scratch_dir)
        else:
            sys.exit("Try again in one minute")
        # record the exact package versions used for this evaluation
        with open(fp.join(eval_dir, "versions-evaluate.txt"), "w") as stream:
            call(["pip", "freeze"], stdout=stream)
        return eval_dir, scratch_dir
def _create_eval_dirs(args, data_dir, jumpstart):
    """
    Return eval and scratch directory paths

    Either reuses the currently running evaluation (resume mode and
    later cluster stages) or creates a fresh timestamped pair of
    directories and points the '-current' symlinks at them.

    :param jumpstart: if True, link model files over from the previous
        scratch dir into the new one
    """
    eval_current = fp.join(data_dir, "eval-current")
    scratch_current = fp.join(data_dir, "scratch-current")
    stage = args_to_stage(args)
    if args.resume or stage in [ClusterStage.main,
                                ClusterStage.combined_models,
                                ClusterStage.end]:
        # resuming: both current dirs must already exist
        if not fp.exists(eval_current) or not fp.exists(scratch_current):
            sys.exit("No currently running evaluation to resume!")
        else:
            return eval_current, scratch_current
    else:
        # fixed name in debug mode so repeated test runs share a dir
        tstamp = "TEST" if _DEBUG else timestamp()
        eval_dir = fp.join(data_dir, "eval-" + tstamp)
        if not fp.exists(eval_dir):
            os.makedirs(eval_dir)
            _link_data_files(data_dir, eval_dir)
            force_symlink(fp.basename(eval_dir), eval_current)
        elif not _DEBUG:
            # timestamped dir already there: presumably another run
            # started within the same minute — TODO confirm
            sys.exit("Try again in one minute")
        scratch_dir = fp.join(data_dir, "scratch-" + tstamp)
        if not fp.exists(scratch_dir):
            os.makedirs(scratch_dir)
            if jumpstart:
                _link_model_files(scratch_current, scratch_dir)
            force_symlink(fp.basename(scratch_dir), scratch_current)
        # record the exact package versions used for this evaluation
        with open(fp.join(eval_dir, "versions-evaluate.txt"), "w") as stream:
            call(["pip", "freeze"], stdout=stream)
        return eval_dir, scratch_dir
def extract_features(corpus, output_dir, vocab_path=None, strip_mode=None):
    """Extract features for a corpus, dump the instances.

    Run feature extraction for a particular corpus; and store the
    results in the output directory. Output file name will be computed
    from the corpus file name.

    This triggers two distinct processes, for pairs of EDUs then for
    single EDUs.

    Parameters
    ----------
    corpus: filepath
        Selected corpus
    output_dir: filepath
        Folder where instances will be dumped
    vocab_path: filepath
        Vocabulary to load for feature extraction (needed if extracting
        test data; must ensure we have the same vocab in test as we'd
        have in training)
    strip_mode: one of {'head', 'broadcast', 'custom'}
        Method to strip CDUs
    """
    # TODO: perhaps we could just directly invoke the appropriate
    # educe module here instead of going through the command line?
    cmd = ["stac-learning", "extract",
           corpus, LEX_DIR, output_dir,
           "--anno", ANNOTATORS]
    # append the optional flags that were actually supplied
    for flag, value in (('--vocabulary', vocab_path),
                        ('--strip-mode', strip_mode)):
        if value is not None:
            cmd.extend([flag, value])
    call(cmd)                  # pairs of EDUs
    call(cmd + ["--single"])   # single EDUs
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    data_dir = latest_tmp()
    # bail out if the gather step has not been run yet
    if not os.path.exists(data_dir):
        _exit_ungathered()
    eval_dir, scratch_dir = _create_eval_dirs(args, data_dir)
    # record the exact package versions used for this evaluation
    with open(os.path.join(eval_dir, "versions.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)
    for corpus in TRAINING_CORPORA:
        dataset = os.path.basename(corpus)
        lconf = LoopConfig(
            eval_dir=eval_dir,
            scratch_dir=scratch_dir,
            fold_file=os.path.join(eval_dir, "folds-%s.json" % dataset),
            dataset=dataset)
        _do_corpus(lconf)
def pyt(self, script, *args, **kwargs):
    "call python on one of our scripts"
    # extra positional args become script arguments; keyword args are
    # forwarded to call() (e.g. stdout/stderr redirection)
    cmd = ["python", self.abspath(script)] + list(args)
    call(cmd, **kwargs)