def _format_decoder_output(lconf, log):
    """
    Convert decoder output to Glozz (for visualisation really)
    and copy it to the result corpus
    """
    makedirs(minicorpus_doc_path(lconf, result=True))

    # unannotated
    force_symlink(unannotated_dir_path(lconf),
                  unannotated_dir_path(lconf, result=True))
    # parsed, postagged
    for section in ["parsed", "pos-tagged"]:
        force_symlink(minicorpus_stage_path(lconf, section),
                      minicorpus_stage_path(lconf, section, result=True))

    for econf in lconf.evaluations:
        # units/foo
        src_units_dir = minicorpus_stage_path(lconf, "units")
        tgt_units_dir = minicorpus_stage_path(lconf, "units", result=True)
        makedirs(tgt_units_dir)
        force_symlink(fp.join(src_units_dir, 'simple-da'),
                      fp.join(tgt_units_dir, parsed_bname(lconf, econf)))
        # discourse
        lconf.pyt("parser/parse-to-glozz",
                  minicorpus_path(lconf),
                  attelo_result_path(lconf, econf),
                  minicorpus_path(lconf, result=True),
                  stderr=log)
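# force_symlink is one of the helpers this listing relies on but does not
# define. A minimal sketch of the behaviour the code above seems to assume
# (a symlink that overwrites any existing link at the destination); the
# actual helper may differ:
import os


def force_symlink(src, dst):
    "like os.symlink, but first remove dst if it is already a link"
    if os.path.islink(dst):
        os.remove(dst)
    os.symlink(src, dst)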
def delayed_decode(lconf, dconf, econf, fold):
    """
    Return possible futures for decoding groups within this model/decoder
    combo for the given fold
    """
    if _say_if_decoded(lconf, econf, fold, stage='decoding'):
        return []

    output_path = decode_output_path(lconf, econf, fold)
    makedirs(fp.dirname(output_path))

    subpack = dconf.pack.testing(dconf.folds, fold)
    doc_model_paths = attelo_doc_model_paths(lconf, econf.learner, fold)
    intra_flag = econf.settings.intra
    if intra_flag is not None:
        sent_model_paths = attelo_sent_model_paths(lconf, econf.learner,
                                                   fold)
        intra_model = (Team('oracle', 'oracle')
                       if intra_flag.intra_oracle
                       else sent_model_paths.fmap(load_model))
        inter_model = (Team('oracle', 'oracle')
                       if intra_flag.inter_oracle
                       else doc_model_paths.fmap(load_model))
        models = IntraInterPair(intra=intra_model,
                                inter=inter_model)
    else:
        models = doc_model_paths.fmap(load_model)

    return ath_decode.jobs(subpack, models,
                           econf.decoder.payload,
                           econf.settings.mode,
                           output_path)
def _mk_hashfile(hconf, dconf, test_data):
    "Hash the features and models files for long term archiving"
    hash_me = list(hconf.mpack_paths(False))
    if hconf.test_evaluation is not None:
        hash_me.extend(hconf.mpack_paths(True))
    learners = frozenset(e.learner for e in hconf.evaluations)
    for rconf in learners:
        mpaths = hconf.model_paths(rconf, None)
        hash_me.extend(mpaths.values())
    if not test_data:
        for fold in sorted(frozenset(dconf.folds.values())):
            for rconf in learners:
                mpaths = hconf.model_paths(rconf, fold)
                hash_me.extend(mpaths.values())

    provenance_dir = fp.join(hconf.report_dir_path(test_data, None),
                             'provenance')
    makedirs(provenance_dir)
    with open(fp.join(provenance_dir, 'hashes.txt'), 'w') as stream:
        for path in hash_me:
            if not fp.exists(path):
                continue
            fold_basename = fp.basename(fp.dirname(path))
            if fold_basename.startswith('fold-'):
                nice_path = fp.join(fold_basename, fp.basename(path))
            else:
                nice_path = fp.basename(path)
            print('\t'.join([nice_path, md5sum_file(path)]),
                  file=stream)
def _copy_version_files(lconf):
    "Copy version stamp files to the provenance dir for long term archiving"
    provenance_dir = fp.join(report_dir_path(lconf, None),
                             'provenance')
    makedirs(provenance_dir)
    for vpath in glob.glob(fp.join(lconf.eval_dir, 'versions-*.txt')):
        shutil.copy(vpath, provenance_dir)
def _copy_version_files(hconf, test_data):
    "Copy version and config files to the provenance dir for archiving"
    provenance_dir = fp.join(hconf.report_dir_path(test_data, None),
                             'provenance')
    makedirs(provenance_dir)
    for vpath in glob.glob(fp.join(hconf.eval_dir, 'versions-*.txt')):
        shutil.copy(vpath, provenance_dir)
    for cpath in hconf.config_files:
        shutil.copy(cpath, provenance_dir)
def _get_decoding_jobs(mpack, lconf, econf):
    """
    Run the decoder on a single config and convert the output
    """
    makedirs(lconf.tmp("parsed"))
    output_path = attelo_result_path(lconf, econf)
    cache = lconf.model_paths(econf.learner, None, econf.parser)
    parser = econf.parser.payload
    parser.fit([], [], cache=cache)  # we assume everything is cached
    return ath_parse.jobs(mpack, parser, output_path)
def dump(self, output_dir, header=None):
    """
    Save reports to an output directory
    """
    makedirs(output_dir)
    fnames = self._filenames(output_dir).values()
    # touch every file
    for fname in fnames:
        with open(fname, 'w'):
            pass
    self.append(output_dir, header=header)
def _get_decoding_jobs(mpack, lconf, econf):
    """
    Run the decoder on a single config and convert the output
    """
    makedirs(lconf.tmp("parsed"))
    output_path = attelo_result_path(lconf, econf)
    cache = lconf.model_paths(econf.learner, None)
    parser = econf.parser.payload
    parser.fit([], [], cache=cache)  # we assume everything is cached
    return ath_parse.jobs(mpack, parser, output_path)
def _mk_server_temp(args):
    """
    Create a temporary directory to save intermediary parser files
    in (may be specified from args but defaults to some mktemp recipe)
    """
    if args.tmpdir is None:
        tmp_dir = tempfile.mkdtemp(prefix="stac")
    else:
        tmp_dir = args.tmpdir
    makedirs(tmp_dir)
    return tmp_dir
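# _mk_server_temp calls makedirs on a directory that mkdtemp has already
# created, which suggests the makedirs used throughout this listing is a
# wrapper that tolerates existing directories. A plausible sketch (the
# real helper is not shown here):
import errno
import os


def makedirs(path):
    "os.makedirs, but a no-op if the directory already exists"
    try:
        os.makedirs(path)
    except OSError as oops:
        if oops.errno != errno.EEXIST:
            raise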
def _copy_results(lconf, output_dir):
    "copy interesting files from tmp dir to output dir"

    # copy the csv parses
    parsed_results = fp.join(output_dir, "parsed")
    if fp.exists(parsed_results):
        shutil.rmtree(parsed_results)
    shutil.copytree(lconf.tmp("parsed"), parsed_results)

    # copy the svg graphs into a single flat dir
    graphs_dir = fp.join(output_dir, "graphs")
    makedirs(graphs_dir)
    svg_files = sh.find(minicorpus_path(lconf, result=True),
                        "-name", "*.svg", _iter=True)
    for svg in (f.strip() for f in svg_files):
        svg2 = fp.join(graphs_dir,
                       fp.basename(fp.dirname(svg)) + ".svg")
        shutil.copyfile(svg, svg2)
def run_pipeline(lconf, stages):
    """
    Run each of the stages of the pipeline in succession. ::

        (LoopConfig, [Stage]) -> IO ()

    They don't feed into each other (yet); communication between stages
    is based on assumed side effects (i.e. writing into files at
    conventional locations).
    """
    logdir = lconf.tmp("logs")
    makedirs(logdir)
    for stage in stages:
        msg = stage.description
        logpath = fp.join(logdir, stage.logname + ".txt")
        with stac_msg(msg or "", quiet=msg is None):
            with open(logpath, 'w') as log:
                stage.function(lconf, log)
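# A sketch of how run_pipeline might be driven. The Stage record below is
# an assumption for illustration: the loop above only needs objects with
# 'logname', 'function' and 'description' attributes, so a namedtuple
# with those fields would do.
from collections import namedtuple

Stage = namedtuple('Stage', 'logname function description')


def _hello_stage(lconf, log):
    "trivial example stage: just writes a line to its log file"
    print("hello from the pipeline", file=log)

# run_pipeline(lconf, [Stage(logname='hello',
#                            function=_hello_stage,
#                            description='saying hello')])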
def _segmented_to_glozz(lconf, log):
    """
    Convert the segmented CSV into glozz format
    (because that's what some of our other tools expect)

    Return a corpus directory
    """
    lconf.pyt("intake/csvtoglozz.py",
              "-f", seg_path(lconf),
              "--start", "1000",
              stderr=log,
              cwd=lconf.tmp_dir)

    their_stub = lconf.tmp('segmented')
    unanno_stub = unannotated_stub_path(lconf)
    makedirs(fp.dirname(unanno_stub))
    os.rename(their_stub + '.aa', unanno_stub + '.aa')
    os.rename(their_stub + '.ac', unanno_stub + '.ac')
def delayed_decode(hconf, dconf, econf, fold):
    """
    Return possible futures for decoding groups within this model/decoder
    combo for the given fold
    """
    if fold is None and hconf.test_evaluation is None:
        return []
    if _say_if_decoded(hconf, econf, fold, stage='decoding'):
        return []

    output_path = hconf.decode_output_path(econf, fold)
    makedirs(fp.dirname(output_path))
    if fold is None:
        subpack = dconf.pack
    else:
        subpack = select_testing(dconf.pack, dconf.folds, fold)
    parser = econf.parser.payload
    return jobs(subpack, parser, output_path)
def _mk_hashfile(lconf, dconf):
    "Hash the features and models files for long term archiving"
    hash_me = [features_path(lconf)]
    for fold in sorted(frozenset(dconf.folds.values())):
        for rconf in LEARNERS:
            models_path = eval_model_path(lconf, rconf, fold, '*')
            hash_me.extend(sorted(glob.glob(models_path + '*')))

    provenance_dir = fp.join(report_dir_path(lconf, None),
                             'provenance')
    makedirs(provenance_dir)
    with open(fp.join(provenance_dir, 'hashes.txt'), 'w') as stream:
        for path in hash_me:
            fold_basename = fp.basename(fp.dirname(path))
            if fold_basename.startswith('fold-'):
                nice_path = fp.join(fold_basename, fp.basename(path))
            else:
                nice_path = fp.basename(path)
            print('\t'.join([nice_path, md5sum_file(path)]),
                  file=stream)
def _mk_hashfile(hconf, dconf, test_data):
    "Hash the features and models files for long term archiving"
    # data files
    hash_me = list(hconf.mpack_paths(False))
    if hconf.test_evaluation is not None:
        hash_me.extend(hconf.mpack_paths(True))
    # model files
    model_files = []
    for econf in hconf.evaluations:
        mpaths = hconf.model_paths(econf.learner, None, econf.parser)
        model_files.extend(mpaths.values())
    if not test_data:
        for fold in sorted(frozenset(dconf.folds.values())):
            for econf in hconf.evaluations:
                mpaths = hconf.model_paths(econf.learner, fold,
                                           econf.parser)
                model_files.extend(mpaths.values())
    hash_me.extend(set(model_files))

    # then hash the collected files and dump the result to a file
    provenance_dir = fp.join(hconf.report_dir_path(test_data, None),
                             'provenance')
    makedirs(provenance_dir)
    with open(fp.join(provenance_dir, 'hashes.txt'), 'w') as stream:
        for path in hash_me:
            if not fp.exists(path):
                continue
            fold_basename = fp.basename(fp.dirname(path))
            if fold_basename.startswith('fold-'):
                nice_path = fp.join(fold_basename, fp.basename(path))
            else:
                nice_path = fp.basename(path)
            # get md5 sum of file or (NEW) dir
            if fp.isfile(path):
                path_hash = md5sum_file(path)
            elif fp.isdir(path):
                path_hash = md5sum_dir(path)
            else:
                raise ValueError("Unhashable stuff: {}".format(path))
            print('\t'.join([nice_path, path_hash]),
                  file=stream)
def dump(self, output_dir, header=None, digits=3):
    """Save reports to an output directory.

    Parameters
    ----------
    output_dir : str
        Output folder.
    header : str, optional
        Additional header text to display.
    digits : int, defaults to 3
        Number of digits for formatting output floating point values.
    """
    makedirs(output_dir)
    fnames = self._filenames(output_dir).values()
    # touch every file
    for fname in fnames:
        with open(fname, 'w'):
            pass
    self.append(output_dir, header, digits=digits)
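# Hypothetical usage sketch (the class this dump method belongs to is not
# shown in this listing): given a report object of that class, writing all
# of its score tables under a fresh directory might look like
#
#     report.dump('reports/current', header='test run', digits=4)
#
# Note that dump truncates (touches) every known output file before
# delegating to append, so stale content from an earlier run cannot
# survive alongside the freshly appended scores.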
def _filenames(self, output_dir):
    """
    Return a dictionary of filenames; keys are internal to this class
    """
    makedirs(output_dir)
    filenames = {}
    if self.edge is not None:
        # edgewise scores
        filenames['edge'] = fp.join(output_dir, 'scores.txt')
    if self.cspan is not None:
        filenames['cspan'] = fp.join(output_dir, 'cscores.txt')
    if self.edu is not None:
        # edu scores
        filenames['edu'] = fp.join(output_dir, 'edu-scores.txt')
    if self.edge_by_label is not None:
        label_dir = fp.join(output_dir, 'label-scores')
        makedirs(label_dir)
        for key in self.edge_by_label:
            fkey = ('label', key)
            filenames[fkey] = fp.join(label_dir, '-'.join(key))
    if self.confusion is not None:
        confusion_dir = fp.join(output_dir, 'confusion')
        makedirs(confusion_dir)
        for key in self.confusion:
            fkey = ('confusion', key)
            filenames[fkey] = fp.join(confusion_dir, '-'.join(key))
    return filenames
def _mk_hashfile(hconf, dconf, test_data):
    "Hash the features and models files for long term archiving"
    # data files
    hash_me = [v for k, v in sorted(hconf.mpack_paths(False).items())]
    if hconf.test_evaluation is not None:
        hash_me.extend(
            [v for k, v in sorted(hconf.mpack_paths(True).items())])
    # model files
    model_files = []
    for econf in hconf.evaluations:
        mpaths = hconf.model_paths(econf.learner, None, econf.parser)
        model_files.extend(mpaths.values())
    if not test_data:
        for fold in sorted(frozenset(dconf.folds.values())):
            for econf in hconf.evaluations:
                mpaths = hconf.model_paths(econf.learner, fold,
                                           econf.parser)
                model_files.extend(mpaths.values())
    hash_me.extend(set(model_files))

    # then hash the collected files and dump the result to a file
    provenance_dir = fp.join(hconf.report_dir_path(test_data, None),
                             'provenance')
    makedirs(provenance_dir)
    with open(fp.join(provenance_dir, 'hashes.txt'), 'w') as stream:
        for path in hash_me:
            if not fp.exists(path):
                continue
            fold_basename = fp.basename(fp.dirname(path))
            if fold_basename.startswith('fold-'):
                nice_path = fp.join(fold_basename, fp.basename(path))
            else:
                nice_path = fp.basename(path)
            # get md5 sum of file or (NEW) dir
            if fp.isfile(path):
                path_hash = md5sum_file(path)
            elif fp.isdir(path):
                path_hash = md5sum_dir(path)
            else:
                raise ValueError("Unhashable stuff: {}".format(path))
            print('\t'.join([nice_path, path_hash]),
                  file=stream)
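# md5sum_file and md5sum_dir are used above but not defined in this
# listing. A minimal sketch of what they might look like, assuming
# md5sum_dir should produce a deterministic digest over a directory's
# contents (files visited in sorted order):
import hashlib
import os

_CHUNK_SIZE = 2 ** 20  # read files a megabyte at a time


def md5sum_file(path):
    "hex md5 digest of a single file, read in chunks"
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        for chunk in iter(lambda: stream.read(_CHUNK_SIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()


def md5sum_dir(path):
    "hex md5 digest over all files in a directory tree, in sorted order"
    digest = hashlib.md5()
    for dirpath, dirnames, filenames in os.walk(path):
        dirnames.sort()  # fix the traversal order
        for fname in sorted(filenames):
            with open(os.path.join(dirpath, fname), 'rb') as stream:
                for chunk in iter(lambda: stream.read(_CHUNK_SIZE), b''):
                    digest.update(chunk)
    return digest.hexdigest()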
def _copy_results(lconf, output_dir):
    "copy interesting files from tmp dir to output dir"

    # copy the csv parses
    parsed_results = fp.join(output_dir, "parsed")
    if fp.exists(parsed_results):
        shutil.rmtree(parsed_results)
    shutil.copytree(lconf.tmp("parsed"), parsed_results)

    # copy the svg graphs into a single flat dir
    graphs_dir = fp.join(output_dir, "graphs")
    makedirs(graphs_dir)
    # walk the tree ourselves rather than shelling out to find:
    # svg_files = sh.find(minicorpus_path(lconf, result=True),
    #                     "-name", "*.svg", _iter=True)
    base_svg_dir = minicorpus_path(lconf, result=True)
    svg_files = [os.path.join(dirpath, fname)
                 for dirpath, dirs, files in os.walk(base_svg_dir)
                 for fname in files
                 if fname.endswith('.svg')]
    for svg in (f.strip() for f in svg_files):
        svg2 = fp.join(graphs_dir,
                       fp.basename(fp.dirname(svg)) + ".svg")
        shutil.copyfile(svg, svg2)