Code example #1
def _format_decoder_output(lconf, log):
    """
    Convert decoder output to Glozz (for visualisation really)
    and copy it to resultcorpus
    """
    makedirs(minicorpus_doc_path(lconf, result=True))
    # unannotated
    force_symlink(unannotated_dir_path(lconf),
                  unannotated_dir_path(lconf, result=True))

    # parsed, postagged
    for section in ["parsed", "pos-tagged"]:
        force_symlink(minicorpus_stage_path(lconf, section),
                      minicorpus_stage_path(lconf, section, result=True))

    for econf in lconf.evaluations:
        # units/foo
        src_units_dir = minicorpus_stage_path(lconf, "units")
        tgt_units_dir = minicorpus_stage_path(lconf, "units", result=True)
        makedirs(tgt_units_dir)
        force_symlink(fp.join(src_units_dir, 'simple-da'),
                      fp.join(tgt_units_dir, parsed_bname(lconf, econf)))

        # discourse
        lconf.pyt("parser/parse-to-glozz",
                  minicorpus_path(lconf),
                  attelo_result_path(lconf, econf),
                  minicorpus_path(lconf, result=True),
                  stderr=log)
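
This example, and most of the ones below, lean on two small filesystem helpers that are not shown on this page. A minimal sketch, assuming makedirs is an idempotent mkdir -p and force_symlink replaces any existing link instead of failing:

import errno
import os


def makedirs(path, **kwargs):
    """Like os.makedirs, but do nothing if the directory already exists."""
    try:
        os.makedirs(path, **kwargs)
    except OSError as oserr:
        if oserr.errno != errno.EEXIST:
            raise


def force_symlink(source, link_name):
    """Like os.symlink, but replace link_name if it already exists."""
    if os.path.islink(link_name) or os.path.exists(link_name):
        os.unlink(link_name)
    os.symlink(source, link_name)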
Code example #2
File: decode.py Project: padenis/irit-rst-dt
def delayed_decode(lconf, dconf, econf, fold):
    """
    Return possible futures for decoding groups within
    this model/decoder combo for the given fold
    """
    if _say_if_decoded(lconf, econf, fold, stage='decoding'):
        return []

    output_path = decode_output_path(lconf, econf, fold)
    makedirs(fp.dirname(output_path))

    subpack = dconf.pack.testing(dconf.folds, fold)
    doc_model_paths = attelo_doc_model_paths(lconf, econf.learner, fold)
    intra_flag = econf.settings.intra
    if intra_flag is not None:
        sent_model_paths =\
            attelo_sent_model_paths(lconf, econf.learner, fold)

        intra_model = Team('oracle', 'oracle')\
            if intra_flag.intra_oracle\
            else sent_model_paths.fmap(load_model)
        inter_model = Team('oracle', 'oracle')\
            if intra_flag.inter_oracle\
            else doc_model_paths.fmap(load_model)

        models = IntraInterPair(intra=intra_model,
                                inter=inter_model)
    else:
        models = doc_model_paths.fmap(load_model)

    return ath_decode.jobs(subpack, models,
                           econf.decoder.payload,
                           econf.settings.mode,
                           output_path)
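
The Team and IntraInterPair values used here are plain pair containers from attelo with an fmap that applies a function to both members. The field names in this sketch are assumptions inferred from the call sites above, not the library's definitive API:

from collections import namedtuple


class Team(namedtuple('Team', 'attach relate')):
    """Pair of attachment/labelling payloads (field names assumed)."""
    def fmap(self, fun):
        # apply fun to both members, keeping the pair structure
        return Team(attach=fun(self.attach), relate=fun(self.relate))


class IntraInterPair(namedtuple('IntraInterPair', 'intra inter')):
    """Pair of sentence-level (intra) and document-level (inter) payloads."""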
Code example #3
def _mk_hashfile(hconf, dconf, test_data):
    "Hash the features and models files for long term archiving"

    hash_me = list(hconf.mpack_paths(False))
    if hconf.test_evaluation is not None:
        hash_me.extend(hconf.mpack_paths(True))
    learners = frozenset(e.learner for e in hconf.evaluations)
    for rconf in learners:
        mpaths = hconf.model_paths(rconf, None)
        hash_me.extend(mpaths.values())
    if not test_data:
        for fold in sorted(frozenset(dconf.folds.values())):
            for rconf in learners:
                mpaths = hconf.model_paths(rconf, fold)
                hash_me.extend(mpaths.values())
    provenance_dir = fp.join(hconf.report_dir_path(test_data, None),
                             'provenance')
    makedirs(provenance_dir)
    with open(fp.join(provenance_dir, 'hashes.txt'), 'w') as stream:
        for path in hash_me:
            if not fp.exists(path):
                continue
            fold_basename = fp.basename(fp.dirname(path))
            if fold_basename.startswith('fold-'):
                nice_path = fp.join(fold_basename, fp.basename(path))
            else:
                nice_path = fp.basename(path)
            print('\t'.join([nice_path, md5sum_file(path)]),
                  file=stream)
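
md5sum_file is not defined on this page either; a straightforward stand-in streams the file through hashlib in fixed-size chunks so large model files are never read into memory at once:

import hashlib


def md5sum_file(path, blocksize=65536):
    """Return the hex MD5 digest of path, read in blocksize chunks."""
    hasher = hashlib.md5()
    with open(path, 'rb') as stream:
        for block in iter(lambda: stream.read(blocksize), b''):
            hasher.update(block)
    return hasher.hexdigest()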
Code example #4
File: parse.py Project: eipiplusun/irit-stac
def _format_decoder_output(lconf, log):
    """
    Convert decoder output to Glozz (for visualisation really)
    and copy it to resultcorpus
    """
    makedirs(minicorpus_doc_path(lconf, result=True))
    # unannotated
    force_symlink(unannotated_dir_path(lconf),
                  unannotated_dir_path(lconf, result=True))

    # parsed, postagged
    for section in ["parsed", "pos-tagged"]:
        force_symlink(minicorpus_stage_path(lconf, section),
                      minicorpus_stage_path(lconf, section,
                                            result=True))

    for econf in lconf.evaluations:
        # units/foo
        src_units_dir = minicorpus_stage_path(lconf, "units")
        tgt_units_dir = minicorpus_stage_path(lconf, "units",
                                              result=True)
        makedirs(tgt_units_dir)
        force_symlink(fp.join(src_units_dir, 'simple-da'),
                      fp.join(tgt_units_dir, parsed_bname(lconf, econf)))

        # discourse
        lconf.pyt("parser/parse-to-glozz",
                  minicorpus_path(lconf),
                  attelo_result_path(lconf, econf),
                  minicorpus_path(lconf, result=True),
                  stderr=log)
Code example #5
File: report.py Project: padenis/irit-rst-dt
def _copy_version_files(lconf):
    "Hash the features and models files for long term archiving"
    provenance_dir = fp.join(report_dir_path(lconf, None),
                             'provenance')
    makedirs(provenance_dir)
    for vpath in glob.glob(fp.join(lconf.eval_dir,
                                   'versions-*.txt')):
        shutil.copy(vpath, provenance_dir)
Code example #6
File: report.py Project: moreymat/attelo
def _copy_version_files(hconf, test_data):
    "Hash the features and models files for long term archiving"
    provenance_dir = fp.join(hconf.report_dir_path(test_data, None),
                             'provenance')
    makedirs(provenance_dir)
    for vpath in glob.glob(fp.join(hconf.eval_dir, 'versions-*.txt')):
        shutil.copy(vpath, provenance_dir)
    for cpath in hconf.config_files:
        shutil.copy(cpath, provenance_dir)
Code example #7
File: pipeline.py Project: popescuv/irit-stac
def _get_decoding_jobs(mpack, lconf, econf):
    """
    Run the decoder on a single config and convert the output
    """
    makedirs(lconf.tmp("parsed"))
    output_path = attelo_result_path(lconf, econf)
    cache = lconf.model_paths(econf.learner, None, econf.parser)
    parser = econf.parser.payload
    parser.fit([], [], cache=cache)  # we assume everything is cached
    return ath_parse.jobs(mpack, parser, output_path)
Code example #8
File: report.py Project: eipiplusun/attelo
    def dump(self, output_dir, header=None):
        """
        Save reports to an output directory
        """
        makedirs(output_dir)
        fnames = self._filenames(output_dir).values()
        # touch every file
        for fname in fnames:
            with open(fname, 'w'):
                pass
        self.append(output_dir, header=header)
Code example #9
File: pipeline.py Project: eipiplusun/irit-stac
def _get_decoding_jobs(mpack, lconf, econf):
    """
    Run the decoder on a single config and convert the output
    """
    makedirs(lconf.tmp("parsed"))
    output_path = attelo_result_path(lconf, econf)
    cache = lconf.model_paths(econf.learner,
                              None)
    parser = econf.parser.payload
    parser.fit([], [], cache=cache)  # we assume everything is cached
    return ath_parse.jobs(mpack, parser, output_path)
Code example #10
def _mk_server_temp(args):
    """
    Create a temporary directory to save intermediary parser files
    in (may be specified from args but defaults to some mktemp recipe)
    """
    if args.tmpdir is None:
        tmp_dir = tempfile.mkdtemp(prefix="stac")
    else:
        tmp_dir = args.tmpdir
    makedirs(tmp_dir)
    return tmp_dir
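
Note that tempfile.mkdtemp already creates the directory, so the trailing makedirs only does real work on the user-supplied branch. A hypothetical caller (the --tmpdir flag name is an assumption for illustration):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--tmpdir', default=None,
                    help='reuse this directory for intermediary parser files')
args = parser.parse_args()
tmp_dir = _mk_server_temp(args)  # fresh mkdtemp dir unless --tmpdir is given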
Code example #11
File: parse.py Project: eipiplusun/irit-stac
def _copy_results(lconf, output_dir):
    "copy interesting files from tmp dir to output dir"

    # copy the csv parses
    parsed_results = fp.join(output_dir, "parsed")
    if fp.exists(parsed_results):
        shutil.rmtree(parsed_results)
    shutil.copytree(lconf.tmp("parsed"), parsed_results)

    # copy the svg graphs into single flat dir
    graphs_dir = fp.join(output_dir, "graphs")
    makedirs(graphs_dir)
    svg_files = sh.find(minicorpus_path(lconf, result=True),
                        "-name", "*.svg", _iter=True)
    for svg in (f.strip() for f in svg_files):
        svg2 = fp.join(graphs_dir,
                       fp.basename(fp.dirname(svg)) + ".svg")
        shutil.copyfile(svg, svg2)
Code example #12
File: pipeline.py Project: popescuv/irit-stac
def run_pipeline(lconf, stages):
    """
    Run each of the stages of the pipeline in succession. ::

        (LoopConfig, [Stage]) -> IO ()

    They don't feed into each other (yet); communication between stages is
    based on assumed side effects (i.e. writing into files at conventional
    locations).
    """
    logdir = lconf.tmp("logs")
    makedirs(logdir)
    for stage in stages:
        msg = stage.description
        logpath = fp.join(logdir, stage.logname + ".txt")
        with stac_msg(msg or "", quiet=msg is None):
            with open(logpath, 'w') as log:
                stage.function(lconf, log)
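
The Stage objects consumed by this loop only need three attributes; a plausible container, with the field meanings inferred from the loop body, would be:

from collections import namedtuple

# logname: stem of the per-stage log file
# function: callable taking (lconf, log)
# description: progress message, or None for quiet stages
Stage = namedtuple('Stage', ['logname', 'function', 'description'])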
Code example #13
File: pipeline.py Project: eipiplusun/irit-stac
def run_pipeline(lconf, stages):
    """
    Run each of the stages of the pipeline in succession. ::

        (LoopConfig, [Stage]) -> IO ()

    They don't feed into each other (yet); communication between stages is
    based on assumed side effects (i.e. writing into files at conventional
    locations).
    """
    logdir = lconf.tmp("logs")
    makedirs(logdir)
    for stage in stages:
        msg = stage.description
        logpath = fp.join(logdir, stage.logname + ".txt")
        with stac_msg(msg or "", quiet=msg is None):
            with open(logpath, 'w') as log:
                stage.function(lconf, log)
Code example #14
def _copy_results(lconf, output_dir):
    "copy interesting files from tmp dir to output dir"

    # copy the csv parses
    parsed_results = fp.join(output_dir, "parsed")
    if fp.exists(parsed_results):
        shutil.rmtree(parsed_results)
    shutil.copytree(lconf.tmp("parsed"), parsed_results)

    # copy the svg graphs into single flat dir
    graphs_dir = fp.join(output_dir, "graphs")
    makedirs(graphs_dir)
    svg_files = sh.find(minicorpus_path(lconf, result=True),
                        "-name",
                        "*.svg",
                        _iter=True)
    for svg in (f.strip() for f in svg_files):
        svg2 = fp.join(graphs_dir, fp.basename(fp.dirname(svg)) + ".svg")
        shutil.copyfile(svg, svg2)
Code example #15
File: parse.py Project: eipiplusun/irit-stac
def _segmented_to_glozz(lconf, log):
    """
    Convert the segmented CSV into Glozz format
    (because that's what some of our other tools
    expect)

    Return a corpus directory
    """
    lconf.pyt("intake/csvtoglozz.py",
              "-f", seg_path(lconf),
              "--start", "1000",
              stderr=log,
              cwd=lconf.tmp_dir)

    their_stub = lconf.tmp('segmented')
    unanno_stub = unannotated_stub_path(lconf)
    makedirs(fp.dirname(unanno_stub))
    os.rename(their_stub + '.aa', unanno_stub + '.aa')
    os.rename(their_stub + '.ac', unanno_stub + '.ac')
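
lconf.pyt, used throughout the irit-stac examples, runs a helper script from the project tree, passing keyword arguments such as stderr and cwd through to the subprocess machinery. A hypothetical sketch (script_dir is an assumed attribute, not the project's actual name for it):

import subprocess
import sys
import os.path as fp


def pyt(lconf, script, *args, **kwargs):
    """Run a project script with the current Python interpreter."""
    cmd = [sys.executable, fp.join(lconf.script_dir, script)]
    cmd.extend(str(arg) for arg in args)
    subprocess.check_call(cmd, **kwargs)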
Code example #16
File: parse.py Project: popescuv/irit-stac
def _segmented_to_glozz(lconf, log):
    """
    Convert the segmented CSV into Glozz format
    (because that's what some of our other tools
    expect)

    Return a corpus directory
    """
    lconf.pyt("intake/csvtoglozz.py",
              "-f", seg_path(lconf),
              "--start", "1000",
              stderr=log,
              cwd=lconf.tmp_dir)

    their_stub = lconf.tmp('segmented')
    unanno_stub = unannotated_stub_path(lconf)
    makedirs(fp.dirname(unanno_stub))
    os.rename(their_stub + '.aa', unanno_stub + '.aa')
    os.rename(their_stub + '.ac', unanno_stub + '.ac')
Code example #17
File: parse.py Project: kowey/attelo
def delayed_decode(hconf, dconf, econf, fold):
    """
    Return possible futures for decoding groups within
    this model/decoder combo for the given fold
    """
    if fold is None and hconf.test_evaluation is None:
        return []
    if _say_if_decoded(hconf, econf, fold, stage='decoding'):
        return []

    output_path = hconf.decode_output_path(econf, fold)
    makedirs(fp.dirname(output_path))

    if fold is None:
        subpack = dconf.pack
    else:
        subpack = select_testing(dconf.pack, dconf.folds, fold)

    parser = econf.parser.payload
    return jobs(subpack, parser, output_path)
Code example #18
File: report.py Project: padenis/irit-rst-dt
def _mk_hashfile(lconf, dconf):
    "Hash the features and models files for long term archiving"

    hash_me = [features_path(lconf)]
    for fold in sorted(frozenset(dconf.folds.values())):
        for rconf in LEARNERS:
            models_path = eval_model_path(lconf, rconf, fold, '*')
            hash_me.extend(sorted(glob.glob(models_path + '*')))
    provenance_dir = fp.join(report_dir_path(lconf, None),
                             'provenance')
    makedirs(provenance_dir)
    with open(fp.join(provenance_dir, 'hashes.txt'), 'w') as stream:
        for path in hash_me:
            fold_basename = fp.basename(fp.dirname(path))
            if fold_basename.startswith('fold-'):
                nice_path = fp.join(fold_basename, fp.basename(path))
            else:
                nice_path = fp.basename(path)
            print('\t'.join([nice_path, md5sum_file(path)]),
                  file=stream)
Code example #19
File: report.py Project: eipiplusun/attelo
def _mk_hashfile(hconf, dconf, test_data):
    "Hash the features and models files for long term archiving"
    # data files
    hash_me = list(hconf.mpack_paths(False))
    if hconf.test_evaluation is not None:
        hash_me.extend(hconf.mpack_paths(True))

    # model files
    model_files = []
    for econf in hconf.evaluations:
        mpaths = hconf.model_paths(econf.learner, None, econf.parser)
        model_files.extend(mpaths.values())
    if not test_data:
        for fold in sorted(frozenset(dconf.folds.values())):
            for econf in hconf.evaluations:
                mpaths = hconf.model_paths(econf.learner, fold, econf.parser)
                model_files.extend(mpaths.values())
    hash_me.extend(set(model_files))

    # then hash the collected files and dump the result to a file
    provenance_dir = fp.join(hconf.report_dir_path(test_data, None),
                             'provenance')
    makedirs(provenance_dir)
    with open(fp.join(provenance_dir, 'hashes.txt'), 'w') as stream:
        for path in hash_me:
            if not fp.exists(path):
                continue
            fold_basename = fp.basename(fp.dirname(path))
            if fold_basename.startswith('fold-'):
                nice_path = fp.join(fold_basename, fp.basename(path))
            else:
                nice_path = fp.basename(path)
            # get md5 sum of file or (NEW) dir
            if fp.isfile(path):
                path_hash = md5sum_file(path)
            elif fp.isdir(path):
                path_hash = md5sum_dir(path)
            else:
                raise ValueError("Unhashable stuff: {}".format(path))
            print('\t'.join([nice_path, path_hash]),
                  file=stream)
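
Unlike the earlier hashing examples, this variant may also be handed directories, hence md5sum_dir. One way to sketch it, assuming the goal is a deterministic digest over a directory's contents, is to walk the tree in sorted order and feed every file through a single hasher:

import hashlib
import os
import os.path as fp


def md5sum_dir(path, blocksize=65536):
    """Return one hex MD5 digest covering every file under path."""
    hasher = hashlib.md5()
    for dirpath, dirnames, filenames in os.walk(path):
        dirnames.sort()  # fix the traversal order so the digest is stable
        for fname in sorted(filenames):
            with open(fp.join(dirpath, fname), 'rb') as stream:
                for block in iter(lambda: stream.read(blocksize), b''):
                    hasher.update(block)
    return hasher.hexdigest()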
Code example #20
File: report.py Project: moreymat/attelo
    def dump(self, output_dir, header=None, digits=3):
        """Save reports to an output directory.

        Parameters
        ----------
        output_dir : str
            Output folder.

        header : str, optional
            Additional header text to display.

        digits : int, defaults to 3
            Number of digits for formatting output floating point values.
        """
        makedirs(output_dir)
        fnames = self._filenames(output_dir).values()
        # touch every file
        for fname in fnames:
            with open(fname, 'w'):
                pass
        self.append(output_dir, header, digits=digits)
Code example #21
File: report.py Project: moreymat/attelo
    def _filenames(self, output_dir):
        """
        Return a dictionary of filenames; keys are internal
        to this class
        """
        makedirs(output_dir)
        filenames = {}

        if self.edge is not None:
            # edgewise scores
            filenames['edge'] = fp.join(output_dir, 'scores.txt')

        if self.cspan is not None:
            filenames['cspan'] = fp.join(output_dir, 'cscores.txt')

        if self.edu is not None:
            # edu scores
            filenames['edu'] = fp.join(output_dir, 'edu-scores.txt')

        if self.edge_by_label is not None:
            label_dir = fp.join(output_dir, 'label-scores')
            makedirs(label_dir)
            for key in self.edge_by_label:
                fkey = ('label', key)
                filenames[fkey] = fp.join(label_dir, '-'.join(key))

        if self.confusion is not None:
            confusion_dir = fp.join(output_dir, 'confusion')
            makedirs(confusion_dir)
            for key in self.confusion:
                fkey = ('confusion', key)
                filenames[fkey] = fp.join(confusion_dir, '-'.join(key))
        return filenames
Code example #22
File: report.py Project: moreymat/attelo
def _mk_hashfile(hconf, dconf, test_data):
    "Hash the features and models files for long term archiving"
    # data files
    hash_me = [v for k, v in sorted(hconf.mpack_paths(False).items())]
    if hconf.test_evaluation is not None:
        hash_me.extend([v for k, v in sorted(hconf.mpack_paths(True).items())])

    # model files
    model_files = []
    for econf in hconf.evaluations:
        mpaths = hconf.model_paths(econf.learner, None, econf.parser)
        model_files.extend(mpaths.values())
    if not test_data:
        for fold in sorted(frozenset(dconf.folds.values())):
            for econf in hconf.evaluations:
                mpaths = hconf.model_paths(econf.learner, fold, econf.parser)
                model_files.extend(mpaths.values())
    hash_me.extend(set(model_files))

    # then hash the collected files and dump the result to a file
    provenance_dir = fp.join(hconf.report_dir_path(test_data, None),
                             'provenance')
    makedirs(provenance_dir)
    with open(fp.join(provenance_dir, 'hashes.txt'), 'w') as stream:
        for path in hash_me:
            if not fp.exists(path):
                continue
            fold_basename = fp.basename(fp.dirname(path))
            if fold_basename.startswith('fold-'):
                nice_path = fp.join(fold_basename, fp.basename(path))
            else:
                nice_path = fp.basename(path)
            # get md5 sum of file or (NEW) dir
            if fp.isfile(path):
                path_hash = md5sum_file(path)
            elif fp.isdir(path):
                path_hash = md5sum_dir(path)
            else:
                raise ValueError("Unhashable stuff: {}".format(path))
            print('\t'.join([nice_path, path_hash]), file=stream)
Code example #23
File: report.py Project: eipiplusun/attelo
    def _filenames(self, output_dir):
        """
        Return a dictionary of filenames; keys are internal
        to this class
        """
        makedirs(output_dir)
        filenames = {}

        if self.edge is not None:
            # edgewise scores
            filenames['edge'] = fp.join(output_dir, 'scores.txt')

        if self.cspan is not None:
            filenames['cspan'] = fp.join(output_dir, 'cscores.txt')

        if self.edu is not None:
            # edu scores
            filenames['edu'] = fp.join(output_dir, 'edu-scores.txt')

        if self.edge_by_label is not None:
            label_dir = fp.join(output_dir, 'label-scores')
            makedirs(label_dir)
            for key in self.edge_by_label:
                fkey = ('label', key)
                filenames[fkey] = fp.join(label_dir, '-'.join(key))

        if self.confusion is not None:
            confusion_dir = fp.join(output_dir, 'confusion')
            makedirs(confusion_dir)
            for key in self.confusion:
                fkey = ('confusion', key)
                filenames[fkey] = fp.join(confusion_dir, '-'.join(key))
        return filenames
Code example #24
File: parse.py Project: popescuv/irit-stac
def _copy_results(lconf, output_dir):
    "copy interesting files from tmp dir to output dir"

    # copy the csv parses
    parsed_results = fp.join(output_dir, "parsed")
    if fp.exists(parsed_results):
        shutil.rmtree(parsed_results)
    shutil.copytree(lconf.tmp("parsed"), parsed_results)

    # copy the svg graphs into single flat dir
    graphs_dir = fp.join(output_dir, "graphs")
    makedirs(graphs_dir)
    # walk the result corpus for svg files (a pure-Python equivalent
    # of the sh.find call used in the other variants of this function)
    base_svg_dir = minicorpus_path(lconf, result=True)
    svg_files = [os.path.join(dirpath, fname)
                 for dirpath, _dirs, files in os.walk(base_svg_dir)
                 for fname in files
                 if fname.endswith('.svg')]
    for svg in svg_files:
        svg2 = fp.join(graphs_dir,
                       fp.basename(fp.dirname(svg)) + ".svg")
        shutil.copyfile(svg, svg2)