Code example #1
File: gather.py Project: eipiplusun/irit-stac
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    if args.skip_training:
        tdir = latest_tmp()
    else:
        tdir = current_tmp()
        extract_features(TRAINING_CORPUS, tdir, strip_mode=args.strip_mode)

    if TEST_CORPUS is not None:
        vocab_path = fp.join(tdir,
                             (fp.basename(TRAINING_CORPUS) +
                              '.relations.sparse.vocab'))
        extract_features(TEST_CORPUS, tdir,
                         vocab_path=vocab_path,
                         strip_mode=args.strip_mode)

    with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)

    if not args.skip_training:
        latest_dir = latest_tmp()
        force_symlink(fp.basename(tdir), latest_dir)
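All of the snippets on this page lean on the same small set of imports and project-local helpers. A minimal sketch of the assumed preamble (the constants such as TRAINING_CORPUS, TEST_CORPUS, LEX_DIR or ANNOTATORS, and helpers such as latest_tmp, current_tmp or force_symlink, come from each project's own modules):

import os
import sys
from os import path as fp
from subprocess import call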
Code example #2
File: gather.py Project: popescuv/irit-stac
def extract_features(corpus, output_dir, vocab_path=None, strip_mode=None):
    """Extract features for a corpus, dump the instances.

    Run feature extraction for a particular corpus; and store the
    results in the output directory. Output file name will be
    computed from the corpus file name.

    This triggers two distinct processes, for pairs of EDUs then for
    single EDUs.

    Parameters
    ----------
    corpus: filepath
        Selected corpus
    output_dir: filepath
        Folder where instances will be dumped
    vocab_path: filepath
        Vocabulary to load for feature extraction (needed if extracting
        test data; must ensure we have the same vocab in test as we'd
        have in training)
    strip_mode: one of {'head', 'broadcast', 'custom'}
        Method to strip CDUs
    """
    # TODO: perhaps we could just directly invoke the appropriate
    # educe module here instead of going through the command line?
    cmd = [
        "stac-learning", "extract", corpus, LEX_DIR, output_dir, "--anno",
        ANNOTATORS
    ]
    if vocab_path is not None:
        cmd.extend(['--vocabulary', vocab_path])
    if strip_mode is not None:
        cmd.extend(['--strip-mode', strip_mode])
    call(cmd)
    call(cmd + ["--single"])
Code example #3
File: parse.py Project: eipiplusun/irit-stac
def _graph(lconf, log):
    "Visualise the parses"

    corpus_dir = minicorpus_path(lconf, result=True)
    cmd = ["stac-util", "graph", corpus_dir,
           "--output", corpus_dir]
    call(cmd, stderr=log)
Code example #4
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    if args.skip_training:
        tdir = latest_tmp()
    else:
        tdir = current_tmp()
        extract_features(TRAINING_CORPUS, tdir, args.coarse,
                         args.fix_pseudo_rels)
    if TEST_CORPUS is not None:
        train_path = fp.join(tdir, fp.basename(TRAINING_CORPUS))
        label_path = train_path + '.relations.sparse'
        vocab_path = label_path + '.vocab'
        extract_features(TEST_CORPUS, tdir, args.coarse,
                         args.fix_pseudo_rels,
                         vocab_path=vocab_path,
                         label_path=label_path)
    with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)
    if not args.skip_training:
        latest_dir = latest_tmp()
        force_symlink(fp.basename(tdir), latest_dir)
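The vocab and label paths above follow a plain suffix convention on the corpus basename; a small illustration with assumed values:

from os import path as fp

TRAINING_CORPUS = 'data/rst-dt/TRAINING'  # assumed value
tdir = 'TMP/2016-05-19T1200'              # assumed value

train_path = fp.join(tdir, fp.basename(TRAINING_CORPUS))
label_path = train_path + '.relations.sparse'
vocab_path = label_path + '.vocab'
print(label_path)  # TMP/2016-05-19T1200/TRAINING.relations.sparse
print(vocab_path)  # TMP/2016-05-19T1200/TRAINING.relations.sparse.vocab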
Code example #5
File: gather.py Project: kowey/irit-rst-dt
def extract_features(corpus, output_dir,
                     vocab_path=None,
                     label_path=None):
    """
    Run feature extraction for a particular corpus; and store the
    results in the output directory. Output file name will be
    computed from the corpus file name

    :type: corpus: filepath

    :param: vocab_path: vocabulary to load for feature extraction
    (needed if extracting test data; must ensure we have the same
    vocab in test as we'd have in training)
    """
    # TODO: perhaps we could just directly invoke the appropriate
    # educe module here instead of going through the command line?
    cmd = ["rst-dt-learning", "extract",
           corpus,
           PTB_DIR,
           output_dir,
           '--feature_set', FEATURE_SET]
    if vocab_path is not None:
        cmd.extend(['--vocabulary', vocab_path])
    if label_path is not None:
        cmd.extend(['--labels', label_path])
    call(cmd)
Code example #6
File: parse.py Project: popescuv/irit-stac
def _graph(lconf, log):
    "Visualise the parses"

    corpus_dir = minicorpus_path(lconf, result=True)
    cmd = ["stac-util", "graph", corpus_dir,
           "--output", corpus_dir]
    call(cmd, stderr=log)
Code example #7
def extract_features(corpus, output_dir, coarse, fix_pseudo_rels,
                     vocab_path=None,
                     label_path=None):
    """Extract instances from a corpus, store them in files.

    Run feature extraction for a particular corpus and store the
    results in the output directory. Output file name will be
    computed from the corpus file name.

    Parameters
    ----------
    corpus: filepath
        Path to the corpus.
    output_dir: filepath
        Path to the output folder.
    coarse: boolean
        If True, use coarse-grained relation labels.
    fix_pseudo_rels: boolean
        If True, rewrite pseudo-relations to improve consistency (WIP).
    vocab_path: filepath
        Path to a fixed vocabulary mapping, for feature extraction
        (needed if extracting test data: the same vocabulary should be
        used in train and test).
    label_path: filepath
        Path to a list of labels.
    """
    # TODO: perhaps we could just directly invoke the appropriate
    # educe module here instead of going through the command line?
    cmd = [
        "rst-dt-learning", "extract",
        corpus,
        PTB_DIR,  # TODO make this optional and exclusive from CoreNLP
        output_dir,
        '--feature_set', FEATURE_SET,
    ]
    # NEW 2016-05-19 rewrite pseudo-relations
    if fix_pseudo_rels:
        cmd.extend([
            '--fix_pseudo_rels'
        ])
    # NEW 2016-05-03 use coarse- or fine-grained relation labels
    # NB "coarse" was the previous default
    if coarse:
        cmd.extend([
            '--coarse'
        ])
    if CORENLP_OUT_DIR is not None:
        cmd.extend([
            '--corenlp_out_dir', CORENLP_OUT_DIR,
        ])
    if LECSIE_DATA_DIR is not None:
        cmd.extend([
            '--lecsie_data_dir', LECSIE_DATA_DIR,
        ])
    if vocab_path is not None:
        cmd.extend(['--vocabulary', vocab_path])
    if label_path is not None:
        cmd.extend(['--labels', label_path])
    call(cmd)
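A hedged usage sketch matching the call sites in Code example #4 (paths assumed):

# Hypothetical call, mirroring the training branch of `main` above
extract_features('data/rst-dt/TRAINING', 'TMP/2016-05-19T1200',
                 coarse=True, fix_pseudo_rels=False)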
Code example #8
File: features.py Project: padenis/irit-rst-dt
def main(_):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    call(["rst-dt-learning", "features"])
Code example #9
def _feature_extraction(lconf, log):
    """
    Extract features from our input glozz file
    """
    corpus_dir = minicorpus_path(lconf)
    vocab_path = lconf.mpack_paths(test_data=False)[3]
    cmd = [
        "stac-learning", "extract", "--parsing", "--vocab", vocab_path,
        corpus_dir,
        lconf.abspath(LEX_DIR), lconf.tmp_dir
    ]
    call(cmd, stderr=log)
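The magic index [3] above assumes mpack_paths returns a tuple of multipack file paths with the vocabulary in fourth position; a sketch of that assumption (names hypothetical, tuple assumed to have exactly four elements):

# Assumed layout of the tuple returned by lconf.mpack_paths
edu_input_path, pairings_path, features_path, vocab_path = \
    lconf.mpack_paths(test_data=False)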
Code example #10
File: parse.py Project: eipiplusun/irit-stac
def _resource_extraction(lconf, log):
    """
    Using a previously predicted dialogue act model,
    guess dialogue acts for all the EDUs
    """
    corpus_dir = minicorpus_path(lconf)
    cmd = ["stac-learning", "resource-nps",
           corpus_dir,
           lconf.abspath(LEX_DIR),
           "--output",
           resource_np_path(lconf)]
    call(cmd, stderr=log)
Code example #11
def _resource_extraction(lconf, log):
    """
    Using a previously predicted dialogue act model,
    guess dialogue acts for all the EDUs
    """
    corpus_dir = minicorpus_path(lconf)
    cmd = [
        "stac-learning", "resource-nps", corpus_dir,
        lconf.abspath(LEX_DIR), "--output",
        resource_np_path(lconf)
    ]
    call(cmd, stderr=log)
Code example #12
File: parse.py Project: eipiplusun/irit-stac
def _feature_extraction(lconf, log):
    """
    Extract features from our input glozz file
    """
    corpus_dir = minicorpus_path(lconf)
    vocab_path = lconf.mpack_paths(test_data=False)[3]
    cmd = ["stac-learning", "extract",
           "--parsing",
           "--vocab", vocab_path,
           corpus_dir,
           lconf.abspath(LEX_DIR),
           lconf.tmp_dir]
    call(cmd, stderr=log)
Code example #13
File: gather.py Project: padenis/irit-rst-dt
def main(_):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    tdir = current_tmp()
    call(["rst-dt-learning", "extract", TRAINING_CORPUS, PTB_DIR, tdir,
          '--feature_set', FEATURE_SET])
    with open(os.path.join(tdir, "versions-gather.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)
    latest_dir = latest_tmp()
    force_symlink(os.path.basename(tdir), latest_dir)
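current_tmp, latest_tmp and force_symlink recur throughout these gather mains; a hypothetical sketch of their contract (the real helpers live in each project's local modules):

import os
from datetime import datetime
from os import path as fp

TMP_DIR = 'TMP'  # assumed workspace location

def current_tmp():
    # Hypothetical sketch: a fresh timestamped dir for this gather run.
    tdir = fp.join(TMP_DIR, datetime.now().strftime('%Y-%m-%dT%H%M'))
    if not fp.exists(tdir):
        os.makedirs(tdir)
    return tdir

def latest_tmp():
    # Hypothetical sketch: the 'latest' symlink that force_symlink
    # repoints at the end of each gather run.
    return fp.join(TMP_DIR, 'latest')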
Code example #14
def _create_snapshot_dir(data_dir):
    """
    Instantiate a snapshot dir and return its path
    """

    bname = fp.basename(os.readlink(data_dir))
    snap_dir = fp.join(SNAPSHOTS, bname)
    if not fp.exists(snap_dir):
        os.makedirs(snap_dir)
        link_files(data_dir, snap_dir)
        force_symlink(bname, latest_snap())
    with open(fp.join(snap_dir, "versions-model.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)
    return snap_dir
Code example #15
File: model.py Project: moreymat/irit-stac
def _create_snapshot_dir(data_dir):
    """
    Instantiate a snapshot dir and return its path
    """

    bname = fp.basename(os.readlink(data_dir))
    snap_dir = fp.join(SNAPSHOTS, bname)
    if not fp.exists(snap_dir):
        os.makedirs(snap_dir)
        link_files(data_dir, snap_dir)
        force_symlink(bname, latest_snap())
    with open(fp.join(snap_dir, "versions-model.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)
    return snap_dir
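force_symlink is used here and in the gather mains as an ln -sf equivalent; a minimal sketch of that helper (an assumption, not the projects' actual code):

import os

def force_symlink(source, link_name):
    # Hypothetical sketch: replace link_name if it already exists,
    # then point it at source, like `ln -sf`.
    if os.path.islink(link_name) or os.path.exists(link_name):
        os.remove(link_name)
    os.symlink(source, link_name)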
Code example #16
File: evaluate.py Project: phimit/irit-rst-dt
def _mk_report(parent_dir, lconf, idx_file):
    "Generate reports for scores"
    score_prefix = _score_file_path_prefix(parent_dir, lconf)
    json_file = score_prefix + ".json"
    pretty_file = score_prefix + ".txt"

    with open(pretty_file, "w") as pretty_stream:
        call(["attelo", "report",
              idx_file,
              "--json", json_file],
             stdout=pretty_stream)

    print("Scores summarised in %s" % pretty_file,
          file=sys.stderr)
Code example #17
File: count.py Project: popescuv/irit-stac
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpora = [TRAINING_CORPUS]
    odir = get_output_dir(args)
    for corpus in corpora:
        ofilename = fp.join(odir, fp.basename(corpus) + ".txt")
        with open(ofilename, 'w') as ofile:
            call(["stac-util", "count", corpus, "--annotator", ANNOTATORS],
                 stdout=ofile)
    announce_output_dir(odir)
Code example #18
File: count.py Project: eipiplusun/irit-stac
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpora = [TRAINING_CORPUS]
    odir = get_output_dir(args)
    for corpus in corpora:
        ofilename = fp.join(odir, fp.basename(corpus) + ".txt")
        with open(ofilename, 'w') as ofile:
            call(["stac-util", "count", corpus,
                  "--annotator", ANNOTATORS],
                 stdout=ofile)
    announce_output_dir(odir)
Code example #19
File: evaluate.py Project: kowey/attelo
def prepare_dirs(runcfg, data_dir):
    """
    Return eval and scratch directory paths
    """
    eval_prefix = fp.join(data_dir, "eval")
    scratch_prefix = fp.join(data_dir, "scratch")

    eval_current = eval_prefix + '-current'
    scratch_current = scratch_prefix + '-current'
    stage = runcfg.stage

    if (runcfg.mode == 'resume' or stage in [ClusterStage.main,
                                             ClusterStage.combined_models,
                                             ClusterStage.end]):
        if not fp.exists(eval_current) or not fp.exists(scratch_current):
            sys.exit("No currently running evaluation to resume!")
        else:
            eval_dir = fp.realpath(eval_current)
            scratch_dir = fp.realpath(scratch_current)
            # in case there are any new data files to link
            _link_data_files(data_dir, eval_dir)
            return eval_dir, scratch_dir
    else:
        eval_actual_old = fp.realpath(eval_current)
        scratch_actual_old = fp.realpath(scratch_current)
        tstamp = timestamp()
        if _create_tstamped_dir(eval_prefix, tstamp):
            eval_dir = eval_prefix + '-' + tstamp
            scratch_dir = scratch_prefix + '-' + tstamp
            _create_tstamped_dir(scratch_prefix, tstamp)
            _link_data_files(data_dir, eval_dir)
            if runcfg.stage == 'jumpstart':
                _link_fold_files(eval_actual_old, eval_dir)
                _link_model_files(scratch_actual_old, scratch_dir)
        else:
            sys.exit("Try again in one minute")

        with open(fp.join(eval_dir, "versions-evaluate.txt"), "w") as stream:
            call(["pip", "freeze"], stdout=stream)

        return eval_dir, scratch_dir
Code example #20
def prepare_dirs(runcfg, data_dir):
    """
    Return eval and scratch directory paths
    """
    eval_prefix = fp.join(data_dir, "eval")
    scratch_prefix = fp.join(data_dir, "scratch")

    eval_current = eval_prefix + '-current'
    scratch_current = scratch_prefix + '-current'
    stage = runcfg.stage

    if (runcfg.mode == 'resume' or stage in [
            ClusterStage.main, ClusterStage.combined_models, ClusterStage.end
    ]):
        if not fp.exists(eval_current) or not fp.exists(scratch_current):
            sys.exit("No currently running evaluation to resume!")
        else:
            eval_dir = fp.realpath(eval_current)
            scratch_dir = fp.realpath(scratch_current)
            # in case there are any new data files to link
            _link_data_files(data_dir, eval_dir)
            return eval_dir, scratch_dir
    else:
        eval_actual_old = fp.realpath(eval_current)
        scratch_actual_old = fp.realpath(scratch_current)
        tstamp = timestamp()
        if _create_tstamped_dir(eval_prefix, tstamp):
            eval_dir = eval_prefix + '-' + tstamp
            scratch_dir = scratch_prefix + '-' + tstamp
            _create_tstamped_dir(scratch_prefix, tstamp)
            _link_data_files(data_dir, eval_dir)
            if runcfg.stage == 'jumpstart':
                _link_fold_files(eval_actual_old, eval_dir)
                _link_model_files(scratch_actual_old, scratch_dir)
        else:
            sys.exit("Try again in one minute")

        with open(fp.join(eval_dir, "versions-evaluate.txt"), "w") as stream:
            call(["pip", "freeze"], stdout=stream)

        return eval_dir, scratch_dir
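The "Try again in one minute" exit suggests timestamp is minute-granular and _create_tstamped_dir declines to recreate an existing directory; a hypothetical sketch of both helpers:

import os
from datetime import datetime
from os import path as fp

def timestamp():
    # Hypothetical: minute-granularity stamp, hence the one-minute retry.
    return datetime.now().strftime('%Y-%m-%dT%H%M')

def _create_tstamped_dir(prefix, tstamp):
    # Hypothetical sketch: create `<prefix>-<tstamp>` and report whether
    # this run created it (False means another run in the same minute
    # got there first).
    tdir = prefix + '-' + tstamp
    if fp.exists(tdir):
        return False
    os.makedirs(tdir)
    return True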
Code example #21
File: evaluate.py Project: padenis/irit-rst-dt
def _create_eval_dirs(args, data_dir, jumpstart):
    """
    Return eval and scratch directory paths
    """

    eval_current = fp.join(data_dir, "eval-current")
    scratch_current = fp.join(data_dir, "scratch-current")
    stage = args_to_stage(args)

    if args.resume or stage in [ClusterStage.main,
                                ClusterStage.combined_models,
                                ClusterStage.end]:
        if not fp.exists(eval_current) or not fp.exists(scratch_current):
            sys.exit("No currently running evaluation to resume!")
        else:
            return eval_current, scratch_current
    else:
        tstamp = "TEST" if _DEBUG else timestamp()
        eval_dir = fp.join(data_dir, "eval-" + tstamp)
        if not fp.exists(eval_dir):
            os.makedirs(eval_dir)
            _link_data_files(data_dir, eval_dir)
            force_symlink(fp.basename(eval_dir), eval_current)
        elif not _DEBUG:
            sys.exit("Try again in one minute")

        scratch_dir = fp.join(data_dir, "scratch-" + tstamp)
        if not fp.exists(scratch_dir):
            os.makedirs(scratch_dir)
            if jumpstart:
                _link_model_files(scratch_current, scratch_dir)
            force_symlink(fp.basename(scratch_dir), scratch_current)

        with open(fp.join(eval_dir, "versions-evaluate.txt"), "w") as stream:
            call(["pip", "freeze"], stdout=stream)

        return eval_dir, scratch_dir
Code example #22
File: gather.py Project: eipiplusun/irit-stac
def extract_features(corpus, output_dir,
                     vocab_path=None, strip_mode=None):
    """Extract features for a corpus, dump the instances.

    Run feature extraction for a particular corpus; and store the
    results in the output directory. Output file name will be
    computed from the corpus file name.

    This triggers two distinct processes, for pairs of EDUs then for
    single EDUs.

    Parameters
    ----------
    corpus: filepath
        Selected corpus
    output_dir: filepath
        Folder where instances will be dumped
    vocab_path: filepath
        Vocabulary to load for feature extraction (needed if extracting
        test data; must ensure we have the same vocab in test as we'd
        have in training)
    strip_mode: one of {'head', 'broadcast', 'custom'}
        Method to strip CDUs
    """
    # TODO: perhaps we could just directly invoke the appropriate
    # educe module here instead of going through the command line?
    cmd = ["stac-learning", "extract",
           corpus,
           LEX_DIR,
           output_dir,
           "--anno", ANNOTATORS]
    if vocab_path is not None:
        cmd.extend(['--vocabulary', vocab_path])
    if strip_mode is not None:
        cmd.extend(['--strip-mode', strip_mode])
    call(cmd)
    call(cmd + ["--single"])
Code example #23
File: evaluate.py Project: phimit/irit-rst-dt
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    data_dir = latest_tmp()
    if not os.path.exists(data_dir):
        _exit_ungathered()
    eval_dir, scratch_dir = _create_eval_dirs(args, data_dir)

    with open(os.path.join(eval_dir, "versions.txt"), "w") as stream:
        call(["pip", "freeze"], stdout=stream)

    for corpus in TRAINING_CORPORA:
        dataset = os.path.basename(corpus)
        fold_file = os.path.join(eval_dir,
                                 "folds-%s.json" % dataset)
        lconf = LoopConfig(eval_dir=eval_dir,
                           scratch_dir=scratch_dir,
                           fold_file=fold_file,
                           dataset=dataset)
        _do_corpus(lconf)
Code example #24
File: pipeline.py Project: eipiplusun/irit-stac
def pyt(self, script, *args, **kwargs):
    "call python on one of our scripts"
    abs_script = self.abspath(script)
    cmd = ["python", abs_script] + list(args)
    call(cmd, **kwargs)
Code example #25
File: pipeline.py Project: popescuv/irit-stac
def pyt(self, script, *args, **kwargs):
    "call python on one of our scripts"
    abs_script = self.abspath(script)
    cmd = ["python", abs_script] + list(args)
    call(cmd, **kwargs)
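A hedged usage sketch of pyt from a call site (script name, arguments and log handle all hypothetical):

# Hypothetical usage, assuming `lconf` is the pipeline object that
# defines `pyt` above:
with open('segment.log', 'w') as log:
    lconf.pyt('scripts/segment.py', '--input', 'game.soclog',
              stderr=log)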