Example #1
0
def main_single(args):
    """Extract feature vectors for single EDUs in the corpus."""
    inputs = features.read_corpus_inputs(args)
    stage = 'unannotated' if args.parsing else 'units'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.dialogue-acts.sparse')
    instance_generator = lambda x: x.edus[1:]  # drop fake root

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_single_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    # TODO? just transform() if args.parsing or args.vocabulary?
    X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = DialogueActVectorizer(instance_generator, DIALOGUE_ACTS)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    # list dialogue acts
    comment = labels_comment(labtor.labelset_)

    # dump: EDUs, pairings, vectorized pairings with label
    edu_input_file = out_file + '.edu_input'
    dump_edu_input_file(dialogues, edu_input_file)
    dump_svmlight_file(X_gen, y_gen, out_file, comment=comment)

    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
Example #2
0
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    inputs = features.read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.relations.sparse')
    instance_generator = lambda x: x.edu_pairs()

    labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels, zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
Example #3
0
def main_single(args):
    """Extract feature vectors for single EDUs in the corpus."""
    inputs = features.read_corpus_inputs(args)
    stage = 'unannotated' if args.parsing else 'units'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.dialogue-acts.sparse')
    instance_generator = lambda x: x.edus[1:]  # drop fake root

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_single_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    # TODO? just transform() if args.parsing or args.vocabulary?
    X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = DialogueActVectorizer(instance_generator, DIALOGUE_ACTS)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    # list dialogue acts
    comment = labels_comment(labtor.labelset_)

    # dump: EDUs, pairings, vectorized pairings with label
    edu_input_file = out_file + '.edu_input'
    dump_edu_input_file(dialogues, edu_input_file)
    dump_svmlight_file(X_gen, y_gen, out_file, comment=comment)

    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
Example #4
0
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    inputs = read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    instance_generator = lambda x: x.edu_pairs()

    labels = frozenset(SUBORDINATING_RELATIONS +
                       COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # X, y follow the naming convention in sklearn
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels,
                             zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    # create directory structure
    outdir = args.output
    if not fp.exists(outdir):
        os.makedirs(outdir)

    corpus_name = fp.basename(args.corpus)
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(
        outdir,
        '{corpus_name}.relations.sparse'.format(
            corpus_name=corpus_name))

    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = fp.join(outdir,
                         '{corpus_name}.relations.sparse.vocab'.format(
                             corpus_name=corpus_name))
    dump_vocabulary(vzer.vocabulary_, vocab_file)
Example #5
0
def main_pairs(args):
    """
    The usual main. Extract feature vectors from the corpus
    """
    inputs = features.read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output, fp.basename(args.corpus))
    out_file += '.relations.sparse'
    instance_generator = lambda x: x.edu_pairs()

    labels = frozenset(SUBORDINATING_RELATIONS +
                       COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels,
                             zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen,
             y_gen,
             out_file,
             labtor.labelset_,
             dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)