Python KeyGroupVectorizer Examples

Programming Language: Python

Namespace/Package Name: educe.learning.keygroup_vectorizer

Examples at hotexamples.com: 5

Python KeyGroupVectorizer - 5 examples found. These are the top-rated real-world Python examples of educe.learning.keygroup_vectorizer.KeyGroupVectorizer, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

KeyGroupVectorizer(2)

fit_transform(2)

transform(1)

vocabulary_(1)

Example #1

Show file

File: extract.py Project: Sablayrolles/debates

def main_single(args):
    """Vectorize single EDUs from the corpus and dump the results.

    Reads the corpus inputs selected by ``args``, runs the single-EDU
    feature extraction through a ``KeyGroupVectorizer`` and the dialogues
    through a ``DialogueActVectorizer``, then writes the EDU input file,
    the svmlight file and the vocabulary next to each other under
    ``args.output``.
    """
    def instance_generator(dialogue):
        """Return the EDUs of a dialogue without the fake root."""
        return dialogue.edus[1:]

    inputs = features.read_corpus_inputs(args)
    if args.parsing:
        stage = 'unannotated'
    else:
        stage = 'units'
    dialogues = list(mk_high_level_dialogues(inputs, stage))

    # these paths should go away once we switch to a proper dumper
    sparse_name = fp.basename(args.corpus) + '.dialogue-acts.sparse'
    sparse_path = fp.join(args.output, sparse_name)

    # pylint: disable=invalid-name
    # X / y naming follows the scikit convention
    single_feats = extract_single_features(inputs, stage)
    vectorizer = KeyGroupVectorizer()
    # TODO? just transform() if args.parsing or args.vocabulary?
    X_gen = vectorizer.fit_transform(single_feats)
    # pylint: enable=invalid-name
    label_vectorizer = DialogueActVectorizer(instance_generator,
                                             DIALOGUE_ACTS)
    y_gen = label_vectorizer.transform(dialogues)

    # make sure the output directory is there before anything is written
    if not fp.exists(args.output):
        os.makedirs(args.output)

    # the header comment lists the dialogue acts
    header = labels_comment(label_vectorizer.labelset_)

    # dump: EDUs, then the vectorized instances with their labels
    dump_edu_input_file(dialogues, sparse_path + '.edu_input')
    dump_svmlight_file(X_gen, y_gen, sparse_path, comment=header)

    # dump vocabulary
    dump_vocabulary(vectorizer.vocabulary_, sparse_path + '.vocab')

Example #2

Show file

File: extract.py Project: Sablayrolles/debates

def main_pairs(args):
    """Vectorize pairs of EDUs from the corpus and dump the results.

    When ``args.parsing`` or ``args.vocabulary`` is set, the vocabulary is
    loaded with ``load_vocabulary`` and the features are only transformed;
    otherwise the vectorizer is fit on the extracted features. The
    vectorized pairs, their labels and the vocabulary are written under
    ``args.output``.
    """
    def instance_generator(dialogue):
        """Return the EDU pairs of a dialogue."""
        return dialogue.edu_pairs()

    inputs = features.read_corpus_inputs(args)
    if args.parsing:
        stage = 'units'
    else:
        stage = 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))

    # these paths should go away once we switch to a proper dumper
    sparse_name = fp.basename(args.corpus) + '.relations.sparse'
    sparse_path = fp.join(args.output, sparse_name)

    relation_labels = frozenset(SUBORDINATING_RELATIONS +
                                COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # X / y naming follows the scikit convention
    pair_feats = extract_pair_features(inputs, stage)
    vectorizer = KeyGroupVectorizer()
    if not (args.parsing or args.vocabulary):
        # nothing to reuse: learn the vocabulary from the features
        X_gen = vectorizer.fit_transform(pair_feats)
    else:
        vectorizer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vectorizer.transform(pair_feats)
    # pylint: enable=invalid-name
    label_vectorizer = LabelVectorizer(instance_generator,
                                       relation_labels,
                                       zero=args.parsing)
    y_gen = label_vectorizer.transform(dialogues)

    # make sure the output directory is there before anything is written
    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen,
             y_gen,
             sparse_path,
             label_vectorizer.labelset_,
             dialogues,
             instance_generator)

    # dump vocabulary
    dump_vocabulary(vectorizer.vocabulary_, sparse_path + '.vocab')

Example #3

Show file

File: extract.py Project: eipiplusun/educe

def main_single(args):
    """Produce feature vectors for the individual EDUs of the corpus.

    The corpus inputs are read according to ``args``; features come from
    ``extract_single_features`` and labels from a
    ``DialogueActVectorizer``. Output files (EDU input, svmlight data,
    vocabulary) share a common path prefix inside ``args.output``.
    """
    inputs = features.read_corpus_inputs(args)
    if args.parsing:
        stage = 'unannotated'
    else:
        stage = 'units'
    dialogues = list(mk_high_level_dialogues(inputs, stage))

    # these paths should go away once we switch to a proper dumper
    corpus_base = fp.basename(args.corpus)
    out_prefix = fp.join(args.output, corpus_base + '.dialogue-acts.sparse')

    def instance_generator(dialogue):
        """Yield-free helper: all EDUs of the dialogue but the fake root."""
        return dialogue.edus[1:]

    # pylint: disable=invalid-name
    # X / y naming follows the scikit convention
    edu_feats = extract_single_features(inputs, stage)
    key_vectorizer = KeyGroupVectorizer()
    # TODO? just transform() if args.parsing or args.vocabulary?
    X_gen = key_vectorizer.fit_transform(edu_feats)
    # pylint: enable=invalid-name
    act_vectorizer = DialogueActVectorizer(instance_generator,
                                           DIALOGUE_ACTS)
    y_gen = act_vectorizer.transform(dialogues)

    # create the output directory on first use
    if not fp.exists(args.output):
        os.makedirs(args.output)

    # the svmlight comment lists the dialogue acts
    acts_comment = labels_comment(act_vectorizer.labelset_)

    # dump: EDUs first, then the vectorized instances with their labels
    edu_input_path = out_prefix + '.edu_input'
    dump_edu_input_file(dialogues, edu_input_path)
    dump_svmlight_file(X_gen, y_gen, out_prefix, comment=acts_comment)

    # dump vocabulary
    vocab_path = out_prefix + '.vocab'
    dump_vocabulary(key_vectorizer.vocabulary_, vocab_path)

Example #4

Show file

File: extract.py Project: irit-melodi/educe

def main_pairs(args):
    """Produce feature vectors for the EDU pairs of the corpus.

    Features are vectorized with a ``KeyGroupVectorizer``: an existing
    vocabulary is loaded and applied when ``args.parsing`` or
    ``args.vocabulary`` is set, otherwise a new one is fit. Labels come
    from a ``LabelVectorizer``. The output directory ``args.output`` is
    created if missing, and the sparse data and vocabulary files are
    named after the basename of ``args.corpus``.
    """
    def instance_generator(dialogue):
        """Return the EDU pairs of a dialogue."""
        return dialogue.edu_pairs()

    inputs = read_corpus_inputs(args)
    if args.parsing:
        stage = 'units'
    else:
        stage = 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))

    relation_labels = frozenset(SUBORDINATING_RELATIONS +
                                COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # X, y follow the naming convention in sklearn
    pair_feats = extract_pair_features(inputs, stage)
    vectorizer = KeyGroupVectorizer()
    if not (args.parsing or args.vocabulary):
        # no vocabulary to reuse: fit one on the extracted features
        X_gen = vectorizer.fit_transform(pair_feats)
    else:
        vectorizer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vectorizer.transform(pair_feats)
    # pylint: enable=invalid-name
    label_vectorizer = LabelVectorizer(instance_generator,
                                       relation_labels,
                                       zero=args.parsing)
    y_gen = label_vectorizer.transform(dialogues)

    # create directory structure
    outdir = args.output
    if not fp.exists(outdir):
        os.makedirs(outdir)

    corpus_name = fp.basename(args.corpus)
    # these paths should go away once we switch to a proper dumper
    sparse_name = '{corpus_name}.relations.sparse'.format(
        corpus_name=corpus_name)
    sparse_path = fp.join(outdir, sparse_name)

    dump_all(X_gen,
             y_gen,
             sparse_path,
             label_vectorizer.labelset_,
             dialogues,
             instance_generator)

    # dump vocabulary
    vocab_name = '{corpus_name}.relations.sparse.vocab'.format(
        corpus_name=corpus_name)
    vocab_path = fp.join(outdir, vocab_name)
    dump_vocabulary(vectorizer.vocabulary_, vocab_path)

Example #5

Show file

File: extract.py Project: kowey/educe

def main_pairs(args):
    """Extract and dump feature vectors for EDU pairs in the corpus.

    This is the usual entry point: it reads the corpus inputs, vectorizes
    the pair features (reusing a vocabulary loaded from
    ``args.vocabulary`` when ``args.parsing`` or ``args.vocabulary`` is
    set, fitting a new one otherwise), vectorizes the labels, and writes
    everything under ``args.output``.
    """
    def instance_generator(dialogue):
        """Return the EDU pairs of a dialogue."""
        return dialogue.edu_pairs()

    inputs = features.read_corpus_inputs(args)
    if args.parsing:
        stage = 'units'
    else:
        stage = 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))

    # these paths should go away once we switch to a proper dumper
    sparse_path = (fp.join(args.output, fp.basename(args.corpus)) +
                   '.relations.sparse')

    relation_labels = frozenset(SUBORDINATING_RELATIONS +
                                COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # X / y naming follows the scikit convention
    pair_feats = extract_pair_features(inputs, stage)
    vectorizer = KeyGroupVectorizer()
    if not (args.parsing or args.vocabulary):
        # no vocabulary to reuse: fit one on the extracted features
        X_gen = vectorizer.fit_transform(pair_feats)
    else:
        vectorizer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vectorizer.transform(pair_feats)
    # pylint: enable=invalid-name
    label_vectorizer = LabelVectorizer(instance_generator,
                                       relation_labels,
                                       zero=args.parsing)
    y_gen = label_vectorizer.transform(dialogues)

    # create the output directory on first use
    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen, y_gen, sparse_path, label_vectorizer.labelset_,
             dialogues, instance_generator)

    # dump vocabulary
    dump_vocabulary(vectorizer.vocabulary_, sparse_path + '.vocab')