def main_single(args):
    """Extract feature vectors for single EDUs in the corpus.

    Reads the corpus described by ``args``, extracts one feature vector
    per EDU (skipping the fake root), vectorizes them, and dumps the EDU
    listing, the svmlight-format matrix and the vocabulary under
    ``args.output``.

    Parameters
    ----------
    args : argparse.Namespace
        Expected attributes: ``corpus``, ``output``, ``parsing`` and
        (optionally) ``vocabulary`` — a path to a previously extracted
        vocabulary to reuse.
    """
    inputs = features.read_corpus_inputs(args)
    stage = 'unannotated' if args.parsing else 'units'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.dialogue-acts.sparse')
    instance_generator = lambda x: x.edus[1:]  # drop fake root

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_single_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    # reuse a previously extracted vocabulary when one is supplied,
    # mirroring main_pairs (resolves the old TODO); getattr keeps this
    # backward-compatible with callers whose args lack 'vocabulary'
    vocab_path = getattr(args, 'vocabulary', None)
    if vocab_path:
        vzer.vocabulary_ = load_vocabulary(vocab_path)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = DialogueActVectorizer(instance_generator, DIALOGUE_ACTS)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    # list dialogue acts
    comment = labels_comment(labtor.labelset_)

    # dump: EDUs, pairings, vectorized pairings with label
    edu_input_file = out_file + '.edu_input'
    dump_edu_input_file(dialogues, edu_input_file)
    dump_svmlight_file(X_gen, y_gen, out_file, comment=comment)

    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    corpus_inputs = features.read_corpus_inputs(args)
    anno_stage = 'units' if args.parsing else 'discourse'
    dlgs = list(mk_high_level_dialogues(corpus_inputs, anno_stage))

    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.relations.sparse')

    def gen_instances(dlg):
        """Candidate instances for a dialogue: all of its EDU pairs."""
        return dlg.edu_pairs()

    rel_labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    pair_feats = extract_pair_features(corpus_inputs, anno_stage)
    vectorizer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        # reuse an existing vocabulary rather than fitting a new one
        vectorizer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vectorizer.transform(pair_feats)
    else:
        X_gen = vectorizer.fit_transform(pair_feats)
    # pylint: enable=invalid-name

    label_vec = LabelVectorizer(gen_instances, rel_labels, zero=args.parsing)
    y_gen = label_vec.transform(dlgs)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen, y_gen, out_file, label_vec.labelset_, dlgs,
             gen_instances)

    # dump vocabulary
    dump_vocabulary(vectorizer.vocabulary_, out_file + '.vocab')
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    corpus_inputs = read_corpus_inputs(args)
    anno_stage = 'units' if args.parsing else 'discourse'
    dlgs = list(mk_high_level_dialogues(corpus_inputs, anno_stage))

    def gen_instances(dlg):
        """Candidate instances for a dialogue: all of its EDU pairs."""
        return dlg.edu_pairs()

    rel_labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # X, y follow the naming convention in sklearn
    pair_feats = extract_pair_features(corpus_inputs, anno_stage)
    vectorizer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        # reuse an existing vocabulary rather than fitting a new one
        vectorizer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vectorizer.transform(pair_feats)
    else:
        X_gen = vectorizer.fit_transform(pair_feats)
    # pylint: enable=invalid-name

    label_vec = LabelVectorizer(gen_instances, rel_labels, zero=args.parsing)
    y_gen = label_vec.transform(dlgs)

    # create directory structure
    outdir = args.output
    if not fp.exists(outdir):
        os.makedirs(outdir)
    corpus_name = fp.basename(args.corpus)

    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(
        outdir,
        '{corpus_name}.relations.sparse'.format(corpus_name=corpus_name))
    dump_all(X_gen, y_gen, out_file, label_vec.labelset_, dlgs,
             gen_instances)

    # dump vocabulary
    vocab_file = fp.join(
        outdir,
        '{corpus_name}.relations.sparse.vocab'.format(
            corpus_name=corpus_name))
    dump_vocabulary(vectorizer.vocabulary_, vocab_file)
def main_pairs(args):
    """
    The usual main. Extract feature vectors from the corpus
    """
    corpus_inputs = features.read_corpus_inputs(args)
    anno_stage = 'units' if args.parsing else 'discourse'
    dlgs = list(mk_high_level_dialogues(corpus_inputs, anno_stage))

    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output, fp.basename(args.corpus))
    out_file += '.relations.sparse'

    def gen_instances(dlg):
        """Candidate instances for a dialogue: all of its EDU pairs."""
        return dlg.edu_pairs()

    rel_labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    pair_feats = extract_pair_features(corpus_inputs, anno_stage)
    vectorizer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        # reuse an existing vocabulary rather than fitting a new one
        vectorizer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vectorizer.transform(pair_feats)
    else:
        X_gen = vectorizer.fit_transform(pair_feats)
    # pylint: enable=invalid-name

    label_vec = LabelVectorizer(gen_instances, rel_labels, zero=args.parsing)
    y_gen = label_vec.transform(dlgs)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen, y_gen, out_file, label_vec.labelset_, dlgs,
             gen_instances)

    # dump vocabulary
    dump_vocabulary(vectorizer.vocabulary_, out_file + '.vocab')