def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    inputs = features.read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.relations.sparse')
    instance_generator = lambda x: x.edu_pairs()
    labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels, zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
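# The vocabulary dump/load round trip above is what lets --parsing runs
# reuse the exact feature-to-column mapping from training.  Below is a
# minimal, hypothetical sketch of that round trip; the real
# load_vocabulary and dump_vocabulary live elsewhere in this codebase and
# may use a different file format.  Assumes one "feature<TAB>index" entry
# per line.
import codecs


def dump_vocabulary_sketch(vocabulary, vocab_file):
    """Write a feature-name -> column-index mapping, one entry per line."""
    with codecs.open(vocab_file, mode='w', encoding='utf-8') as ofile:
        for feat, idx in sorted(vocabulary.items(), key=lambda kv: kv[1]):
            ofile.write(u'{}\t{}\n'.format(feat, idx))


def load_vocabulary_sketch(vocab_file):
    """Read the mapping back into the dict shape vzer.vocabulary_ expects."""
    with codecs.open(vocab_file, mode='r', encoding='utf-8') as ifile:
        rows = (line.rstrip('\n').split('\t')
                for line in ifile if line.strip())
        return {feat: int(idx) for feat, idx in rows}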
def main_single(args):
    """Extract feature vectors for single EDUs in the corpus."""
    inputs = features.read_corpus_inputs(args)
    stage = 'unannotated' if args.parsing else 'units'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.dialogue-acts.sparse')
    instance_generator = lambda x: x.edus[1:]  # drop fake root

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_single_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    # TODO? just transform() if args.parsing or args.vocabulary?
    X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = DialogueActVectorizer(instance_generator, DIALOGUE_ACTS)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    # list dialogue acts
    comment = labels_comment(labtor.labelset_)

    # dump: EDUs, pairings, vectorized pairings with label
    edu_input_file = out_file + '.edu_input'
    dump_edu_input_file(dialogues, edu_input_file)
    dump_svmlight_file(X_gen, y_gen, out_file, comment=comment)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
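# dump_svmlight_file above writes the instances in svmlight/libsvm format.
# The toy example below is illustrative only, using scikit-learn's own
# dump_svmlight_file on made-up data to show the file layout.
import numpy as np
from sklearn.datasets import dump_svmlight_file as sk_dump_svmlight_file

X_toy = np.array([[0.0, 1.5],
                  [2.0, 0.0]])
y_toy = np.array([3, 7])  # integer label ids, as produced by the vectorizer
sk_dump_svmlight_file(X_toy, y_toy, 'toy.sparse', comment='toy example')
# toy.sparse now holds one instance per line: the label id first, then the
# nonzero feature_index:value pairs, e.g. "3 1:1.5" for the first row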
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    inputs = read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    instance_generator = lambda x: x.edu_pairs()
    labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # X, y follow the naming convention in sklearn
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels, zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    # create directory structure
    outdir = args.output
    if not fp.exists(outdir):
        os.makedirs(outdir)
    corpus_name = fp.basename(args.corpus)

    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(outdir,
                       '{corpus_name}.relations.sparse'.format(
                           corpus_name=corpus_name))
    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = fp.join(outdir,
                         '{corpus_name}.relations.sparse.vocab'.format(
                             corpus_name=corpus_name))
    dump_vocabulary(vzer.vocabulary_, vocab_file)
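# Hypothetical CLI wiring for main_pairs.  The flag names here are
# assumptions inferred from the attributes the function reads
# (args.corpus, args.output, args.parsing, args.vocabulary); the real
# parser in this project may define more options.
import argparse


def mk_pairs_parser_sketch():
    """Build a minimal argument parser matching what main_pairs reads."""
    psr = argparse.ArgumentParser(description='Extract EDU-pair features')
    psr.add_argument('corpus', help='corpus directory')
    psr.add_argument('output', help='output directory')
    psr.add_argument('--parsing', action='store_true',
                     help='live data: labels unknown, reuse training vocab')
    psr.add_argument('--vocabulary', metavar='FILE', default=None,
                     help='path to a previously dumped vocabulary')
    return psr


# usage: main_pairs(mk_pairs_parser_sketch().parse_args())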
def main_parsing_pairs(args):
    """
    Main to call when live data are passed in (--parsing).

    Live data are data that we want to run discourse parsing on, so we
    don't know whether they are attached or what the label is.

    As of 2014-08-19, there must be an 'unannotated' stage and an
    optional 'units' stage (for dialogue acts).
    """
    inputs = features.read_corpus_inputs(args, stage='units|unannotated')
    features_file = os.path.join(args.output, 'extracted-features.csv')
    with codecs.open(features_file, 'wb') as ofile:
        header = features.PairKeys(inputs)
        writer = mk_csv_writer(header, ofile)
        feats = features.extract_pair_features(inputs, args.window,
                                               live=True)
        for row, _ in feats:
            writer.writerow(row)
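# Hedged sketch of the mk_csv_writer helper used above: a csv.DictWriter
# keyed on the header's field names, with the header row written up front.
# The real helper (and the shape of the rows it is fed) may differ.
import csv


def mk_csv_writer_sketch(header, ofile):
    """Return a dict-based CSV writer whose header row is already written."""
    writer = csv.DictWriter(ofile, fieldnames=list(header))
    writer.writeheader()
    return writer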
def main_pairs(args):
    """
    The usual main. Extract feature vectors from the corpus.
    """
    inputs = features.read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output, fp.basename(args.corpus))
    out_file += '.relations.sparse'
    instance_generator = lambda x: x.edu_pairs()
    labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels, zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
def main_corpus_single(args):
    """
    The usual main. Extract feature vectors from the corpus
    (single EDUs only).
    """
    inputs = features.read_corpus_inputs(args)
    of_bn = os.path.join(args.output, os.path.basename(args.corpus))
    of_ext = '.csv'
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    just_edus_file = of_bn + '.just-edus' + of_ext
    with codecs.open(just_edus_file, 'wb') as ofile:
        gen = features.extract_single_features(inputs)
        try:
            _write_singles(gen, ofile)
        except StopIteration:
            # FIXME: I have a nagging feeling that we should properly
            # support this by just printing a CSV header and nothing
            # else, but I'm trying to minimise code paths and for now
            # failing in this corner case feels like a lesser evil :-/
            sys.exit("No features to extract!")
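# _write_singles is what makes the StopIteration handling above necessary:
# a plausible sketch is to peek at the first row to derive the CSV header,
# which raises StopIteration when the feature generator is empty.  This is
# an assumption about the helper, not its actual implementation, and it
# assumes dict-like feature rows.
import csv


def _write_singles_sketch(gen, ofile):
    """Stream feature rows to CSV, deriving the header from the first row."""
    first = next(gen)  # StopIteration here means "no features to extract"
    writer = csv.DictWriter(ofile, fieldnames=sorted(first.keys()))
    writer.writeheader()
    writer.writerow(first)
    for row in gen:
        writer.writerow(row)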
def command_annotate(args):
    """
    Top-level command: given a dialogue act model and a corpus with some
    Glozz documents, perform dialogue act annotation and simple addressee
    detection on them, then dump the Glozz documents in the output
    directory.
    """
    args.ignore_cdus = False
    args.parsing = True
    args.single = True
    args.strip_mode = 'head'  # FIXME should not be specified here
    inputs = stac_features.read_corpus_inputs(args)
    model = joblib.load(args.model)
    vocab = {f: i for i, f in enumerate(load_vocab(args.vocabulary))}
    labels = load_labels(args.labels)
    # add dialogue acts and addressees
    annotate_edus(model, vocab, labels, inputs)
    # corpus has been modified in-memory, now save to disk
    for key in inputs.corpus:
        key2 = _output_key(key)
        doc = inputs.corpus[key]
        save_document(args.output, key2, doc)
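# joblib.load above deserializes a model persisted with joblib.dump.  A
# minimal round trip with a scikit-learn classifier, illustrative only;
# the file name and toy data are made up.
import joblib
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
joblib.dump(clf, 'dialogue-acts.model')   # persist to disk
clf2 = joblib.load('dialogue-acts.model')  # restore, ready to predict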
def command_annotate(args):
    """
    Top-level command: given a dialogue act model and a corpus with some
    Glozz documents, perform dialogue act annotation and simple addressee
    detection on them, then dump the Glozz documents in the output
    directory.
    """
    args.ignore_cdus = False
    args.parsing = True
    args.single = True
    inputs = stac_features.read_corpus_inputs(args)
    model = load_model(args.model)
    vocab = {f: i for i, f in enumerate(load_vocab(args.vocabulary))}
    labels = load_labels(args.labels)
    # add dialogue acts and addressees
    annotate_edus(model, vocab, labels, inputs)
    # corpus has been modified in-memory, now save to disk
    for key in inputs.corpus:
        key2 = _output_key(key)
        doc = inputs.corpus[key]
        save_document(args.output, key2, doc)
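# Hypothetical direct invocation of command_annotate, bypassing whatever
# CLI normally builds args.  The attribute names mirror what the function
# reads (model, vocabulary, labels, output, plus whatever
# read_corpus_inputs expects, e.g. a corpus path); all values are made up.
from argparse import Namespace

annotate_args = Namespace(corpus='path/to/corpus',
                          model='dialogue-acts.model',
                          vocabulary='dialogue-acts.sparse.vocab',
                          labels='dialogue-acts.labels',
                          output='path/to/output')
command_annotate(annotate_args)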