def main_single(args):
    """Extract feature vectors for single EDUs in the corpus."""
    inputs = features.read_corpus_inputs(args)
    stage = 'unannotated' if args.parsing else 'units'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.dialogue-acts.sparse')
    instance_generator = lambda x: x.edus[1:]  # drop fake root

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_single_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    # TODO? just transform() if args.parsing or args.vocabulary?
    X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = DialogueActVectorizer(instance_generator, DIALOGUE_ACTS)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    # list dialogue acts
    comment = labels_comment(labtor.labelset_)

    # dump: EDUs and their vectorized features, with dialogue act labels
    edu_input_file = out_file + '.edu_input'
    dump_edu_input_file(dialogues, edu_input_file)
    dump_svmlight_file(X_gen, y_gen, out_file, comment=comment)

    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
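# Hedged sketch (not part of the pipeline above): the .sparse file written by
# dump_svmlight_file is in svmlight/libsvm format, so it can be inspected or
# reloaded with scikit-learn's loader. The helper name and the expectation
# that the path follows main_single's naming scheme
# (<corpus basename>.dialogue-acts.sparse) are assumptions for illustration;
# the companion .edu_input and .vocab files use project-specific formats and
# are not covered here.
def _inspect_dialogue_act_dump(out_file):
    """Reload a dumped dialogue-act feature file (illustrative only)."""
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(out_file)
    print('{} instances, {} features, {} distinct labels'.format(
        X.shape[0], X.shape[1], len(set(y))))
    return X, y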
def main_pairs(args): """Extract feature vectors for pairs of EDUs in the corpus.""" inputs = features.read_corpus_inputs(args) stage = 'units' if args.parsing else 'discourse' dialogues = list(mk_high_level_dialogues(inputs, stage)) # these paths should go away once we switch to a proper dumper out_file = fp.join(args.output, fp.basename(args.corpus) + '.relations.sparse') instance_generator = lambda x: x.edu_pairs() labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS) # pylint: disable=invalid-name # scikit-convention feats = extract_pair_features(inputs, stage) vzer = KeyGroupVectorizer() if args.parsing or args.vocabulary: vzer.vocabulary_ = load_vocabulary(args.vocabulary) X_gen = vzer.transform(feats) else: X_gen = vzer.fit_transform(feats) # pylint: enable=invalid-name labtor = LabelVectorizer(instance_generator, labels, zero=args.parsing) y_gen = labtor.transform(dialogues) if not fp.exists(args.output): os.makedirs(args.output) dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues, instance_generator) # dump vocabulary vocab_file = out_file + '.vocab' dump_vocabulary(vzer.vocabulary_, vocab_file)
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    inputs = read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    instance_generator = lambda x: x.edu_pairs()
    labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # X, y follow the naming convention in sklearn
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels, zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    # create directory structure
    outdir = args.output
    if not fp.exists(outdir):
        os.makedirs(outdir)
    corpus_name = fp.basename(args.corpus)

    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(
        outdir,
        '{corpus_name}.relations.sparse'.format(corpus_name=corpus_name))
    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = fp.join(
        outdir,
        '{corpus_name}.relations.sparse.vocab'.format(
            corpus_name=corpus_name))
    dump_vocabulary(vzer.vocabulary_, vocab_file)
def main_pairs(args):
    """
    The usual main. Extract feature vectors from the corpus
    """
    inputs = features.read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output, fp.basename(args.corpus))
    out_file += '.relations.sparse'
    instance_generator = lambda x: x.edu_pairs()
    labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels, zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
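# Hedged analogy (an assumption, not the project's API): the fit_transform /
# transform split in main_pairs above mirrors scikit-learn's usual pattern of
# learning a vocabulary on training data and reusing it, frozen, at parse
# time so that feature column indices stay aligned between the two runs.
# KeyGroupVectorizer is project-specific; CountVectorizer is used here only
# to illustrate the pattern.
def _fixed_vocab_demo(train_texts, parse_texts):
    """Illustrate fit-time vs. reuse-time vectorization (illustrative only)."""
    from sklearn.feature_extraction.text import CountVectorizer
    # training run: learn the vocabulary and vectorize
    train_vzer = CountVectorizer()
    X_train = train_vzer.fit_transform(train_texts)
    # parsing run: reuse the saved vocabulary, never re-fit
    parse_vzer = CountVectorizer(vocabulary=train_vzer.vocabulary_)
    X_parse = parse_vzer.transform(parse_texts)
    assert X_train.shape[1] == X_parse.shape[1]  # same feature space
    return X_train, X_parse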
def main(args):
    "main for feature extraction mode"
    # retrieve parameters
    feature_set = args.feature_set
    live = args.parsing

    # RST data
    rst_reader = RstDtParser(args.corpus, args, coarse_rels=True)
    rst_corpus = rst_reader.corpus
    # TODO: change educe.corpus.Reader.slurp*() so that they return an object
    # which contains a *list* of FileIds and a *list* of annotations
    # (see sklearn's Bunch)
    # on creation of these lists, one can impose the list of names to be
    # sorted so that the order in which docs are iterated is guaranteed
    # to be always the same

    # PTB data
    ptb_parser = PtbParser(args.ptb)

    # align EDUs with sentences, tokens and trees from PTB
    def open_plus(doc):
        """Open and fully load a document

        doc is an educe.corpus.FileId
        """
        # create a DocumentPlus
        doc = rst_reader.decode(doc)
        # populate it with layers of info
        # tokens
        doc = ptb_parser.tokenize(doc)
        # syn parses
        doc = ptb_parser.parse(doc)
        # disc segments
        doc = rst_reader.segment(doc)
        # disc parse
        doc = rst_reader.parse(doc)
        # pre-compute the relevant info for each EDU
        doc = doc.align_with_doc_structure()
        # logical order is align with tokens, then align with trees,
        # but aligning with trees first for the PTB lets us get proper
        # sentence segmentation
        doc = doc.align_with_trees()
        doc = doc.align_with_tokens()
        # dummy, fallback tokenization if there is no PTB gold or silver
        doc = doc.align_with_raw_words()
        return doc

    # generate DocumentPluses
    # TODO remove sorted() once educe.corpus.Reader is able
    # to iterate over a stable (sorted) list of FileIds
    docs = [open_plus(doc) for doc in sorted(rst_corpus)]
    # instance generator
    instance_generator = lambda doc: doc.all_edu_pairs()

    # extract vectorized samples
    if args.vocabulary is not None:
        vocab = load_vocabulary(args.vocabulary)
        vzer = DocumentCountVectorizer(instance_generator,
                                       feature_set,
                                       vocabulary=vocab)
        X_gen = vzer.transform(docs)
    else:
        vzer = DocumentCountVectorizer(instance_generator,
                                       feature_set,
                                       min_df=5)
        X_gen = vzer.fit_transform(docs)

    # extract class label for each instance
    if live:
        y_gen = itertools.repeat(0)
        # FIXME labtor is never bound in live mode, yet labtor.labelset_
        # is used in the dump_all call below
    elif args.labels is not None:
        labelset = load_labels(args.labels)
        labtor = DocumentLabelExtractor(instance_generator,
                                        labelset=labelset)
        labtor.fit(docs)
        y_gen = labtor.transform(docs)
    else:
        labtor = DocumentLabelExtractor(instance_generator)
        # y_gen = labtor.fit_transform(rst_corpus)
        # fit then transform lets us get classes_ for the dump
        labtor.fit(docs)
        y_gen = labtor.transform(docs)

    # dump instances to files
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # data file
    of_ext = '.sparse'
    if live:
        out_file = os.path.join(args.output, 'extracted-features' + of_ext)
    else:
        of_bn = os.path.join(args.output, os.path.basename(args.corpus))
        out_file = '{}.relations{}'.format(of_bn, of_ext)
    # dump
    dump_all(X_gen, y_gen, out_file, labtor.labelset_, docs,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
def main(args):
    "main for feature extraction mode"
    # retrieve parameters
    feature_set = args.feature_set
    live = args.parsing

    # NEW lecsie features
    lecsie_data_dir = args.lecsie_data_dir

    # RST data
    # fileX docs are currently not supported by CoreNLP
    exclude_file_docs = args.corenlp_out_dir

    rst_reader = RstDtParser(args.corpus, args,
                             coarse_rels=args.coarse,
                             fix_pseudo_rels=args.fix_pseudo_rels,
                             exclude_file_docs=exclude_file_docs)
    rst_corpus = rst_reader.corpus
    # TODO: change educe.corpus.Reader.slurp*() so that they return an object
    # which contains a *list* of FileIds and a *list* of annotations
    # (see sklearn's Bunch)
    # on creation of these lists, one can impose the list of names to be
    # sorted so that the order in which docs are iterated is guaranteed
    # to be always the same

    # syntactic preprocessing
    if args.corenlp_out_dir:
        # get the precise path to CoreNLP parses for the corpus currently used
        # the folder layout of CoreNLP's output currently follows that of the
        # corpus: RSTtrees-main-1.0/{TRAINING,TEST}, RSTtrees-double-1.0
        # FIXME clean rewrite ; this could mean better modelling of the corpus
        # subparts/versions, e.g. RST corpus have "version: 1.0", annotators
        # "main" or "double"

        # find the suffix of the path name that starts with RSTtrees-*
        # FIXME find a cleaner way to do this ;
        # should probably use pathlib, included in the standard lib
        # for python >= 3.4
        try:
            rel_idx = (args.corpus).index('RSTtrees-WSJ-')
        except ValueError:
            # if no part of the path starts with "RSTtrees", keep the
            # entire path (no idea whether this is good)
            relative_corpus_path = args.corpus
        else:
            relative_corpus_path = args.corpus[rel_idx:]

        corenlp_out_dir = os.path.join(args.corenlp_out_dir,
                                       relative_corpus_path)
        csyn_parser = CoreNlpParser(corenlp_out_dir)
    else:
        # TODO improve switch between gold and predicted syntax
        # PTB data
        csyn_parser = PtbParser(args.ptb)
    # FIXME
    print('offline syntactic preprocessing: ready')

    # align EDUs with sentences, tokens and trees from PTB
    def open_plus(doc):
        """Open and fully load a document

        doc is an educe.corpus.FileId
        """
        # create a DocumentPlus
        doc = rst_reader.decode(doc)
        # populate it with layers of info
        # tokens
        doc = csyn_parser.tokenize(doc)
        # syn parses
        doc = csyn_parser.parse(doc)
        # disc segments
        doc = rst_reader.segment(doc)
        # disc parse
        doc = rst_reader.parse(doc)
        # pre-compute the relevant info for each EDU
        doc = doc.align_with_doc_structure()
        # logical order is align with tokens, then align with trees,
        # but aligning with trees first for the PTB lets us get proper
        # sentence segmentation
        doc = doc.align_with_trees()
        doc = doc.align_with_tokens()
        # dummy, fallback tokenization if there is no PTB gold or silver
        doc = doc.align_with_raw_words()
        return doc

    # generate DocumentPluses
    # TODO remove sorted() once educe.corpus.Reader is able
    # to iterate over a stable (sorted) list of FileIds
    docs = [open_plus(doc) for doc in sorted(rst_corpus)]
    # instance generator
    instance_generator = lambda doc: doc.all_edu_pairs()
    split_feat_space = 'dir_sent'

    # extract vectorized samples
    if args.vocabulary is not None:
        vocab = load_vocabulary(args.vocabulary)
        vzer = DocumentCountVectorizer(instance_generator,
                                       feature_set,
                                       lecsie_data_dir=lecsie_data_dir,
                                       vocabulary=vocab,
                                       split_feat_space=split_feat_space)
        X_gen = vzer.transform(docs)
    else:
        vzer = DocumentCountVectorizer(instance_generator,
                                       feature_set,
                                       lecsie_data_dir=lecsie_data_dir,
                                       min_df=5,
                                       split_feat_space=split_feat_space)
        X_gen = vzer.fit_transform(docs)

    # extract class label for each instance
    if live:
        y_gen = itertools.repeat(0)
    elif args.labels is not None:
        labelset = load_labels(args.labels)
        labtor = DocumentLabelExtractor(instance_generator,
                                        labelset=labelset)
        labtor.fit(docs)
        y_gen = labtor.transform(docs)
    else:
        labtor = DocumentLabelExtractor(instance_generator)
        # y_gen = labtor.fit_transform(rst_corpus)
        # fit then transform lets us get classes_ for the dump
        labtor.fit(docs)
        y_gen = labtor.transform(docs)

    # dump instances to files
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # data file
    of_ext = '.sparse'
    if live:
        out_file = os.path.join(args.output, 'extracted-features' + of_ext)
    else:
        of_bn = os.path.join(args.output, os.path.basename(args.corpus))
        out_file = '{}.relations{}'.format(of_bn, of_ext)
    # dump EDUs and features in svmlight format
    dump_all(X_gen, y_gen, out_file, labtor.labelset_, docs,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
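# Hedged sketch of a possible downstream consumer (an assumption; the real
# training code is not part of this module): since dump_all writes the
# instances in svmlight format, the dumped .relations.sparse file can be fed
# to any scikit-learn classifier. The helper name and the choice of
# LogisticRegression are illustrative only; the .vocab and EDU companion
# files use project-specific formats and are left aside here.
def _train_relation_baseline(relations_file):
    """Train a throwaway baseline on a dumped .relations.sparse file."""
    from sklearn.datasets import load_svmlight_file
    from sklearn.linear_model import LogisticRegression
    X, y = load_svmlight_file(relations_file)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, y)
    print('training accuracy: {:.3f}'.format(clf.score(X, y)))
    return clf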