def read_corpus_inputs(args):
    """
    Load the selected part of the corpus plus the side resources used
    for feature extraction, bundled into a single FeatureInput.

    Attributes read from `args`: `corpus`, `single`, `ignore_cdus`,
    `strip_mode` and `resources`; `args` itself is also handed to
    `mk_is_interesting` and `read_pdtb_lexicon`.
    """
    stac_reader = educe.stac.Reader(args.corpus)
    all_files = stac_reader.files()
    wanted = mk_is_interesting(args, args.single)
    selected_files = stac_reader.filter(all_files, wanted)
    corpus = stac_reader.slurp(selected_files, verbose=True)

    # CDUs are stripped unless the ignore_cdus flag is set
    if not args.ignore_cdus:
        strip_cdus(corpus, mode=args.strip_mode)

    # POS tags and parser output are both looked up under args.corpus
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)

    for lexicon in LEXICONS:
        lexicon.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)

    # The inquirer lexicon is not loaded (the call to
    # _read_inquirer_lexicon(args) is disabled); an empty dict is
    # passed along in its place.
    inq_lex = {}

    verbnet_entries = [VerbNetEntry(vn_class,
                                    frozenset(vnet.lemmas(vn_class)))
                       for vn_class in VERBNET_CLASSES]

    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)
def read_corpus_inputs(args):
    """
    Gather everything feature extraction needs for the interesting
    subset of the corpus and return it as a FeatureInput.

    Attributes read from `args`: `corpus`, `single`, `ignore_cdus`
    and `resources`; `args` itself is also passed to
    `mk_is_interesting` and `read_pdtb_lexicon`.
    """
    corpus_reader = educe.stac.Reader(args.corpus)
    candidates = corpus_reader.files()
    keep = mk_is_interesting(args, args.single)
    corpus = corpus_reader.slurp(corpus_reader.filter(candidates, keep),
                                 verbose=True)

    # stripping of CDUs happens only when ignore_cdus is not set
    if not args.ignore_cdus:
        strip_cdus(corpus)

    # tagger and parser results are both located via args.corpus
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)

    for lexicon in LEXICONS:
        lexicon.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)

    # Reading of the inquirer lexicon is disabled
    # (_read_inquirer_lexicon(args) is not called); an empty dict
    # stands in for it.
    inq_lex = {}

    verbnet_entries = [
        VerbNetEntry(vn_class, frozenset(vnet.lemmas(vn_class)))
        for vn_class in VERBNET_CLASSES
    ]

    return FeatureInput(
        corpus=corpus,
        postags=postags,
        parses=parses,
        lexicons=LEXICONS,
        pdtb_lex=pdtb_lex,
        verbnet_entries=verbnet_entries,
        inquirer_lex=inq_lex,
    )
def _read_corpus_inputs(args):
    """
    Load the selected corpus files together with their POS tags,
    parses and the single module-level lexicon.

    The file filter is built with the preselection
    `{"stage": ["units"]}`.  The PDTB lexicon, VerbNet entries and
    inquirer lexicon slots of the returned FeatureInput are all set
    to None.

    Attributes read from `args`: `corpus` and `resources`; `args`
    itself is also passed to `mk_is_interesting`.
    """
    # the predicate is built before the reader, as before
    preselection = {"stage": ["units"]}
    wanted = mk_is_interesting(args, preselected=preselection)

    stac_reader = educe.stac.Reader(args.corpus)
    selected_files = stac_reader.filter(stac_reader.files(), wanted)
    corpus = stac_reader.slurp(selected_files, verbose=True)

    # tagger and parser results are both located via args.corpus
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)

    LEXICON.read(args.resources)

    return FeatureInput(
        corpus=corpus,
        postags=postags,
        parses=parses,
        lexicons=[LEXICON],
        pdtb_lex=None,
        verbnet_entries=None,
        inquirer_lex=None,
    )