Example #1
def read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(),
                               mk_is_interesting(args, args.single))
    corpus = reader.slurp(anno_files, verbose=True)

    if not args.ignore_cdus:
        strip_cdus(corpus, mode=args.strip_mode)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)

    for lex in LEXICONS:
        lex.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {}  # _read_inquirer_lexicon(args)

    verbnet_entries = [
        VerbNetEntry(x, frozenset(vnet.lemmas(x))) for x in VERBNET_CLASSES
    ]

    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)
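
This helper is normally driven by a command-line args object. Below is a minimal hypothetical sketch of calling it directly, assuming the module-level names used above (mk_is_interesting, LEXICONS, VERBNET_CLASSES, etc.) are already in scope; the paths and the strip mode are placeholders, not values taken from the original script.

import argparse

# Hypothetical driver: attribute names mirror what read_corpus_inputs reads.
args = argparse.Namespace(
    corpus="data/stac-corpus",    # placeholder path to the STAC corpus
    single=False,                 # forwarded to mk_is_interesting
    ignore_cdus=False,            # CDUs are stripped unless this is set
    strip_mode="head",            # assumed value handed to strip_cdus
    resources="data/resources",   # placeholder directory of lexical resources
)
inputs = read_corpus_inputs(args)
print("%d documents loaded" % len(inputs.corpus))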
Example #2
def _main_enclosure_graph(args):
    """
    Draw graphs showing which annotations' spans include the others
    """
    corpus = _read_corpus(args)
    output_dir = get_output_dir(args)
    keys = corpus
    if args.tokens:
        postags = postag.read_tags(corpus, args.corpus)
    else:
        postags = None

    for k in sorted(keys):
        if postags:
            gra_ = stacgraph.EnclosureGraph(corpus[k], postags[k])
        else:
            gra_ = stacgraph.EnclosureGraph(corpus[k])
        if args.reduce:
            gra_.reduce()
        dot_gra = stacgraph.EnclosureDotGraph(gra_)
        if dot_gra.get_nodes():
            dot_gra.set("ratio", "compress")
            write_dot_graph(k, output_dir, dot_gra,
                            run_graphviz=args.draw)
        else:
            print("Skipping %s (empty graph)" % k, file=sys.stderr)
Example #3
def read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(),
                               mk_is_interesting(args, args.single))
    corpus = reader.slurp(anno_files, verbose=True)

    if not args.ignore_cdus:
        strip_cdus(corpus)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)

    for lex in LEXICONS:
        lex.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {}  # _read_inquirer_lexicon(args)

    verbnet_entries = [VerbNetEntry(x, frozenset(vnet.lemmas(x)))
                       for x in VERBNET_CLASSES]

    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)
Example #4
def _read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    is_interesting = mk_is_interesting(args, preselected={"stage": ["units"]})
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(), is_interesting)
    corpus = reader.slurp(anno_files, verbose=True)

    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    LEXICON.read(args.resources)
    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=[LEXICON],
                        pdtb_lex=None,
                        verbnet_entries=None,
                        inquirer_lex=None)
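
Compared with the fuller variants above, this version preselects only the "units" stage and loads a single shared LEXICON. A hypothetical caller might look like the sketch below; the paths are placeholders, and mk_is_interesting may consult further filtering options that are not shown here.

import argparse

# Hypothetical caller; attribute names mirror what _read_corpus_inputs dereferences.
args = argparse.Namespace(
    corpus="data/stac-corpus",    # placeholder corpus path
    resources="data/resources",   # placeholder directory holding LEXICON
)
feats = _read_corpus_inputs(args)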
Example #5
def _read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    is_interesting = mk_is_interesting(args,
                                       preselected={"stage": ["units"]})
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(), is_interesting)
    corpus = reader.slurp(anno_files, verbose=True)

    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    LEXICON.read(args.resources)
    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=[LEXICON],
                        pdtb_lex=None,
                        verbnet_entries=None,
                        inquirer_lex=None)