def dump_dep_rstdt(corpus_dir, out_dir, nary_enc):
    """Convert and dump the RST-DT corpus as dependency trees."""
    # convert and dump RST trees from train
    dir_train = os.path.join(corpus_dir, TRAIN_FOLDER)
    if not os.path.isdir(dir_train):
        raise ValueError('No such folder: {}'.format(dir_train))
    reader_train = Reader(dir_train)
    trees_train = reader_train.slurp()
    dtrees_train = {doc_name: RstDepTree.from_rst_tree(rst_tree,
                                                       nary_enc=nary_enc)
                    for doc_name, rst_tree in trees_train.items()}
    dump_disdep_files(dtrees_train.values(),
                      os.path.join(out_dir, os.path.basename(dir_train)))

    # convert and dump RST trees from test
    dir_test = os.path.join(corpus_dir, TEST_FOLDER)
    if not os.path.isdir(dir_test):
        raise ValueError('No such folder: {}'.format(dir_test))
    reader_test = Reader(dir_test)
    trees_test = reader_test.slurp()
    dtrees_test = {doc_name: RstDepTree.from_rst_tree(rst_tree,
                                                      nary_enc=nary_enc)
                   for doc_name, rst_tree in trees_test.items()}
    dump_disdep_files(dtrees_test.values(),
                      os.path.join(out_dir, os.path.basename(dir_test)))
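
# A minimal, hypothetical invocation of dump_dep_rstdt(); both paths below
# are placeholders, and 'chain' is assumed to be a supported value of
# nary_enc (naming the chain encoding of n-ary relations).
if __name__ == '__main__':
    dump_dep_rstdt(corpus_dir='data/rst_discourse_treebank/data',  # placeholder
                   out_dir='TMP/rstdt_disdep',                     # placeholder
                   nary_enc='chain')                               # assumed value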
# properly recast strip_accents if None
strip_accents = (args.strip_accents if args.strip_accents != 'None'
                 else None)
lowercase = args.lowercase
stop_words = (args.stop_words if args.stop_words != 'None'
              else None)
outfile = args.outfile
n_jobs = args.n_jobs
verbose = args.verbose
sel_pairs = args.pairs
distance_range = (args.scale if args.scale != 'None'
                  else None)

# * read the corpus
rst_corpus_dir = RST_CORPUS['double']
rst_reader = Reader(rst_corpus_dir)
rst_corpus = rst_reader.slurp(verbose=True)
corpus_texts = [v.text() for k, v in sorted(rst_corpus.items())]

# MOVE ~ WMD.__init__()
# load word embeddings
vocab_dict, W = load_embedding("embed")
# end MOVE

# MOVE ~ WMD.fit(corpus_texts?)
# fit CountVectorizer to the vocabulary of the corpus
vect = CountVectorizer(strip_accents=strip_accents,
                       lowercase=lowercase,
                       stop_words=stop_words).fit(corpus_texts)
# compute the vocabulary common to the embeddings and corpus, restrict
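
# A minimal sketch of the step announced in the comment just above, assuming
# vocab_dict maps each embedding word to its row index in the matrix W (an
# assumption about load_embedding(), whose definition is not shown here):
common_vocab = sorted(w for w in vect.vocabulary_ if w in vocab_dict)
W_common = W[[vocab_dict[w] for w in common_vocab]]
# a vectorizer restricted to the common vocabulary keeps document-term
# columns aligned with the rows of W_common
vect = CountVectorizer(strip_accents=strip_accents,
                       lowercase=lowercase,
                       stop_words=stop_words,
                       vocabulary=common_vocab)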
if not os.path.exists(PTB_DIR):
    raise ValueError("Unable to find PTB dir {}".format(PTB_DIR))
if not os.path.exists(RST_DIR):
    raise ValueError("Unable to find RST dir {}".format(RST_DIR))
if not os.path.exists(CORENLP_OUT_DIR):
    raise ValueError(
        "Unable to find parsed dir {}".format(CORENLP_OUT_DIR))

corpus = 'RSTtrees-WSJ-main-1.0/TRAINING'
corpus_dir = os.path.join(RST_DIR, corpus)

# syntactic parsers to compare
ptb_reader = BracketParseCorpusReader(PTB_DIR,
                                      r'../wsj_.*\.mrg',
                                      encoding='ascii')

# read the RST corpus
rst_reader = Reader(corpus_dir)
rst_corpus = rst_reader.slurp()

# for each file, compare tokenizations between PTB and CoreNLP
for key, rst_tree in sorted(rst_corpus.items()):
    doc_name = key.doc.split('.', 1)[0]
    if doc_name.startswith('wsj_'):
        print(doc_name)
        doc_wsj_num = doc_name.split('_')[1]
        section = doc_wsj_num[:2]
        # corenlp stuff
        core_fname = os.path.join(CORENLP_OUT_DIR, corpus,
                                  doc_name + '.out.xml')
        core_reader = PreprocessingSource()
        core_reader.read(core_fname, suffix='')
        corenlp_doc = read_corenlp_result(None, core_reader)
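        # A possible continuation (an assumption, not part of this excerpt):
        # fetch the PTB side of the same document and compare the raw token
        # strings from the two tokenizations.
        ptb_words = list(ptb_reader.words('{}/{}.mrg'.format(section,
                                                             doc_name)))
        # the .tokens / .word attribute names on the CoreNLP document object
        # are assumed here
        core_words = [token.word for token in corenlp_doc.tokens]
        if ptb_words != core_words:
            print('tokenization mismatch in {}'.format(doc_name))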