Example #1
def dump_dep_rstdt(corpus_dir, out_dir, nary_enc):
    """Convert and dump the RST-DT corpus as dependency trees."""
    # convert and dump RST trees from train
    dir_train = os.path.join(corpus_dir, TRAIN_FOLDER)
    if not os.path.isdir(dir_train):
        raise ValueError('No such folder: {}'.format(dir_train))
    reader_train = Reader(dir_train)
    trees_train = reader_train.slurp()
    dtrees_train = {doc_name: RstDepTree.from_rst_tree(rst_tree,
                                                       nary_enc=nary_enc)
                    for doc_name, rst_tree in trees_train.items()}
    dump_disdep_files(dtrees_train.values(),
                      os.path.join(out_dir, os.path.basename(dir_train)))

    # convert and dump RST trees from test
    dir_test = os.path.join(corpus_dir, TEST_FOLDER)
    if not os.path.isdir(dir_test):
        raise ValueError('No such folder: {}'.format(dir_test))
    reader_test = Reader(dir_test)
    trees_test = reader_test.slurp()
    dtrees_test = {doc_name: RstDepTree.from_rst_tree(rst_tree,
                                                      nary_enc=nary_enc)
                   for doc_name, rst_tree in trees_test.items()}
    dump_disdep_files(dtrees_test.values(),
                      os.path.join(out_dir, os.path.basename(dir_test)))
Example #2
    # properly recast strip_accents if None
    strip_accents = (args.strip_accents if args.strip_accents != 'None'
                     else None)
    lowercase = args.lowercase
    stop_words = (args.stop_words if args.stop_words != 'None'
                  else None)
    outfile = args.outfile
    n_jobs = args.n_jobs
    verbose = args.verbose
    sel_pairs = args.pairs
    distance_range = (args.scale if args.scale != 'None'
                      else None)

    # * read the corpus
    rst_corpus_dir = RST_CORPUS['double']
    rst_reader = Reader(rst_corpus_dir)
    rst_corpus = rst_reader.slurp(verbose=True)
    corpus_texts = [v.text() for k, v in sorted(rst_corpus.items())]

    # MOVE ~ WMD.__init__()
    # load word embeddings
    vocab_dict, W = load_embedding("embed")
    # end MOVE

    # MOVE ~ WMD.fit(corpus_texts?)
    # fit CountVectorizer to the vocabulary of the corpus
    vect = CountVectorizer(
        strip_accents=strip_accents, lowercase=lowercase,
        stop_words=stop_words
    ).fit(corpus_texts)
    # compute the vocabulary common to the embeddings and corpus, restrict
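
The MOVE comments in Example #2 mark code destined for a word mover's distance (WMD) helper class: embedding loading would move to its constructor and vectorizer fitting to a fit() method. A minimal sketch of that refactoring, assuming load_embedding is available from the same module as above; the WMD class itself is hypothetical, not part of the example:

from sklearn.feature_extraction.text import CountVectorizer

class WMD(object):
    """Hypothetical helper sketched from the MOVE comments above."""

    def __init__(self, embed_name="embed", strip_accents=None,
                 lowercase=True, stop_words=None):
        # load word embeddings once, at construction time
        # (load_embedding assumed available, as in the example)
        self.vocab_dict, self.W = load_embedding(embed_name)
        self._vect_params = dict(strip_accents=strip_accents,
                                 lowercase=lowercase,
                                 stop_words=stop_words)
        self.vect = None

    def fit(self, corpus_texts):
        # fit a CountVectorizer to the vocabulary of the corpus
        self.vect = CountVectorizer(**self._vect_params).fit(corpus_texts)
        return self

Under this sketch the body of the example would reduce to WMD(...).fit(corpus_texts).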
Example #3
    if not os.path.exists(PTB_DIR):
        raise ValueError("Unable to find PTB dir {}".format(PTB_DIR))
    if not os.path.exists(RST_DIR):
        raise ValueError("Unable to find RST dir {}".format(RST_DIR))
    if not os.path.exists(CORENLP_OUT_DIR):
        raise ValueError(
            "Unable to find parsed dir {}".format(CORENLP_OUT_DIR))

    corpus = 'RSTtrees-WSJ-main-1.0/TRAINING'
    corpus_dir = os.path.join(RST_DIR, corpus)
    # PTB reader, providing the reference tokenization to compare against CoreNLP
    ptb_reader = BracketParseCorpusReader(PTB_DIR,
                                          r'../wsj_.*\.mrg',
                                          encoding='ascii')
    # read the RST corpus
    rst_reader = Reader(corpus_dir)
    rst_corpus = rst_reader.slurp()
    # for each file, compare tokenizations between PTB and CoreNLP
    for key, rst_tree in sorted(rst_corpus.items()):
        doc_name = key.doc.split('.', 1)[0]
        if doc_name.startswith('wsj_'):
            print(doc_name)
            doc_wsj_num = doc_name.split('_')[1]
            section = doc_wsj_num[:2]

            # read the CoreNLP output for this document
            core_fname = os.path.join(CORENLP_OUT_DIR, corpus,
                                      doc_name + '.out.xml')
            core_reader = PreprocessingSource()
            core_reader.read(core_fname, suffix='')
            corenlp_doc = read_corenlp_result(None, core_reader)
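
Example #3 stops before the comparison announced in the loop comment. A self-contained sketch of the kind of check it describes, comparing two token sequences and reporting where they diverge; the function name and arguments are hypothetical, not part of the original code:

def compare_tokenizations(ptb_tokens, core_tokens):
    """Report whether two tokenizations of the same document agree.

    Sketch only: both arguments are expected to be sequences of strings.
    """
    ptb_tokens = list(ptb_tokens)
    core_tokens = list(core_tokens)
    if ptb_tokens == core_tokens:
        return True
    # locate the first position where the two sequences diverge
    for i, (ptb_tok, core_tok) in enumerate(zip(ptb_tokens, core_tokens)):
        if ptb_tok != core_tok:
            print('first mismatch at token {}: {!r} vs {!r}'.format(
                i, ptb_tok, core_tok))
            break
    else:
        # one tokenization is a strict prefix of the other
        print('length mismatch: {} PTB vs {} CoreNLP tokens'.format(
            len(ptb_tokens), len(core_tokens)))
    return False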