def dump_dep_rstdt(corpus_dir, out_dir, nary_enc):
    """Convert and dump the RST-DT corpus as dependency trees."""
    # convert and dump RST trees from train
    dir_train = os.path.join(corpus_dir, TRAIN_FOLDER)
    if not os.path.isdir(dir_train):
        raise ValueError('No such folder: {}'.format(dir_train))
    reader_train = Reader(dir_train)
    trees_train = reader_train.slurp()
    dtrees_train = {doc_name: RstDepTree.from_rst_tree(rst_tree,
                                                       nary_enc=nary_enc)
                    for doc_name, rst_tree in trees_train.items()}
    dump_disdep_files(dtrees_train.values(),
                      os.path.join(out_dir, os.path.basename(dir_train)))

    # convert and dump RST trees from test
    dir_test = os.path.join(corpus_dir, TEST_FOLDER)
    if not os.path.isdir(dir_test):
        raise ValueError('No such folder: {}'.format(dir_test))
    reader_test = Reader(dir_test)
    trees_test = reader_test.slurp()
    dtrees_test = {doc_name: RstDepTree.from_rst_tree(rst_tree,
                                                      nary_enc=nary_enc)
                   for doc_name, rst_tree in trees_test.items()}
    dump_disdep_files(dtrees_test.values(),
                      os.path.join(out_dir, os.path.basename(dir_test)))
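
# A minimal, hypothetical invocation of dump_dep_rstdt(); both paths below
# are placeholders, and 'chain' is assumed to be a supported value of
# nary_enc (naming the chain encoding of n-ary relations).
if __name__ == '__main__':
    dump_dep_rstdt(corpus_dir='data/rst_discourse_treebank/data',  # placeholder
                   out_dir='TMP/rstdt_disdep',                     # placeholder
                   nary_enc='chain')                               # assumed value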
# properly recast strip_accents if None
strip_accents = (args.strip_accents if args.strip_accents != 'None'
                 else None)
lowercase = args.lowercase
stop_words = (args.stop_words if args.stop_words != 'None'
              else None)
outfile = args.outfile
n_jobs = args.n_jobs
verbose = args.verbose
sel_pairs = args.pairs
distance_range = (args.scale if args.scale != 'None'
                  else None)

# * read the corpus
rst_corpus_dir = RST_CORPUS['double']
rst_reader = Reader(rst_corpus_dir)
rst_corpus = rst_reader.slurp(verbose=True)
corpus_texts = [v.text() for k, v in sorted(rst_corpus.items())]

# MOVE ~ WMD.__init__()
# load word embeddings
vocab_dict, W = load_embedding("embed")
# end MOVE

# MOVE ~ WMD.fit(corpus_texts?)
# fit CountVectorizer to the vocabulary of the corpus
vect = CountVectorizer(strip_accents=strip_accents,
                       lowercase=lowercase,
                       stop_words=stop_words).fit(corpus_texts)
# compute the vocabulary common to the embeddings and corpus, restrict
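
# A minimal sketch of the step announced in the comment just above, assuming
# vocab_dict maps each embedding word to its row index in the matrix W (an
# assumption about load_embedding(), whose definition is not shown here):
common_vocab = sorted(w for w in vect.vocabulary_ if w in vocab_dict)
W_common = W[[vocab_dict[w] for w in common_vocab]]
# a vectorizer restricted to the common vocabulary keeps document-term
# columns aligned with the rows of W_common
vect = CountVectorizer(strip_accents=strip_accents,
                       lowercase=lowercase,
                       stop_words=stop_words,
                       vocabulary=common_vocab)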
if not os.path.exists(PTB_DIR):
    raise ValueError("Unable to find PTB dir {}".format(PTB_DIR))
if not os.path.exists(RST_DIR):
    raise ValueError("Unable to find RST dir {}".format(RST_DIR))
if not os.path.exists(CORENLP_OUT_DIR):
    raise ValueError(
        "Unable to find parsed dir {}".format(CORENLP_OUT_DIR))

corpus = 'RSTtrees-WSJ-main-1.0/TRAINING'
corpus_dir = os.path.join(RST_DIR, corpus)

# syntactic parsers to compare
ptb_reader = BracketParseCorpusReader(PTB_DIR,
                                      r'../wsj_.*\.mrg',
                                      encoding='ascii')

# read the RST corpus
rst_reader = Reader(corpus_dir)
rst_corpus = rst_reader.slurp()

# for each file, compare tokenizations between PTB and CoreNLP
for key, rst_tree in sorted(rst_corpus.items()):
    doc_name = key.doc.split('.', 1)[0]
    if doc_name.startswith('wsj_'):
        print(doc_name)
        doc_wsj_num = doc_name.split('_')[1]
        section = doc_wsj_num[:2]
        # corenlp stuff
        core_fname = os.path.join(CORENLP_OUT_DIR, corpus,
                                  doc_name + '.out.xml')
        core_reader = PreprocessingSource()
        core_reader.read(core_fname, suffix='')
        corenlp_doc = read_corenlp_result(None, core_reader)
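        # A possible continuation (an assumption, not part of this excerpt):
        # fetch the PTB side of the same document and compare the raw token
        # strings from the two tokenizations.
        ptb_words = list(ptb_reader.words('{}/{}.mrg'.format(section,
                                                             doc_name)))
        # the .tokens / .word attribute names on the CoreNLP document object
        # are assumed here
        core_words = [token.word for token in corenlp_doc.tokens]
        if ptb_words != core_words:
            print('tokenization mismatch in {}'.format(doc_name))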