def extract_parse_trees(scnlp_files, parse_dir):
    """
    Extract parse trees (PTB labeled bracket structures) from
    Stanford CoreNLP XML output.

    Reads every ``*.xml`` file matched by ``scnlp_files`` and writes the
    text of each ``<parse>`` element, one tree per line, to a ``.parse``
    file of the same base name under ``parse_dir``.
    """
    make_dir(parse_dir)
    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        doc = ElementTree(file=scnlp_fname)
        out_fname = derive_path(scnlp_fname, new_dir=parse_dir,
                                new_ext='.parse')
        log.info("writing " + out_fname)
        with open(out_fname, "wt", encoding="utf-8") as out:
            out.writelines(elem.text + "\n"
                           for elem in doc.findall(".//parse"))
def extract_lemmatized_parse_trees(scnlp_files, parse_dir):
    """
    Extract lemmatized parse trees (PTB labeled bracket structures) from
    Stanford CoreNLP XML output.

    For every sentence, the word-level parse string is rewritten by
    substituting each terminal with its lemma (via ``_lemmatized_node``),
    one tree per line, into a ``.parse`` file under ``parse_dir``.
    """
    make_dir(parse_dir)
    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        doc = ElementTree(file=scnlp_fname)
        out_fname = derive_path(scnlp_fname, new_dir=parse_dir,
                                new_ext='.parse')
        log.info("writing " + out_fname)
        with open(out_fname, "wt", encoding="utf-8") as out:
            for sent in doc.iterfind(".//sentence"):
                # iterator is consumed incrementally as terminals are hit
                lemma_iter = sent.iterfind("tokens/token/lemma")
                nodes = sent.find("parse").text.strip().split(" ")
                lemma_tree = " ".join(_lemmatized_node(node, lemma_iter)
                                      for node in nodes)
                out.write(lemma_tree + "\n")
def core_nlp(input, out_dir=OUT_DIR, annotators=ANNOTATORS,
             class_path=CLASS_PATH, version=VERSION, memory=MEMORY,
             threads=THREADS, replace_ext=REPLACE_EXT,
             output_ext=OUTPUT_EXT, options=OPTIONS, stamp=STAMP,
             resume=RESUME, use_sr_parser=USE_SR_PARSER):
    """
    Run Stanford CoreNLP as an external Java process over a batch of files.

    Parameters
    ----------
    input : filename(s) or pattern accepted by ``file_list``
    out_dir : directory for CoreNLP output (created if missing)
    annotators : comma-separated CoreNLP annotator list
    class_path : directory containing the CoreNLP jars
    version : CoreNLP version string, embedded in the stamped extension
    memory : JVM max heap (e.g. ``'3g'``), passed as ``-Xmx``
    threads : number of CoreNLP worker threads
    replace_ext : replace the input extension instead of appending
    output_ext : extension for output files
    options : extra command-line options, appended verbatim
    stamp : embed the CoreNLP version in the output extension
            (implies ``replace_ext``)
    resume : skip input files whose output file already exists
    use_sr_parser : use the shift-reduce parser model when parsing

    Returns
    -------
    str
        Captured stdout of the CoreNLP process.

    Raises
    ------
    subprocess.CalledProcessError
        If the Java process exits with a non-zero status.
    """
    in_files = file_list(input)
    make_dir(out_dir)

    cmd = ['java']
    if memory:
        cmd.append('-Xmx' + memory)
    if class_path:
        # pick up all jars in class_path; quoted so the shell does not
        # expand the '*' itself (Java expands it)
        cmd.append('-cp "{}"'.format(join(class_path, "*")))
    cmd.append('edu.stanford.nlp.pipeline.StanfordCoreNLP')
    if annotators:
        cmd.append('-annotators ' + annotators)
    if stamp:
        # a version-stamped extension only makes sense when the input
        # extension is replaced
        replace_ext = True
        output_ext = '#scnlp_v{}{}'.format(version or '', output_ext)
    if replace_ext:
        cmd.append('-replaceExtension')
    if output_ext:
        cmd.append('-outputExtension "{}"'.format(output_ext))
    if out_dir:
        cmd.append('-outputDirectory ' + out_dir)
    if threads:
        cmd.append('-threads {}'.format(threads))
    if resume:
        # keep only inputs whose output file does not exist yet
        in_files = [fname for fname in in_files
                    if not derive_path(fname, new_dir=out_dir,
                                       new_ext=output_ext).exists()]
    if options:
        cmd.append(options)
    if 'parse' in annotators and use_sr_parser:
        cmd.append(
            '-parse.model edu/stanford/nlp/models/srparser/englishSR.ser.gz')

    # Temporary file listing one input filename per line; it must stay
    # alive (and on disk) until the subprocess has finished reading it,
    # so the whole CoreNLP run happens inside the with-block.  The
    # original code never closed this file and relied on GC for cleanup.
    with NamedTemporaryFile("wt", buffering=1) as tmp_file:
        tmp_file.write('\n'.join(in_files) + "\n")
        tmp_file.flush()  # ensure the list is on disk before Java reads it
        cmd.append('-filelist ' + tmp_file.name)
        cmd = ' '.join(cmd)
        log.info('\n' + cmd)
        # NOTE(review): shell=True with string-interpolated filenames and
        # a pass-through `options` string is shell-injection-prone if any
        # argument is untrusted; kept because `options` is documented as a
        # raw shell-style string — confirm all inputs are trusted.
        ret = check_output(cmd, shell=True, universal_newlines=True)
    log.info('\n' + ret)
    return ret