def get_texts(doi_files, solr_url, fields, text_dir, hash_tags=[], resume=False,
              max_n=None):
    Path(text_dir).makedirs_p()
    n = 0
    # Use a session to deal with the following exception when running many
    # instances in parallel:
    #
    # NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7efc8e600668>:
    # Failed to establish a new connection: [Errno 99] Cannot assign requested address',)
    #
    # See http://stackoverflow.com/questions/30943866/requests-cannot-assign-requested-address-out-of-ports
    with requests.Session() as session:
        for doi_fname in file_list(doi_files):
            log.info('getting text sources from {!r}'.format(doi_fname))
            for line in open(doi_fname):
                try:
                    core, doi = line.split()
                except ValueError:
                    log.error('ill-formed line in file {!r}:\n'
                              '{}'.format(doi_fname, line))
                    continue
                try:
                    get_text(session, core, doi, fields, hash_tags, resume,
                             solr_url, text_dir)
                except DOIError:
                    continue
                n += 1
                if n == max_n:
                    # return rather than break, so the limit also stops
                    # iteration over any remaining DOI files
                    log.info('reached max_n={}'.format(n))
                    return
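
# Example usage (a sketch only; the Solr URL, field names and paths below are
# placeholders, not values taken from this project):
#
#     get_texts('megamouth_in/*.tsv',
#               solr_url='http://localhost:8983/solr',
#               fields=['title', 'abstract', 'body'],
#               text_dir='text',
#               max_n=100)
#
# Each DOI file is expected to hold one "<core>\t<doi>" pair per line, as
# produced by get_solr_sources() below.
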
def get_solr_sources(xml_files, in_dir):
    """
    Convert output of the IR step (article sources in XML format) to Megamouth
    input files (tab-separated text files where each line contains the name of
    a Solr core and the DOI of a source article).
    """
    in_dir = Path(in_dir)
    in_dir.makedirs_p()
    for xml_file in file_list(xml_files):
        name = Path(xml_file).name
        # arbitrary mapping from filenames to Solr cores
        if name.startswith('elsevier'):
            core = 'oc-elsevier-art'
        elif name.startswith('macmillan'):
            core = 'oc-macmillan-art'
        elif name.startswith('wiley'):
            core = 'oc-wiley-art'
        elif name.startswith('springer'):
            core = 'oc-springer-art'
        else:
            raise ValueError('undefined core for file ' + xml_file)
        tsv_file = in_dir + '/' + name + '.tsv'
        # The XML is ill-formed (incomplete entities etc.),
        # so do not use an XML parser.
        with open(xml_file) as inf, open(tsv_file, 'w') as outf:
            log.info('creating ' + tsv_file)
            for line in inf:
                if line.lstrip().startswith('<doi>'):
                    doi = '/'.join(line.split('<')[-2].split('/')[-2:])
                    print(core, doi, sep='\t', file=outf)
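
# The one-liner above keeps only the last two path segments of the <doi>
# element, i.e. the bare DOI. For example (illustrative line, not taken from a
# real source file):
#
#     >>> line = '  <doi>http://dx.doi.org/10.1021/acs.jpcb.5b00443</doi>\n'
#     >>> '/'.join(line.split('<')[-2].split('/')[-2:])
#     '10.1021/acs.jpcb.5b00443'
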
def extract_parse_trees(scnlp_files, parse_dir):
    """
    Extract parse trees (PTB labeled bracket structures) from Stanford
    CoreNLP XML output.
    """
    make_dir(parse_dir)
    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        nlp_doc = ElementTree(file=scnlp_fname)
        parse_fname = derive_path(scnlp_fname, new_dir=parse_dir,
                                  new_ext='.parse')
        log.info("writing " + parse_fname)
        with open(parse_fname, "wt", encoding="utf-8") as parse_file:
            for parse_elem in nlp_doc.findall(".//parse"):
                parse_file.write(parse_elem.text + "\n")
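
# For reference, CoreNLP's XML output contains one <parse> element per
# sentence, roughly like this (abridged, illustrative fragment):
#
#     <sentence id="1">
#       <tokens>...</tokens>
#       <parse>(ROOT (S (NP (DT The) (NN cat)) (VP (VBZ sits))))</parse>
#     </sentence>
#
# so each line written to the .parse file holds one PTB bracket structure.
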
def extract_lemmatized_parse_trees(scnlp_files, parse_dir):
    """
    Extract lemmatized parse trees (PTB labeled bracket structures) from
    Stanford CoreNLP XML output.
    """
    make_dir(parse_dir)
    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        nlp_doc = ElementTree(file=scnlp_fname)
        parse_fname = derive_path(scnlp_fname, new_dir=parse_dir,
                                  new_ext='.parse')
        log.info("writing " + parse_fname)
        with open(parse_fname, "wt", encoding="utf-8") as parse_file:
            for sentence_elem in nlp_doc.iterfind(".//sentence"):
                lemmas = sentence_elem.iterfind("tokens/token/lemma")
                word_parse = sentence_elem.find("parse").text.strip()
                lemma_parse = " ".join(_lemmatized_node(node, lemmas)
                                       for node in word_parse.split(" "))
                parse_file.write(lemma_parse + "\n")
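
# _lemmatized_node() is not defined in this section. The sketch below shows one
# way such a helper could work, assuming that terminal words are the
# whitespace-separated tokens of the bracket string that do not start with '('
# and that `lemmas` yields <lemma> elements in token order.

def _lemmatized_node(node, lemmas):
    """Substitute the lemma for the word in a terminal PTB token (sketch)."""
    if node.startswith('('):
        # non-terminal label such as '(NP': return unchanged
        return node
    # terminal token such as 'cats))': split off the closing brackets,
    # swap in the lemma of the corresponding token and re-attach them
    word = node.rstrip(')')
    closing = node[len(word):]
    return next(lemmas).text + closing
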
def core_nlp(input, out_dir=OUT_DIR, annotators=ANNOTATORS,
             class_path=CLASS_PATH, version=VERSION, memory=MEMORY,
             threads=THREADS, replace_ext=REPLACE_EXT, output_ext=OUTPUT_EXT,
             options=OPTIONS, stamp=STAMP, resume=RESUME,
             use_sr_parser=USE_SR_PARSER):
    """
    Run Stanford CoreNLP

    Parameters
    ----------
    input : input filenames (anything accepted by file_list)
    out_dir : output directory for CoreNLP results
    annotators : comma-separated list of CoreNLP annotators
    class_path : Java class path containing the CoreNLP jars
    version : CoreNLP version string, used when stamping output filenames
    memory : maximum Java heap size (e.g. '3g')
    threads : number of threads to run in parallel
    replace_ext : replace the input file extension instead of appending
    output_ext : extension for the output files
    options : additional command line options passed verbatim to CoreNLP
    stamp : stamp output filenames with the CoreNLP version
    resume : skip input files for which output already exists
    use_sr_parser : use the shift-reduce parser model

    Returns
    -------
    Output of the CoreNLP process as a string.
    """
    in_files = file_list(input)
    make_dir(out_dir)
    cmd = ['java']
    if memory:
        cmd.append('-Xmx' + memory)
    if class_path:
        class_path = '"{}"'.format(join(class_path or '.', "*"))
        cmd.append('-cp ' + class_path)
    cmd.append('edu.stanford.nlp.pipeline.StanfordCoreNLP')
    if annotators:
        cmd.append('-annotators ' + annotators)
    if stamp:
        replace_ext = True
        output_ext = '#scnlp_v{}{}'.format(version or '', output_ext)
    if replace_ext:
        cmd.append('-replaceExtension')
    if output_ext:
        cmd.append('-outputExtension "{}"'.format(output_ext))
    if out_dir:
        cmd.append('-outputDirectory ' + out_dir)
    if threads:
        cmd.append('-threads {}'.format(threads))
    if resume:
        in_files = [fname for fname in in_files
                    if not derive_path(fname, new_dir=out_dir,
                                       new_ext=output_ext).exists()]
    if options:
        cmd.append(options)
    if 'parse' in annotators and use_sr_parser:
        cmd.append(
            '-parse.model edu/stanford/nlp/models/srparser/englishSR.ser.gz')
    # create a temporary file with input filenames
    tmp_file = NamedTemporaryFile("wt", buffering=1)
    tmp_file.write('\n'.join(in_files) + "\n")
    cmd.append('-filelist ' + tmp_file.name)
    cmd = ' '.join(cmd)
    log.info('\n' + cmd)
    ret = check_output(cmd, shell=True, universal_newlines=True)
    log.info('\n' + ret)
    return ret
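
# Example invocation (a sketch; paths and the annotator list are placeholders):
#
#     core_nlp('text/*.txt',
#              out_dir='scnlp',
#              annotators='tokenize,ssplit,pos,lemma,parse',
#              memory='4g',
#              threads=4)
#
# This builds and runs a shell command roughly of the form (exact flags depend
# on the defaults for stamp, replace_ext and output_ext):
#
#     java -Xmx4g -cp "<class_path>/*" edu.stanford.nlp.pipeline.StanfordCoreNLP \
#         -annotators tokenize,ssplit,pos,lemma,parse -outputDirectory scnlp \
#         -threads 4 -filelist <tempfile>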