Example #1
def get_texts(doi_files, solr_url, fields, text_dir, hash_tags=[],
              resume=False, max_n=None):
    """
    Fetch the text of the source articles listed in the DOI files from a Solr
    server and write them to text_dir, stopping after max_n articles.
    """
    Path(text_dir).makedirs_p()
    n = 0

    # Use session to deal with the following exception when running many instances in parallel:
    #
    # NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7efc8e600668>:
    # Failed to establish a new connection: [Errno 99] Cannot assign requested address',)
    #
    # See http://stackoverflow.com/questions/30943866/requests-cannot-assign-requested-address-out-of-ports
    with requests.Session() as session:
        for doi_fname in file_list(doi_files):
            log.info('getting text sources from {!r}'.format(doi_fname))
            for line in open(doi_fname):
                try:
                    core, doi = line.split()
                except ValueError:
                    log.error('ill-formed line in file {!r}:\n'
                              '{}'.format(doi_fname, line))
                    continue

                try:
                    get_text(session, core, doi, fields, hash_tags, resume, solr_url, text_dir)
                except DOIError:
                    continue

                n += 1
                if n == max_n:
                    log.info('reached max_n={}'.format(n))
                    # stop entirely, not just for the current DOI file
                    return
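A minimal usage sketch (the paths, Solr URL and field names below are hypothetical placeholders, and file_list is assumed to accept a glob pattern):

# hypothetical invocation: fetch at most 100 article texts
get_texts('input/dois/*.tsv',
          solr_url='http://localhost:8983/solr',
          fields='abstract,fulltext',
          text_dir='output/text',
          max_n=100)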
Example #2
def get_solr_sources(xml_files, in_dir):
    """
    Convert the output of the IR step (article sources in XML format)
    into Megamouth input files: tab-separated text files where each line
    contains the name of a Solr core and the DOI of a source article.
    """
    in_dir = Path(in_dir)
    in_dir.makedirs_p()

    for xml_file in file_list(xml_files):
        name = Path(xml_file).name
        # arbitrary mapping from filenames to Solr cores
        if name.startswith('elsevier'):
            core = 'oc-elsevier-art'
        elif name.startswith('macmillan'):
            core = 'oc-macmillan-art'
        elif name.startswith('wiley'):
            core = 'oc-wiley-art'
        elif name.startswith('springer'):
            core = 'oc-springer-art'
        else:
            raise ValueError('undefined core for file ' + xml_file)

        tsv_file = in_dir + '/' + Path(xml_file).name + '.tsv'

        # XML is ill-formed (incomplete entities etc.)
        # so do not use an XML parser
        with open(xml_file) as inf, open(tsv_file, 'w') as outf:
            log.info('creating ' + tsv_file)

            for line in inf:
                if line.lstrip().startswith('<doi>'):
                    doi = '/'.join(line.split('<')[-2].split('/')[-2:])
                    print(core, doi, sep='\t', file=outf)
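For illustration, the DOI is taken to be the last two slash-separated segments before the closing tag; with a made-up input line the extraction works like this:

line = '  <doi>http://dx.doi.org/10.1016/j.str.2011.03.009</doi>\n'
doi = '/'.join(line.split('<')[-2].split('/')[-2:])
# doi == '10.1016/j.str.2011.03.009'
# the line written to the .tsv file would then be:
# oc-elsevier-art<TAB>10.1016/j.str.2011.03.009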
Example #3
def extract_parse_trees(scnlp_files, parse_dir):
    """
    Extract parse trees (PTB labeled bracket structures) from Stanford
    CoreNLP XML output.
    """
    make_dir(parse_dir)

    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        nlp_doc = ElementTree(file=scnlp_fname)

        parse_fname = derive_path(scnlp_fname,
                                  new_dir=parse_dir,
                                  new_ext='.parse')
        log.info("writing " + parse_fname)

        with open(parse_fname, "wt", encoding="utf-8") as parse_file:
            for parse_elem in nlp_doc.findall(".//parse"):
                parse_file.write(parse_elem.text + "\n")
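The XPath query relies on the usual shape of CoreNLP's XML output, roughly as sketched below (abbreviated, with made-up values):

<root>
  <document>
    <sentences>
      <sentence id="1">
        <tokens>
          <token id="1"><word>Cells</word><lemma>cell</lemma>...</token>
        </tokens>
        <parse>(ROOT (S (NP (NNS Cells)) ...))</parse>
      </sentence>
    </sentences>
  </document>
</root>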
Example #4
def extract_lemmatized_parse_trees(scnlp_files, parse_dir):
    """
    Extract lemmatized parse trees (PTB labeled bracket structures) from
    Stanford CoreNLP XML output.
    """
    make_dir(parse_dir)

    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        nlp_doc = ElementTree(file=scnlp_fname)

        parse_fname = derive_path(scnlp_fname,
                                  new_dir=parse_dir,
                                  new_ext='.parse')
        log.info("writing " + parse_fname)

        with open(parse_fname, "wt", encoding="utf-8") as parse_file:
            for sentence_elem in nlp_doc.iterfind(".//sentence"):
                lemmas = sentence_elem.iterfind("tokens/token/lemma")
                word_parse = sentence_elem.find("parse").text.strip()
                lemma_parse = " ".join(_lemmatized_node(node, lemmas)
                                       for node in word_parse.split(" "))
                parse_file.write(lemma_parse + "\n")
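The helper _lemmatized_node is not shown in this example. A plausible reconstruction, assuming terminals in the bracketed string appear as a word followed by its closing brackets (e.g. 'Cells))') while non-terminal labels look like '(NP', could be:

def _lemmatized_node(node, lemmas):
    # hypothetical reconstruction, not the original helper
    if node.endswith(')'):
        # terminal node: replace the word with the next lemma, keep the brackets
        word = node.rstrip(')')
        return next(lemmas).text + node[len(word):]
    # non-terminal label such as '(NP': leave unchanged
    return node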
Example #5
def core_nlp(input,
             out_dir=OUT_DIR,
             annotators=ANNOTATORS,
             class_path=CLASS_PATH,
             version=VERSION,
             memory=MEMORY,
             threads=THREADS,
             replace_ext=REPLACE_EXT,
             output_ext=OUTPUT_EXT,
             options=OPTIONS,
             stamp=STAMP,
             resume=RESUME,
             use_sr_parser=USE_SR_PARSER):
    """
    Run Stanford CoreNLP

    Parameters
    ----------
    input
    out_dir
    annotators
    class_path
    version
    memory
    threads
    replace_ext
    output_ext
    options
    stamp
    resume
    use_sr_parser

    Returns
    -------

    """
    in_files = file_list(input)
    make_dir(out_dir)

    cmd = ['java']

    if memory:
        cmd.append('-Xmx' + memory)

    if class_path:
        class_path = '"{}"'.format(join(class_path or '.', "*"))
        cmd.append('-cp ' + class_path)

    cmd.append('edu.stanford.nlp.pipeline.StanfordCoreNLP')

    if annotators:
        cmd.append('-annotators ' + annotators)

    if stamp:
        replace_ext = True
        output_ext = '#scnlp_v{}{}'.format(version or '', output_ext)

    if replace_ext:
        cmd.append('-replaceExtension')

    if output_ext:
        cmd.append('-outputExtension "{}"'.format(output_ext))

    if out_dir:
        cmd.append('-outputDirectory ' + out_dir)

    if threads:
        cmd.append('-threads {}'.format(threads))

    if resume:
        in_files = [fname for fname in in_files
                    if not derive_path(fname,
                                       new_dir=out_dir,
                                       new_ext=output_ext).exists()]

    if options:
        cmd.append(options)

    if 'parse' in annotators and use_sr_parser:
        cmd.append(
            '-parse.model edu/stanford/nlp/models/srparser/englishSR.ser.gz')

    # create a temporary file with input filenames
    tmp_file = NamedTemporaryFile("wt", buffering=1)
    tmp_file.write('\n'.join(in_files) + "\n")

    cmd.append('-filelist ' + tmp_file.name)

    cmd = ' '.join(cmd)
    log.info('\n' + cmd)
    ret = check_output(cmd, shell=True, universal_newlines=True)
    log.info('\n' + ret)

    return ret
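A hypothetical call (paths and values are placeholders; the module-level defaults such as ANNOTATORS and CLASS_PATH are assumed to point at a local CoreNLP installation):

# hypothetical invocation
core_nlp('text/*.txt',
         out_dir='scnlp',
         annotators='tokenize,ssplit,pos,lemma,parse',
         memory='4g',
         threads=8,
         resume=True)
# builds and runs a shell command roughly of the form:
# java -Xmx4g -cp "<class_path>/*" edu.stanford.nlp.pipeline.StanfordCoreNLP
#     -annotators tokenize,ssplit,pos,lemma,parse -outputDirectory scnlp
#     -threads 8 ... -filelist <tmpfile>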