Example #1
File: text.py Project: OC-NTNU/megamouth
def get_text(session, core, doi, fields, hash_tags, resume, solr_url, text_dir):
    if doi.startswith('http://dx.doi.org/'):
        doi = doi[18:]
    quoted_doi = quote_doi(doi)
    text_path = derive_path('', new_dir=text_dir, new_corename=quoted_doi, new_ext='txt', append_tags=hash_tags)

    if resume and text_path.exists():
        log.info('skipping file {!r} because it exists'.format(text_path))
    else:
        doc = query_solr(session, solr_url, core, doi, fields)
        values = []
        for key in fields:
            try:
                val = doc[key]
            except KeyError:
                log.error('no {!r} field for doi {!r} in core {!r}'.format(key, doi, core))
                return
            # flatten list values, e.g. for fulltext
            if isinstance(val, list):
                values.append('\n'.join(val))
            else:
                values.append(val)
        text = '\n\n'.join(values)
        log.info('creating text file {!r}'.format(text_path))
        text_path.write_text(text)
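
Below is a minimal, self-contained sketch of the flattening step in get_text, run on a made-up Solr document dict; the field names and values are illustrative only.

# Hypothetical Solr document; field names and values are made up.
doc = {
    'title': 'Ocean acidification and coral calcification',
    'fulltext': ['First paragraph.', 'Second paragraph.'],
}
fields = ['title', 'fulltext']

values = []
for key in fields:
    val = doc[key]
    # multi-valued fields (e.g. fulltext) come back as lists, so flatten them
    values.append('\n'.join(val) if isinstance(val, list) else val)

text = '\n\n'.join(values)
print(text)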
Example #2
def tag_var_nodes(vars_dir, trees_dir, tagged_dir):
    """
    Tag variable nodes in trees

    Tag variable nodes in trees with a "_VAR:f:n:m:e" suffix, where
    f is the name of the parse file,
    n is the tree number,
    m is the variable's node number and
    e is the name of the pattern used for extracting this variable.
    Will only output those trees containing at least two variables.
    """
    # At first I used tregex's '-f' option to print the filename,
    # but when traversing the files in a directory,
    # it prints the wrong filenames (after the first one?),
    # so now the filename is encoded in the node label too.
    tagged_dir = Path(tagged_dir)
    tagged_dir.makedirs_p()

    for vars_fname in Path(vars_dir).glob('*.json'):
        d = defaultdict(list)

        # create a dict mapping each tree number to a list of
        # (nodeNumber, extractName) tuples for its variables
        for record in json.load(vars_fname.open()):
            pair = record['nodeNumber'], record['key']
            d[record['treeNumber']].append(pair)

        lemtree_fname = record['filename']
        parses = (Path(trees_dir) / lemtree_fname).lines()
        tagged_parses = []

        for tree_number, pairs in d.items():
            if len(pairs) > 1:
                # tree numbers in records count from one
                tree = Tree.fromstring(parses[tree_number - 1])
                # get NLTK-style indices for all nodes in a preorder
                # traversal of the tree
                positions = tree.treepositions()
                vars_count = 0

                for node_number, key in pairs:
                    # node numbers in records count from one
                    position = positions[node_number - 1]
                    subtree = tree[position]
                    try:
                        subtree.set_label(
                            '{}_VAR_{}'.format(subtree.label(), key))
                    except AttributeError:
                        log.error('skipping variable "{}" because it is a leaf '
                                  'node ({})'.format(subtree, key))
                    else:
                        vars_count += 1

                if vars_count > 1:
                    tagged_parses.append(tree.pformat(margin=99999))

        if tagged_parses:
            tagged_fname = derive_path(lemtree_fname, new_dir=tagged_dir)
            log.info('writing tagged trees to ' + tagged_fname)
            tagged_fname.write_lines(tagged_parses)
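
The JSON produced by the variable extraction step is not shown in this listing; the sketch below works through the grouping and tagging logic of tag_var_nodes on made-up records and a toy tree. Only the field names come from the code above, and NLTK is assumed to be installed.

from collections import defaultdict

from nltk import Tree

# Made-up variable records; field names match those read by tag_var_nodes.
records = [
    {'filename': 'doc1.parse', 'treeNumber': 1, 'nodeNumber': 2, 'key': 'x1'},
    {'filename': 'doc1.parse', 'treeNumber': 1, 'nodeNumber': 5, 'key': 'x2'},
]

d = defaultdict(list)
for record in records:
    d[record['treeNumber']].append((record['nodeNumber'], record['key']))

tree = Tree.fromstring('(S (NP (NN temperature)) (VP (VBZ rises)))')
# preorder positions; node numbers in the records are 1-based indices into this list
positions = tree.treepositions()

for node_number, key in d[1]:
    subtree = tree[positions[node_number - 1]]
    if isinstance(subtree, Tree):  # leaves are plain strings and cannot be relabelled
        subtree.set_label('{}_VAR_{}'.format(subtree.label(), key))

print(tree.pformat(margin=99999))
# (S (NP_VAR_x1 (NN temperature)) (VP_VAR_x2 (VBZ rises)))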
Example #3
def write_relations(rel_records, rels_dir):
    """
    write extracted relations per file as json records
    """
    rels_dir = Path(rels_dir)
    rels_dir.makedirs_p()

    for fname, rec_list in rel_records.items():
        rels_fname = derive_path(fname, new_dir=rels_dir, append_tags='rels',
                                 new_ext='json')
        log.info('writing extracted relations to ' + rels_fname)
        json.dump(rec_list, rels_fname.open('w'), indent=0)
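
A self-contained approximation of write_relations using only the standard library; the record contents are made up, and the '#rels' filename suffix is an assumption about what derive_path produces.

import json
from pathlib import Path

# Made-up relation records keyed by their source parse file.
rel_records = {
    'doc1.parse': [
        {'pattern': 'increase', 'var1': 'temperature', 'var2': 'evaporation'},
    ],
}

rels_dir = Path('rels')
rels_dir.mkdir(parents=True, exist_ok=True)

for fname, rec_list in rel_records.items():
    # assumed equivalent of derive_path(fname, new_dir=rels_dir,
    # append_tags='rels', new_ext='json')
    rels_fname = rels_dir / '{}#rels.json'.format(Path(fname).stem)
    with rels_fname.open('w') as f:
        json.dump(rec_list, f, indent=0)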
Example #4
def preproc_vars(trans_exec, trans_fname, in_vars_dir, out_vars_dir,
                 tmp_dir=None, resume=RESUME_PREP):
    """
    Preprocess variables

    Deletes determiners (DT), personal/possessive pronouns (PRP or PRP$) and
    list item markers (LS or LST).
    """
    # TODO: resume only works if tmp_dir is given
    if not tmp_dir:
        tmp = TemporaryDirectory()
        tmp_dir = tmp.name

    parts = [trans_exec, '--tag "#prep"']
    if resume:
        parts.append('--resume')
    parts += [in_vars_dir, tmp_dir, trans_fname]
    cmd = ' '.join(parts)
    log.info('\n' + cmd)
    # universal_newlines=True is passed so the return value will be a string
    # rather than bytes
    ret = check_output(cmd, shell=True, universal_newlines=True)
    log.info('\n' + ret)

    Path(out_vars_dir).makedirs_p()

    for in_vars_fname in Path(tmp_dir).files():
        out_vars_fname = derive_path(in_vars_fname, new_dir=out_vars_dir)

        if resume and out_vars_fname.exists():
            log.info('skipping existing preprocessed file ' + out_vars_fname)
            continue

        records = json.load(open(in_vars_fname))
        # Remove any var that has descendants
        # (i.e. from which a node was deleted).
        # Also remove empty vars and bare "NP" vars.
        out_vars_records = [rec for rec in records
                            if rec['subStr'] not in ('', 'NP') and
                            'descendants' not in rec]
        if out_vars_records:
            log.info('writing to preprocessed variable file ' + out_vars_fname)
            json.dump(out_vars_records, out_vars_fname.open('w'), indent=0)
        else:
            log.info('skipping empty preprocessed variable file ' +
                     out_vars_fname)
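
The filtering step in preproc_vars, shown self-contained on made-up records (the field names come from the code above):

# Keep only variables that are non-empty, not a bare "NP", and from which
# no node was deleted (no 'descendants' entry).
records = [
    {'key': 'x1', 'subStr': 'sea surface temperature'},
    {'key': 'x2', 'subStr': ''},
    {'key': 'x3', 'subStr': 'NP'},
    {'key': 'x4', 'subStr': 'sea surface', 'descendants': ['x1']},
]

out_vars_records = [rec for rec in records
                    if rec['subStr'] not in ('', 'NP') and
                    'descendants' not in rec]

print(out_vars_records)  # only the 'sea surface temperature' record remains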
Example #5
def extract_parse_trees(scnlp_files, parse_dir):
    """
    extract parse trees (PTB labeled bracket structures) from Stanford
    CoreNLP XML output
    """
    make_dir(parse_dir)

    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        nlp_doc = ElementTree(file=scnlp_fname)

        parse_fname = derive_path(scnlp_fname,
                                  new_dir=parse_dir,
                                  new_ext='.parse')
        log.info("writing " + parse_fname)

        with open(parse_fname, "wt", encoding="utf-8") as parse_file:
            for parse_elem in nlp_doc.findall(".//parse"):
                parse_file.write(parse_elem.text + "\n")
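
A self-contained sketch of the extraction in extract_parse_trees, run on a tiny stand-in for a Stanford CoreNLP XML document (real CoreNLP files also contain tokens, dependencies, etc.):

from xml.etree.ElementTree import fromstring

scnlp_xml = '''<root><document><sentences>
  <sentence id="1">
    <parse>(ROOT (S (NP (NN Temperature)) (VP (VBZ rises))))</parse>
  </sentence>
</sentences></document></root>'''

nlp_doc = fromstring(scnlp_xml)
for parse_elem in nlp_doc.findall('.//parse'):
    print(parse_elem.text.strip())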
Example #6
def extract_lemmatized_parse_trees(scnlp_files, parse_dir):
    """
    extract lemmatized parse trees (PTB labeled bracket structures) from
    Stanford CoreNLP XML output
    """
    make_dir(parse_dir)

    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        nlp_doc = ElementTree(file=scnlp_fname)

        parse_fname = derive_path(scnlp_fname,
                                  new_dir=parse_dir,
                                  new_ext='.parse')
        log.info("writing " + parse_fname)

        with open(parse_fname, "wt", encoding="utf-8") as parse_file:
            for sentence_elem in nlp_doc.iterfind(".//sentence"):
                lemmas = sentence_elem.iterfind("tokens/token/lemma")
                word_parse = sentence_elem.find("parse").text.strip()
                lemma_parse = " ".join(_lemmatized_node(node, lemmas)
                                       for node in word_parse.split(" "))
                parse_file.write(lemma_parse + "\n")
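
The helper _lemmatized_node is not included in this listing. Below is a hedged guess at its behaviour, assuming that splitting the bracketed parse on spaces yields either labels like '(NN' or terminals like 'rises))', and that lemmas is the iterator of <lemma> elements passed in above.

from xml.etree.ElementTree import Element

def _lemmatized_node(node, lemmas):
    """Hypothetical reimplementation: replace the word in a terminal node
    with its lemma, keeping any closing brackets."""
    if node.startswith('('):
        # non-terminal label such as '(NP' -- leave untouched
        return node
    word = node.rstrip(')')
    closing = node[len(word):]
    return next(lemmas).text + closing

# tiny demo with a stand-in <lemma> element
lemma = Element('lemma')
lemma.text = 'rise'
print(_lemmatized_node('rises))', iter([lemma])))  # rise))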
Example #7
def core_nlp(input,
             out_dir=OUT_DIR,
             annotators=ANNOTATORS,
             class_path=CLASS_PATH,
             version=VERSION,
             memory=MEMORY,
             threads=THREADS,
             replace_ext=REPLACE_EXT,
             output_ext=OUTPUT_EXT,
             options=OPTIONS,
             stamp=STAMP,
             resume=RESUME,
             use_sr_parser=USE_SR_PARSER):
    """
    Run Stanford CoreNLP

    Parameters
    ----------
    input : str or list of str
        input file(s), as accepted by file_list
    out_dir : str
        output directory for the CoreNLP result files
    annotators : str
        comma-separated list of CoreNLP annotators
    class_path : str
        directory containing the CoreNLP jar files
    version : str
        CoreNLP version string, used when stamping the output extension
    memory : str
        Java heap size passed to -Xmx (e.g. '3g')
    threads : int
        number of CoreNLP threads
    replace_ext : bool
        replace the extension of the input files in the output filenames
    output_ext : str
        extension of the output files
    options : str
        additional CoreNLP command line options
    stamp : bool
        include the CoreNLP version in the output extension
    resume : bool
        skip input files for which output already exists
    use_sr_parser : bool
        use the English shift-reduce parser model for the parse annotator

    Returns
    -------
    str
        console output of the CoreNLP process

    """
    in_files = file_list(input)
    make_dir(out_dir)

    cmd = ['java']

    if memory:
        cmd.append('-Xmx' + memory)

    if class_path:
        class_path = '"{}"'.format(join(class_path or '.', "*"))
        cmd.append('-cp ' + class_path)

    cmd.append('edu.stanford.nlp.pipeline.StanfordCoreNLP')

    if annotators:
        cmd.append('-annotators ' + annotators)

    if stamp:
        replace_ext = True
        output_ext = '#scnlp_v{}{}'.format(version or '', output_ext)

    if replace_ext:
        cmd.append('-replaceExtension')

    if output_ext:
        cmd.append('-outputExtension "{}"'.format(output_ext))

    if out_dir:
        cmd.append('-outputDirectory ' + out_dir)

    if threads:
        cmd.append('-threads {}'.format(threads))

    if resume:
        in_files = [fname for fname in in_files
                    if not derive_path(fname,
                                       new_dir=out_dir,
                                       new_ext=output_ext).exists()]

    if options:
        cmd.append(options)

    if 'parse' in annotators and use_sr_parser:
        cmd.append(
            '-parse.model edu/stanford/nlp/models/srparser/englishSR.ser.gz')

    # create a temporary file with input filenames
    tmp_file = NamedTemporaryFile("wt", buffering=1)
    tmp_file.write('\n'.join(in_files) + "\n")

    cmd.append('-filelist ' + tmp_file.name)

    cmd = ' '.join(cmd)
    log.info('\n' + cmd)
    ret = check_output(cmd, shell=True, universal_newlines=True)
    log.info('\n' + ret)

    return ret
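
A hypothetical invocation of core_nlp; the directory names, annotator list and memory setting are illustrative rather than the project's defaults, and the input argument is assumed to be whatever file_list accepts (it is not shown in this listing).

ret = core_nlp('text',
               out_dir='scnlp',
               annotators='tokenize,ssplit,pos,lemma,parse',
               memory='3g',
               threads=4,
               resume=True)
# The assembled command is logged before it runs; roughly (illustrative):
# java -Xmx3g -cp "/path/to/corenlp/*" edu.stanford.nlp.pipeline.StanfordCoreNLP
#     -annotators tokenize,ssplit,pos,lemma,parse ... -filelist <temporary file>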
Example #8
def vars_to_csv(vars_dir, scnlp_dir, text_dir, nodes_csv_dir,
                relation_csv_dir, max_n=None):
    """
    Transform extracted variables to csv tables that can be imported by neo4j

    Parameters
    ----------
    vars_dir : str
        directory containing files with extracted variables in json format
    scnlp_dir : str
        directory containing files with scnlp output in xml format
    text_dir : str
        directory containing files with original text or sentences (one per line)
    nodes_csv_dir : str
        output directory for nodes csv files
    relation_csv_dir : str
        output directory for relationships csv files
    max_n : int or None
        process max_n variable files

    Notes
    -----
    See http://neo4j.com/docs/stable/import-tool-header-format.html
    """
    # TODO: change article nodes to document
    # hold on to open files
    open_files = []

    def create_csv_file(dir, csv_fname,
                        header=(':START_ID', ':END_ID', ':TYPE')):
        csv_fname = Path(dir) / csv_fname
        log.info('creating ' + csv_fname)
        f = open(csv_fname, 'w', newline='')
        open_files.append(f)
        csv_file = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        csv_file.writerow(header)
        return csv_file

    Path(nodes_csv_dir).makedirs_p()
    Path(relation_csv_dir).makedirs_p()

    # create csv files for nodes
    articles_csv = create_csv_file(nodes_csv_dir,
                                   'articles.csv',
                                   ('doi:ID',
                                    'filename',
                                    ':LABEL'))

    sentences_csv = create_csv_file(nodes_csv_dir,
                                    'sentences.csv',
                                    ('sentID:ID',
                                     'treeNumber:int',
                                     'charOffsetBegin:int',
                                     'charOffsetEnd:int',
                                     'sentChars',
                                     ':LABEL'))

    variables_csv = create_csv_file(nodes_csv_dir,
                                    'variables.csv',
                                    ('subStr:ID',
                                     ':LABEL'))

    events_csv = create_csv_file(nodes_csv_dir,
                                 'events.csv',
                                 ('eventID:ID',
                                  'filename',
                                  'nodeNumber:int',
                                  'extractName',
                                  'charOffsetBegin:int',
                                  'charOffsetEnd:int',
                                  'direction',
                                  ':LABEL'))

    # create csv files for relations
    has_sent_csv = create_csv_file(relation_csv_dir,
                                   'has_sent.csv')
    has_var_csv = create_csv_file(relation_csv_dir,
                                  'has_var.csv')
    has_event_csv = create_csv_file(relation_csv_dir,
                                    'has_event.csv')
    tentails_var_csv = create_csv_file(relation_csv_dir,
                                       'tentails_var.csv',
                                       (':START_ID',
                                        ':END_ID',
                                        'transformName',
                                        ':TYPE'))

    # set of all variable types in text collection
    variable_types = set()

    # mapping from DOI to text files
    doi2txt = _doi2txt_fname(text_dir)

    for json_fname in Path(vars_dir).files('*.json')[:max_n]:
        records = json.load(open(json_fname))

        if not records:
            log.warning('skipping empty variables file: ' + json_fname)
            continue

        log.info('processing variables from file: ' + json_fname)

        doi = get_doi(json_fname)

        try:
            text_fname = doi2txt[doi]
        except KeyError:
            log.error('no matching text file for DOI ' + doi)
            continue

        text = open(text_fname).read()

        # read corenlp analysis
        tree_fname = records[0]['filename']
        scnlp_fname = derive_path(tree_fname, new_dir=scnlp_dir, new_ext='xml')
        xml_tree = etree.parse(scnlp_fname)
        sentences_elem = xml_tree.getroot()[0][0]

        # create article node
        articles_csv.writerow((doi, text_fname, 'Article'))

        tree_number = None

        # mapping of record's "key" to "subStr" attribute,
        # needed for TENTAILS_VAR relation
        key2var = {}

        for rec in records:
            if rec['treeNumber'] != tree_number:
                # moving to new tree
                tree_number = rec['treeNumber']
                sent_id = '{}/{}'.format(doi, tree_number)
                # get char offsets for sentence (tree numbers start at 1)
                sent_elem = sentences_elem[int(tree_number) - 1]
                begin = int(sent_elem[0][0][2].text)
                end = int(sent_elem[0][-1][3].text)
                sent_chars = text[begin:end]
                sentences_csv.writerow((sent_id,
                                        tree_number,
                                        begin,
                                        end,
                                        sent_chars,
                                        'Sentence'))
                has_sent_csv.writerow((doi,
                                       sent_id,
                                       'HAS_SENT'))

            key2var[rec['key']] = var_type = rec['subStr']

            if var_type not in variable_types:
                variables_csv.writerow((var_type,
                                        'VariableType'))
                variable_types.add(var_type)

            # FIXME weak method of detecting preprocessing
            if ('transformName' in rec and
                    not rec['transformName'].startswith('PreProc')):
                # variable is transformed, but not by preprocessing,
                # so it is tentailed
                ancestor_var_type = key2var[rec['ancestor']]
                tentails_var_csv.writerow((ancestor_var_type,
                                           var_type,
                                           rec['transformName'],
                                           'TENTAILS_VAR'))
            else:
                # observed event
                event_id = rec['key']
                event_labels = 'EventInst;' + rec['label'].capitalize() + 'Inst'
                events_csv.writerow((event_id,
                                     tree_fname,
                                     rec['nodeNumber'],
                                     rec['extractName'],
                                     rec['charOffsetBegin'],
                                     rec['charOffsetEnd'],
                                     rec['label'],
                                     event_labels))

                has_event_csv.writerow((sent_id,
                                        event_id,
                                        'HAS_EVENT'))

                has_var_csv.writerow((event_id,
                                      var_type,
                                      'HAS_VAR'))

    # release opened files
    for f in open_files:
        f.close()
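
A hypothetical invocation; the directory names are illustrative. The generated csv files follow the header conventions of Neo4j's batch import tool, as referenced in the docstring.

vars_to_csv(vars_dir='vars',
            scnlp_dir='scnlp',
            text_dir='text',
            nodes_csv_dir='csv/nodes',
            relation_csv_dir='csv/relations',
            max_n=None)
# csv/nodes will contain articles.csv, sentences.csv, variables.csv and events.csv;
# csv/relations will contain has_sent.csv, has_var.csv, has_event.csv and tentails_var.csv.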