Example #1
def _doi2txt_fname(text_dir):
    """
    Create a dict mapping DOI to path of input text file
    """
    doi2txt = {}

    for p in Path(text_dir).files():
        doi = get_doi(p)
        if doi in doi2txt:
            log.error('DOI {} already mapped to text file {}; '
                      'ignoring text file {}'.format(doi, doi2txt[doi], p))
        else:
            doi2txt[doi] = p

    return doi2txt
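
A minimal usage sketch (the directory name is hypothetical; Path is assumed to come from the path.py package and get_doi from the surrounding project, as used in the snippet above):

doi2txt = _doi2txt_fname('data/text')
print('{} text files indexed by DOI'.format(len(doi2txt)))
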
Example #2
def vars_to_csv(vars_dir, scnlp_dir, text_dir, nodes_csv_dir,
                relation_csv_dir, max_n=None):
    """
    Transform extracted variables to csv tables that can be imported by neo4j

    Parameters
    ----------
    vars_dir : str
        directory containing files with extracted variables in json format
    scnlp_dir : str
        directory containing files with scnlp output in xml format
    text_dir : str
        directory containing files with original text or sentences (one per line)
    nodes_csv_dir : str
        output directory for nodes csv files
    relation_csv_dir : str
        output directory for relationships csv files
    max_n : int or None
        process max_n variable files

    Notes
    -----
    See http://neo4j.com/docs/stable/import-tool-header-format.html
    """
    # TODO: change article nodes to document
    # hold on to open files
    open_files = []

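    # helper: open a csv file in the given directory, write its header row,
    # and keep the handle so all files can be closed at the end; the default
    # header is the relationship-file format of the neo4j import tool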
    def create_csv_file(dir, csv_fname,
                        header=(':START_ID', ':END_ID', ':TYPE')):
        csv_fname = Path(dir) / csv_fname
        log.info('creating ' + csv_fname)
        f = open(csv_fname, 'w', newline='')
        open_files.append(f)
        csv_file = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        csv_file.writerow(header)
        return csv_file

    Path(nodes_csv_dir).makedirs_p()
    Path(relation_csv_dir).makedirs_p()

    # create csv files for nodes
    articles_csv = create_csv_file(nodes_csv_dir,
                                   'articles.csv',
                                   ('doi:ID',
                                    'filename',
                                    ':LABEL'))

    sentences_csv = create_csv_file(nodes_csv_dir,
                                    'sentences.csv',
                                    ('sentID:ID',
                                     'treeNumber:int',
                                     'charOffsetBegin:int',
                                     'charOffsetEnd:int',
                                     'sentChars',
                                     ':LABEL'))

    variables_csv = create_csv_file(nodes_csv_dir,
                                    'variables.csv',
                                    ('subStr:ID',
                                     ':LABEL'))

    events_csv = create_csv_file(nodes_csv_dir,
                                 'events.csv',
                                 ('eventID:ID',
                                  'filename',
                                  'nodeNumber:int',
                                  'extractName',
                                  'charOffsetBegin:int',
                                  'charOffsetEnd:int',
                                  'direction',
                                  ':LABEL'))

    # create csv files for relations
    has_sent_csv = create_csv_file(relation_csv_dir,
                                   'has_sent.csv')
    has_var_csv = create_csv_file(relation_csv_dir,
                                  'has_var.csv')
    has_event_csv = create_csv_file(relation_csv_dir,
                                    'has_event.csv')
    tentails_var_csv = create_csv_file(relation_csv_dir,
                                       'tentails_var.csv',
                                       (':START_ID',
                                        ':END_ID',
                                        'transformName',
                                        ':TYPE'))

    # set of all variable types in text collection
    variable_types = set()

    # mapping from DOI to text files
    doi2txt = _doi2txt_fname(text_dir)

    for json_fname in Path(vars_dir).files('*.json')[:max_n]:
        with open(json_fname) as json_file:
            records = json.load(json_file)

        if not records:
            log.warning('skipping empty variables file: ' + json_fname)
            continue

        log.info('processing variables from file: ' + json_fname)

        doi = get_doi(json_fname)

        try:
            text_fname = doi2txt[doi]
        except KeyError:
            log.error('no matching text file for DOI ' + doi)
            continue

        with open(text_fname) as text_file:
            text = text_file.read()

        # read corenlp analysis
        tree_fname = records[0]['filename']
        scnlp_fname = derive_path(tree_fname, new_dir=scnlp_dir, new_ext='xml')
        xml_tree = etree.parse(scnlp_fname)
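        # navigate to the <sentences> element of the CoreNLP XML output
        # (root -> document -> sentences)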
        sentences_elem = xml_tree.getroot()[0][0]

        # create article node
        articles_csv.writerow((doi, text_fname, 'Article'))

        tree_number = None

        # mapping of record's "key" to "subStr" attribute,
        # needed for TENTAILS_VAR relation
        key2var = {}

        for rec in records:
            if rec['treeNumber'] != tree_number:
                # moving to new tree
                tree_number = rec['treeNumber']
                sent_id = '{}/{}'.format(doi, tree_number)
                # get char offsets for sentence (tree numbers start at 1)
                sent_elem = sentences_elem[int(tree_number) - 1]
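                # first token's CharacterOffsetBegin and last token's CharacterOffsetEnd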
                begin = int(sent_elem[0][0][2].text)
                end = int(sent_elem[0][-1][3].text)
                sent_chars = text[begin:end]
                sentences_csv.writerow((sent_id,
                                        tree_number,
                                        begin,
                                        end,
                                        sent_chars,
                                        'Sentence'))
                has_sent_csv.writerow((doi,
                                       sent_id,
                                       'HAS_SENT'))

            key2var[rec['key']] = var_type = rec['subStr']

            if var_type not in variable_types:
                variables_csv.writerow((var_type,
                                        'VariableType'))
                variable_types.add(var_type)

            # FIXME weak method of detecting preprocessing
            if ('transformName' in rec and
                    not rec['transformName'].startswith('PreProc')):
                # variable is transformed, but not by preprocessing,
                # so it is tentailed
                ancestor_var_type = key2var[rec['ancestor']]
                tentails_var_csv.writerow((ancestor_var_type,
                                           var_type,
                                           rec['transformName'],
                                           'TENTAILS_VAR'))
            else:
                # observed event
                event_id = rec['key']
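                # multiple labels in the :LABEL column are separated by ';'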
                event_labels = 'EventInst;' + rec['label'].capitalize() + 'Inst'
                events_csv.writerow((event_id,
                                     tree_fname,
                                     rec['nodeNumber'],
                                     rec['extractName'],
                                     rec['charOffsetBegin'],
                                     rec['charOffsetEnd'],
                                     rec['label'],
                                     event_labels))

                has_event_csv.writerow((sent_id,
                                        event_id,
                                        'HAS_EVENT'))

                has_var_csv.writerow((event_id,
                                      var_type,
                                      'HAS_VAR'))

    # release opened files
    for f in open_files:
        f.close()
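
A hypothetical end-to-end call (all directory names are placeholders):

vars_to_csv(vars_dir='data/vars',
            scnlp_dir='data/scnlp',
            text_dir='data/text',
            nodes_csv_dir='csv/nodes',
            relation_csv_dir='csv/relations',
            max_n=10)

The resulting node and relationship CSV files follow the header format of Neo4j's offline batch import tool; see the URL in the Notes above for the exact headers and command-line flags.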