import collections
import extractor_util as eutil
import sys
from dep_alignment.alignment_util import row_to_canonical_match_tree, DepParentsCycleException, OverlappingCandidatesException, RootException
from dep_alignment.multi_dep_alignment import MultiDepAlignment
import os
import random
import time

# This defines the Row object that we read in to the extractor: one
# gene-phenotype candidate pair plus the full sentence annotations.
parser = eutil.RowParser([('relation_id', 'text'),
                          ('doc_id', 'text'),
                          ('section_id', 'text'),
                          ('sent_id', 'int'),
                          ('gene_mention_id', 'text'),
                          ('gene_name', 'text'),
                          ('gene_wordidxs', 'int[]'),
                          ('gene_is_correct', 'boolean'),
                          ('pheno_mention_id', 'text'),
                          ('pheno_entity', 'text'),
                          ('pheno_wordidxs', 'int[]'),
                          ('pheno_is_correct', 'boolean'),
                          ('words', 'text[]'),
                          ('lemmas', 'text[]'),
                          ('poses', 'text[]'),
                          ('dep_paths', 'text[]'),
                          ('dep_parents', 'int[]'),
                          # NOTE(review): 'ners' is declared scalar 'text' while the other
                          # sentence-level columns are arrays; sibling extractors declare
                          # ners as 'text[]' -- confirm this is intended.
                          ('ners', 'text')])

# Row schema for the distant-supervision input rows: sentence annotations
# plus the gene/pheno word indexes only.
ds_parser = eutil.RowParser([('words', 'text[]'),
                             ('lemmas', 'text[]'),
                             ('poses', 'text[]'),
                             ('dep_paths', 'text[]'),
                             ('dep_parents', 'int[]'),
                             ('gene_wordidxs', 'int[]'),
                             ('pheno_wordidxs', 'int[]')])

# This defines the output Relation object (field list continues past this chunk)
Relation = collections.namedtuple('Relation', [
    'dd_id', 'relation_id', 'doc_id', 'section_id', 'sent_id',
#!/usr/bin/env python from collections import namedtuple import extractor_util as util import ddlib import re # This defines the Row object that we read in to the extractor parser = util.RowParser([('doc_id', 'text'), ('section_id', 'text'), ('sent_id', 'int'), ('words', 'text[]'), ('lemmas', 'text[]'), ('poses', 'text[]'), ('ners', 'text[]'), ('dep_paths', 'text[]'), ('dep_parents', 'int[]'), ('mention_id', 'text'), ('mention_type', 'text'), ('mention_wordidxs', 'int[]')]) Feature = namedtuple('Feature', ['doc_id', 'section_id', 'mention_id', 'name']) ENSEMBL_TYPES = ['NONCANONICAL', 'CANONICAL', 'REFSEQ'] def get_custom_features(row): gene_word = row.words[row.mention_wordidxs[0]] if re.match('^[ATGCN]{1,5}$', gene_word): yield 'GENE_ONLY_BASES' def get_features_for_row(row): #OPTS = config.GENE['F'] features = [] f = Feature(doc_id=row.doc_id, section_id=row.section_id,
from collections import defaultdict, namedtuple
import sys
import re
import os
import random
from itertools import chain
import extractor_util as util
import data_util as dutil
import config

# This defines the Row object that we read in to the extractor: one sentence
# plus parallel arrays describing phenotype-abbreviation (pa_*) candidates.
parser = util.RowParser([('doc_id', 'text'),
                         ('section_id', 'text'),
                         ('sent_id', 'int'),
                         ('words', 'text[]'),
                         ('lemmas', 'text[]'),
                         ('poses', 'text[]'),
                         ('ners', 'text[]'),
                         ('pa_abbrevs', 'text[]'),
                         ('pheno_entities', 'text[]'),
                         ('pa_section_ids', 'text[]'),
                         ('pa_sent_ids', 'int[]')])

# One input row expanded so each abbreviation candidate gets its own record
# (scalar pa_* fields instead of the parallel arrays above).
ExpandedRow = namedtuple('ExpandedRow', [
    'doc_id', 'section_id', 'sent_id', 'words', 'lemmas', 'poses', 'ners',
    'pa_abbrev', 'pheno_entity', 'pa_section_id', 'pa_sent_id'])

# This defines the output Mention object
Mention = namedtuple('Mention', [
    'dd_id', 'doc_id', 'section_id', 'sent_id', 'wordidxs', 'mention_id',
    'mention_supertype', 'mention_subtype', 'entity', 'words', 'is_correct'])
#!/usr/bin/env python from collections import defaultdict, namedtuple import sys import re import os import random from itertools import chain import extractor_util as util import data_util as dutil import config onto_path = lambda p: '%s/onto/%s' % (os.environ['GDD_HOME'], p) # This defines the Row object that we read in to the extractor parser = util.RowParser([('doc_id', 'text'), ('section_id', 'text'), ('sent_id', 'int'), ('words', 'text[]'), ('lemmas', 'text[]'), ('poses', 'text[]'), ('ners', 'text[]')]) # This defines the output Mention object Mention = namedtuple('Mention', [ 'dd_id', 'doc_id', 'section_id', 'sent_id', 'wordidxs', 'mention_id', 'mention_supertype', 'mention_subtype', 'entity', 'words', 'is_correct' ]) ### CANDIDATE EXTRACTION ### HF = config.PHENO['HF'] SR = config.PHENO['SR'] def enrich_phenos(rows): ret = []
import abbreviations import config import extractor_util as util import levenshtein CACHE = dict() # Cache results of disk I/O # This defines the Row object that we read in to the extractor parser = util.RowParser([ ('doc_id', 'text'), ('section_id', 'text'), ('sent_id', 'int'), ('words', 'text[]'), ('dep_paths', 'text[]'), ('dep_parents', 'int[]'), ('lemmas', 'text[]'), ('poses', 'text[]'), ('ners', 'text[]'), ('gene_wordidx_array', 'int[]')]) # This defines the output Mention object Mention = collections.namedtuple('Mention', [ 'dd_id', 'doc_id', 'section_id', 'sent_id', 'short_wordidxs', 'long_wordidxs',
import abbreviations
import collections  # used below for collections.namedtuple; was missing from the visible imports
import config
import extractor_util as util
import levenshtein
import data_util as dutil

CACHE = dict()  # Cache results of disk I/O

# This defines the Row object that we read in to the extractor: one sentence's
# candidate mentions encoded as parallel arrays.
parser = util.RowParser([
    ('doc_id', 'text'),
    ('section_id', 'text'),
    ('sent_id', 'int'),
    ('wordidxs', 'int[]'),
    ('mention_ids', 'text[]'),
    ('supertypes', 'text[]'),
    ('subtypes', 'text[]'),
    ('entities', 'text[]'),
    ('words', 'text[]'),
    ('is_corrects', 'boolean[]'),
])

# This defines the output Mention object: one record per mention.
Mention = collections.namedtuple('Mention', [
    'dd_id', 'doc_id', 'section_id', 'sent_id', 'wordidxs', 'mention_id',
    'supertype', 'subtype', 'entity', 'words', 'is_correct'])

# Loaded once at import time; read_hpo_dag() performs disk I/O.
hpo_dag = dutil.read_hpo_dag()
#! /usr/bin/env python
"""Debug utility: for each token of every stdin sentence, print the token's
dependency-path neighbors (indexes and words) to stderr."""
import dep_util
import extractor_util as util
import sys

# This defines the Row object that we read in to the extractor.
# NOTE(review): 'sent_id' is declared 'text' here while sibling extractors
# use 'int' -- confirm this is intended.
parser = util.RowParser([('doc_id', 'text'),
                         ('section_id', 'text'),
                         ('sent_id', 'text'),
                         ('dep_parents', 'int[]'),
                         ('dep_paths', 'text[]'),
                         ('words', 'text[]')])

if __name__ == "__main__":
    for line in sys.stdin:
        row = parser.parse_tsv_row(line)
        dpd = dep_util.DepPathDAG(row.dep_parents, row.dep_paths, row.words)
        for i in xrange(0, len(row.words)):
            # Fix: the original comprehension reused 'i' as its loop variable,
            # shadowing (and, in Python 2, clobbering) the outer index; it also
            # computed dpd.neighbors(i) twice.
            neighbor_idxs = dpd.neighbors(i)
            neighbor_words = [row.words[j] for j in neighbor_idxs]
            sys.stderr.write(
                str((i, row.words[i], neighbor_idxs, neighbor_words)) + '\n')
import config # This defines the Row object that we read in to the extractor parser = util.RowParser([ ('doc_id', 'text'), ('gene_section_id', 'text'), ('gene_sent_id', 'int'), ('variant_section_id', 'text'), ('variant_sent_id', 'int'), ('gene_words', 'text[]'), ('gene_lemmas', 'text[]'), ('gene_poses', 'text[]'), ('gene_dep_paths', 'text[]'), ('gene_dep_parents', 'int[]'), ('variant_words', 'text[]'), ('variant_lemmas', 'text[]'), ('variant_poses', 'text[]'), ('variant_dep_paths', 'text[]'), ('variant_dep_parents', 'int[]'), ('gene_mention_ids', 'text[]'), ('gene_names', 'text[]'), ('gene_wordidxs', 'int[][]'), ('gene_is_corrects', 'boolean[]'), ('variant_mention_ids', 'text[]'), ('variant_entities', 'text[]'), ('variant_wordidxs', 'int[][]'), ('variant_is_corrects', 'boolean[]')]) # This defines the output Relation object Relation = collections.namedtuple('Relation', [
#!/usr/bin/env python import extractor_util as util from collections import namedtuple import os import sys import ddlib parser = util.RowParser([('relation_id', 'text'), ('doc_id', 'text'), ('section_id', 'text'), ('sent_id', 'int'), ('genevar_mention_id', 'text'), ('genevar_wordidxs', 'int[]'), ('pheno_mention_id', 'text'), ('pheno_wordidxs', 'int[]'), ('words', 'text[]'), ('lemmas', 'text[]'), ('poses', 'text[]'), ('ners', 'text[]'), ('dep_paths', 'text[]'), ('dep_parents', 'int[]')]) Feature = namedtuple('Feature', ['doc_id', 'section_id', 'relation_id', 'name']) def get_features_for_candidate(row): """Extract features for candidate mention- both generic ones from ddlib & custom features""" features = [] f = Feature(doc_id=row.doc_id, section_id=row.section_id, relation_id=row.relation_id, name=None) dds = util.create_ddlib_sentence(row) # (1) GENERIC FEATURES from ddlib
import collections  # used below for collections.namedtuple; was missing from the visible imports
import extractor_util as util
import data_util as dutil
import dep_util as deps
import os
import random
import re
import sys
import config

# This defines the Row object that we read in to the extractor: one sentence
# with its genevar and pheno candidate mentions as parallel arrays.
parser = util.RowParser([('doc_id', 'text'),
                         ('section_id', 'text'),
                         ('sent_id', 'int'),
                         ('words', 'text[]'),
                         ('lemmas', 'text[]'),
                         ('poses', 'text[]'),
                         ('dep_paths', 'text[]'),
                         ('dep_parents', 'int[]'),
                         ('genevar_mention_ids', 'text[]'),
                         ('genevar_entities', 'text[]'),
                         ('genevar_wordidxs', 'int[][]'),
                         ('genevar_is_corrects', 'boolean[]'),
                         ('pheno_mention_ids', 'text[]'),
                         ('pheno_entities', 'text[]'),
                         ('pheno_wordidxs', 'int[][]'),
                         ('pheno_is_corrects', 'boolean[]')])

# This defines the output Relation object: one genevar-pheno candidate pair.
Relation = collections.namedtuple('Relation', [
    'dd_id', 'relation_id', 'doc_id', 'section_id', 'sent_id',
    'genevar_mention_id', 'genevar_entity', 'genevar_wordidxs',
    'genevar_is_correct', 'pheno_mention_id', 'pheno_entity',
    'pheno_wordidxs', 'pheno_is_correct', 'is_correct', 'supertype',
    'subtype'])

### CANDIDATE EXTRACTION ###
import collections
import extractor_util as util
import re
import sys

CACHE = dict()  # Cache results of disk I/O

# This defines the Row object that we read in to the extractor.
# NOTE(review): 'poses', 'dep_paths', and 'dep_parents' are declared scalar
# 'text' here while sibling extractors use 'text[]'/'int[]' -- confirm intended.
parser = util.RowParser([
    ('doc_id', 'text'),
    ('section_id', 'text'),
    ('sent_id', 'int'),
    ('words', 'text[]'),
    ('lemmas', 'text[]'),
    ('poses', 'text'),
    ('dep_paths', 'text'),
    ('dep_parents', 'text'),
    ('gene_wordidxs', 'int[][]'),
    ('gene_supertypes', 'text[]'),
    ('pheno_wordidxs', 'int[][]'),
    ('pheno_supertypes', 'text[]')])

# This defines the output Mention object (field list continues past this chunk)
Mention = collections.namedtuple('Mention', [
    'doc_id', 'section_id', 'sent_id', 'words', 'words_ner',