import collections
import extractor_util as eutil
import sys
from dep_alignment.alignment_util import row_to_canonical_match_tree, DepParentsCycleException, OverlappingCandidatesException, RootException
from dep_alignment.multi_dep_alignment import MultiDepAlignment
import os
import random
import time

# This defines the Row object that we read in to the extractor: one
# gene-phenotype candidate pair plus the full sentence annotations.
parser = eutil.RowParser([('relation_id', 'text'),
                          ('doc_id', 'text'),
                          ('section_id', 'text'),
                          ('sent_id', 'int'),
                          ('gene_mention_id', 'text'),
                          ('gene_name', 'text'),
                          ('gene_wordidxs', 'int[]'),
                          ('gene_is_correct', 'boolean'),
                          ('pheno_mention_id', 'text'),
                          ('pheno_entity', 'text'),
                          ('pheno_wordidxs', 'int[]'),
                          ('pheno_is_correct', 'boolean'),
                          ('words', 'text[]'),
                          ('lemmas', 'text[]'),
                          ('poses', 'text[]'),
                          ('dep_paths', 'text[]'),
                          ('dep_parents', 'int[]'),
                          # NOTE(review): 'ners' is declared scalar 'text' while the other
                          # sentence-level columns are arrays; sibling extractors declare
                          # ners as 'text[]' -- confirm this is intended.
                          ('ners', 'text')])

# Row schema for the distant-supervision input rows: sentence annotations
# plus the gene/pheno word indexes only.
ds_parser = eutil.RowParser([('words', 'text[]'),
                             ('lemmas', 'text[]'),
                             ('poses', 'text[]'),
                             ('dep_paths', 'text[]'),
                             ('dep_parents', 'int[]'),
                             ('gene_wordidxs', 'int[]'),
                             ('pheno_wordidxs', 'int[]')])

# This defines the output Relation object (field list continues past this chunk)
Relation = collections.namedtuple('Relation', [
    'dd_id', 'relation_id', 'doc_id', 'section_id', 'sent_id',
#!/usr/bin/env python from collections import namedtuple import extractor_util as util import ddlib import re # This defines the Row object that we read in to the extractor parser = util.RowParser([('doc_id', 'text'), ('section_id', 'text'), ('sent_id', 'int'), ('words', 'text[]'), ('lemmas', 'text[]'), ('poses', 'text[]'), ('ners', 'text[]'), ('dep_paths', 'text[]'), ('dep_parents', 'int[]'), ('mention_id', 'text'), ('mention_type', 'text'), ('mention_wordidxs', 'int[]')]) Feature = namedtuple('Feature', ['doc_id', 'section_id', 'mention_id', 'name']) ENSEMBL_TYPES = ['NONCANONICAL', 'CANONICAL', 'REFSEQ'] def get_custom_features(row): gene_word = row.words[row.mention_wordidxs[0]] if re.match('^[ATGCN]{1,5}$', gene_word): yield 'GENE_ONLY_BASES' def get_features_for_row(row): #OPTS = config.GENE['F'] features = [] f = Feature(doc_id=row.doc_id, section_id=row.section_id,
from collections import defaultdict, namedtuple
import sys
import re
import os
import random
from itertools import chain
import extractor_util as util
import data_util as dutil
import config

# This defines the Row object that we read in to the extractor: one sentence
# plus parallel arrays describing phenotype-abbreviation (pa_*) candidates.
parser = util.RowParser([('doc_id', 'text'),
                         ('section_id', 'text'),
                         ('sent_id', 'int'),
                         ('words', 'text[]'),
                         ('lemmas', 'text[]'),
                         ('poses', 'text[]'),
                         ('ners', 'text[]'),
                         ('pa_abbrevs', 'text[]'),
                         ('pheno_entities', 'text[]'),
                         ('pa_section_ids', 'text[]'),
                         ('pa_sent_ids', 'int[]')])

# One input row expanded so each abbreviation candidate gets its own record
# (scalar pa_* fields instead of the parallel arrays above).
ExpandedRow = namedtuple('ExpandedRow', [
    'doc_id', 'section_id', 'sent_id', 'words', 'lemmas', 'poses', 'ners',
    'pa_abbrev', 'pheno_entity', 'pa_section_id', 'pa_sent_id'])

# This defines the output Mention object
Mention = namedtuple('Mention', [
    'dd_id', 'doc_id', 'section_id', 'sent_id', 'wordidxs', 'mention_id',
    'mention_supertype', 'mention_subtype', 'entity', 'words', 'is_correct'])
#!/usr/bin/env python from collections import defaultdict, namedtuple import sys import re import os import random from itertools import chain import extractor_util as util import data_util as dutil import config onto_path = lambda p: '%s/onto/%s' % (os.environ['GDD_HOME'], p) # This defines the Row object that we read in to the extractor parser = util.RowParser([('doc_id', 'text'), ('section_id', 'text'), ('sent_id', 'int'), ('words', 'text[]'), ('lemmas', 'text[]'), ('poses', 'text[]'), ('ners', 'text[]')]) # This defines the output Mention object Mention = namedtuple('Mention', [ 'dd_id', 'doc_id', 'section_id', 'sent_id', 'wordidxs', 'mention_id', 'mention_supertype', 'mention_subtype', 'entity', 'words', 'is_correct' ]) ### CANDIDATE EXTRACTION ### HF = config.PHENO['HF'] SR = config.PHENO['SR'] def enrich_phenos(rows): ret = []
import abbreviations import config import extractor_util as util import levenshtein CACHE = dict() # Cache results of disk I/O # This defines the Row object that we read in to the extractor parser = util.RowParser([ ('doc_id', 'text'), ('section_id', 'text'), ('sent_id', 'int'), ('words', 'text[]'), ('dep_paths', 'text[]'), ('dep_parents', 'int[]'), ('lemmas', 'text[]'), ('poses', 'text[]'), ('ners', 'text[]'), ('gene_wordidx_array', 'int[]')]) # This defines the output Mention object Mention = collections.namedtuple('Mention', [ 'dd_id', 'doc_id', 'section_id', 'sent_id', 'short_wordidxs', 'long_wordidxs',
import abbreviations
import collections  # used below for collections.namedtuple; was missing from the visible imports
import config
import extractor_util as util
import levenshtein
import data_util as dutil

CACHE = dict()  # Cache results of disk I/O

# This defines the Row object that we read in to the extractor: one sentence's
# candidate mentions encoded as parallel arrays.
parser = util.RowParser([
    ('doc_id', 'text'),
    ('section_id', 'text'),
    ('sent_id', 'int'),
    ('wordidxs', 'int[]'),
    ('mention_ids', 'text[]'),
    ('supertypes', 'text[]'),
    ('subtypes', 'text[]'),
    ('entities', 'text[]'),
    ('words', 'text[]'),
    ('is_corrects', 'boolean[]'),
])

# This defines the output Mention object: one record per mention.
Mention = collections.namedtuple('Mention', [
    'dd_id', 'doc_id', 'section_id', 'sent_id', 'wordidxs', 'mention_id',
    'supertype', 'subtype', 'entity', 'words', 'is_correct'])

# Loaded once at import time; read_hpo_dag() performs disk I/O.
hpo_dag = dutil.read_hpo_dag()
#! /usr/bin/env python
"""Debug utility: for each token of every stdin sentence, print the token's
dependency-path neighbors (indexes and words) to stderr."""
import dep_util
import extractor_util as util
import sys

# This defines the Row object that we read in to the extractor.
# NOTE(review): 'sent_id' is declared 'text' here while sibling extractors
# use 'int' -- confirm this is intended.
parser = util.RowParser([('doc_id', 'text'),
                         ('section_id', 'text'),
                         ('sent_id', 'text'),
                         ('dep_parents', 'int[]'),
                         ('dep_paths', 'text[]'),
                         ('words', 'text[]')])

if __name__ == "__main__":
    for line in sys.stdin:
        row = parser.parse_tsv_row(line)
        dpd = dep_util.DepPathDAG(row.dep_parents, row.dep_paths, row.words)
        for i in xrange(0, len(row.words)):
            # Fix: the original comprehension reused 'i' as its loop variable,
            # shadowing (and, in Python 2, clobbering) the outer index; it also
            # computed dpd.neighbors(i) twice.
            neighbor_idxs = dpd.neighbors(i)
            neighbor_words = [row.words[j] for j in neighbor_idxs]
            sys.stderr.write(
                str((i, row.words[i], neighbor_idxs, neighbor_words)) + '\n')
import config # This defines the Row object that we read in to the extractor parser = util.RowParser([ ('doc_id', 'text'), ('gene_section_id', 'text'), ('gene_sent_id', 'int'), ('variant_section_id', 'text'), ('variant_sent_id', 'int'), ('gene_words', 'text[]'), ('gene_lemmas', 'text[]'), ('gene_poses', 'text[]'), ('gene_dep_paths', 'text[]'), ('gene_dep_parents', 'int[]'), ('variant_words', 'text[]'), ('variant_lemmas', 'text[]'), ('variant_poses', 'text[]'), ('variant_dep_paths', 'text[]'), ('variant_dep_parents', 'int[]'), ('gene_mention_ids', 'text[]'), ('gene_names', 'text[]'), ('gene_wordidxs', 'int[][]'), ('gene_is_corrects', 'boolean[]'), ('variant_mention_ids', 'text[]'), ('variant_entities', 'text[]'), ('variant_wordidxs', 'int[][]'), ('variant_is_corrects', 'boolean[]')]) # This defines the output Relation object Relation = collections.namedtuple('Relation', [
#!/usr/bin/env python import extractor_util as util from collections import namedtuple import os import sys import ddlib parser = util.RowParser([('relation_id', 'text'), ('doc_id', 'text'), ('section_id', 'text'), ('sent_id', 'int'), ('genevar_mention_id', 'text'), ('genevar_wordidxs', 'int[]'), ('pheno_mention_id', 'text'), ('pheno_wordidxs', 'int[]'), ('words', 'text[]'), ('lemmas', 'text[]'), ('poses', 'text[]'), ('ners', 'text[]'), ('dep_paths', 'text[]'), ('dep_parents', 'int[]')]) Feature = namedtuple('Feature', ['doc_id', 'section_id', 'relation_id', 'name']) def get_features_for_candidate(row): """Extract features for candidate mention- both generic ones from ddlib & custom features""" features = [] f = Feature(doc_id=row.doc_id, section_id=row.section_id, relation_id=row.relation_id, name=None) dds = util.create_ddlib_sentence(row) # (1) GENERIC FEATURES from ddlib
import collections  # used below for collections.namedtuple; was missing from the visible imports
import extractor_util as util
import data_util as dutil
import dep_util as deps
import os
import random
import re
import sys
import config

# This defines the Row object that we read in to the extractor: one sentence
# with its genevar and pheno candidate mentions as parallel arrays.
parser = util.RowParser([('doc_id', 'text'),
                         ('section_id', 'text'),
                         ('sent_id', 'int'),
                         ('words', 'text[]'),
                         ('lemmas', 'text[]'),
                         ('poses', 'text[]'),
                         ('dep_paths', 'text[]'),
                         ('dep_parents', 'int[]'),
                         ('genevar_mention_ids', 'text[]'),
                         ('genevar_entities', 'text[]'),
                         ('genevar_wordidxs', 'int[][]'),
                         ('genevar_is_corrects', 'boolean[]'),
                         ('pheno_mention_ids', 'text[]'),
                         ('pheno_entities', 'text[]'),
                         ('pheno_wordidxs', 'int[][]'),
                         ('pheno_is_corrects', 'boolean[]')])

# This defines the output Relation object: one genevar-pheno candidate pair.
Relation = collections.namedtuple('Relation', [
    'dd_id', 'relation_id', 'doc_id', 'section_id', 'sent_id',
    'genevar_mention_id', 'genevar_entity', 'genevar_wordidxs',
    'genevar_is_correct', 'pheno_mention_id', 'pheno_entity',
    'pheno_wordidxs', 'pheno_is_correct', 'is_correct', 'supertype',
    'subtype'])

### CANDIDATE EXTRACTION ###
import collections
import extractor_util as util
import re
import sys

CACHE = dict()  # Cache results of disk I/O

# This defines the Row object that we read in to the extractor.
# NOTE(review): 'poses', 'dep_paths', and 'dep_parents' are declared scalar
# 'text' here while sibling extractors use 'text[]'/'int[]' -- confirm intended.
parser = util.RowParser([
    ('doc_id', 'text'),
    ('section_id', 'text'),
    ('sent_id', 'int'),
    ('words', 'text[]'),
    ('lemmas', 'text[]'),
    ('poses', 'text'),
    ('dep_paths', 'text'),
    ('dep_parents', 'text'),
    ('gene_wordidxs', 'int[][]'),
    ('gene_supertypes', 'text[]'),
    ('pheno_wordidxs', 'int[][]'),
    ('pheno_supertypes', 'text[]')])

# This defines the output Mention object (field list continues past this chunk)
Mention = collections.namedtuple('Mention', [
    'doc_id', 'section_id', 'sent_id', 'words', 'words_ner',