Code example #1
0
def featurize(supervision_rules, hard_filters):
    """Stream TSV rows from stdin, supervise each, and print results.

    Every relation produced by create_supervised_relation for a parsed
    row is written straight to stdout so memory usage stays flat.
    """
    for raw_line in sys.stdin:
        parsed_row = parser.parse_tsv_row(raw_line)
        relations = create_supervised_relation(parsed_row,
                                               SR=supervision_rules,
                                               HF=hard_filters)
        for relation in relations:
            util.print_tsv_output(relation)
Code example #2
0
def supervise(supervision_rules, hard_filters, charite_allowed):
  """Read rows from stdin, supervise them, and emit labeled relations.

  Keeps a running positive/negative count so supervision stays roughly
  balanced (the difference is passed to create_supervised_relation),
  and prints each relation immediately so memory stays flat.
  """
  positives = 0
  negatives = 0
  # Static per-process cache used by downstream supervision code.
  CACHE['example-trees'] = {}
  # Charite gene-pheno pairs are only loaded when explicitly enabled.
  charite_pairs = read_supervision() if charite_allowed else []

  for raw_line in sys.stdin:
    row = parser.parse_tsv_row(raw_line)
    relation = create_supervised_relation(row,
                                          superv_diff=positives - negatives,
                                          SR=supervision_rules,
                                          HF=hard_filters,
                                          charite_pairs=charite_pairs,
                                          charite_allowed=charite_allowed)
    if not relation:
      continue
    # is_correct may be True, False, or None (unlabeled) — only the first
    # two move the balance; every relation is still printed.
    if relation.is_correct == True:
      positives += 1
    elif relation.is_correct == False:
      negatives += 1
    util.print_tsv_output(relation)
Code example #3
0
            # NOTE(review): fragment — the enclosing function/loop header is
            # not visible; mt_root1/match_tree1/mt_root2/match_tree2, row, lc,
            # start_time and match_path_file are presumably bound earlier.
            matching_scores = []
            rescores = []
            # for (mt_root1, match_tree1) in match_trees:
            # Align the two dependency trees; the word sets group lemmas that
            # are treated as interchangeable during matching.
            mda = MultiDepAlignment(mt_root1, match_tree1, mt_root2, match_tree2, 2, \
                                    [set(['disease', 'disorder']), \
                                     set(['mutation', 'variant', 'allele', 'polymorphism', \
                                          'SNP', 'truncation', 'deletion', 'duplication']), \
                                     set(['case', 'patient']), \
                                     set(['identify', 'report', 'find', 'detect']), \
                                     set(['cause', 'associate', 'link', 'lead', 'result']),
                                     set(['mutation', 'inhibition', 'deficiency'])])
            # mda.print_matched_lemmas(match_path_file)
            print >> match_path_file, ' '.join(row.words)
            mda.print_match_tree(match_path_file)
            score1 = mda.overall_score()
            # Rescore with penalties (-50) for weaker "associate/link" phrasing
            # and for matching "mutation" against inhibition/deficiency.
            score2 = mda.rescore([(set(['cause', 'lead',
                                        'result']), set(['associate',
                                                         'link']), -50),
                                  (set(['mutation']),
                                   set(['inhibition', 'deficiency']), -50)])
            r = read_candidate(row)
            matching_scores.append(int(score1))
            rescores.append(int(score1 + score2))
            # end for
            eutil.print_tsv_output(
                r._replace(matching_scores=matching_scores, rescores=rescores))
        end_time = time.time()
        # Report per-line timing to stderr; guard against division by zero
        # when no lines were processed.
        if lc != 0:
            print >> sys.stderr, "Number of lines: %d, time per line: %f seconds" % (
                lc, (end_time - start_time) / (float(lc)))
Code example #4
0
        # NOTE(review): fragment — the enclosing `if`/function is not visible;
        # PMID_TO_HPO is only loaded on this (unseen) branch.
        # DOI_TO_PMID = dutil.read_doi_to_pmid()
        PMID_TO_HPO = dutil.load_pmid_to_hpo()
    # Static lookup tables for phenotype and disease term matching.
    PHENOS, PHENO_SETS = load_pheno_terms()
    DISEASES, DISEASE_SETS = load_disease_terms()

    # Read TSV data in as Row objects
    for line in sys.stdin:
        row = parser.parse_tsv_row(line)

        # Skip row if sentence doesn't contain a verb, contains URL, etc.
        if util.skip_row(row):
            continue

        # find candidate mentions & supervise
        disease_mentions = extract_candidate_mentions(row, DISEASES,
                                                      DISEASE_SETS)
        pheno_mentions = extract_candidate_mentions(row, PHENOS, PHENO_SETS)
        # Drop phenotype mentions whose word-index spans exactly coincide
        # with a disease mention; disease mentions take precedence.
        dwi = [d.wordidxs for d in disease_mentions]
        pheno_mentions_2 = []
        for p in pheno_mentions:
            if p.wordidxs not in dwi:
                pheno_mentions_2.append(p)
        mentions = disease_mentions + pheno_mentions_2

        # Optionally pad with random negative examples (SR is presumably a
        # module-level supervision-rules dict — not visible in this fragment).
        if SR.get('rand-negs'):
            mentions += generate_rand_negatives(row, mentions)

        # print output
        for mention in mentions:
            util.print_tsv_output(mention)
Code example #5
0
    # NOTE(review): fragment — the enclosing function's `def` line is not
    # visible; `row`, `dep_dag` and `HF` come from the surrounding scope.
    rid = '%s_%s' % (row.gene_mention_id, row.pheno_mention_id)
    # Candidate relation joining the row's gene and phenotype mentions.
    r = Relation(None, rid, row.doc_id, row.section_id, row.sent_id, \
          row.gene_mention_id, row.gene_name, \
          row.gene_wordidxs, row.gene_is_correct, \
          row.pheno_mention_id, row.pheno_entity, \
          row.pheno_wordidxs, row.pheno_is_correct)

    # Do not consider overlapping mention pairs
    if len(set(r.gene_wordidxs).intersection(r.pheno_wordidxs)) > 0:
        return []

    # Get the min path length between any of the g / p phrase words
    d = dep_dag.path_len_sets(r.gene_wordidxs, r.pheno_wordidxs)
    if d is not None:
        # Discard pairs that are too far apart in the dependency graph.
        if d > HF['max-dep-path-dist']:
            return []

    return [r]


if __name__ == '__main__':
    # Stream rows from stdin; emit one TSV line per candidate relation.
    for raw_line in sys.stdin:
        parsed = parser.parse_tsv_row(raw_line)
        for rel in extract_candidate_relations(parsed):
            util.print_tsv_output(rel)
Code example #6
0
          # NOTE(review): fragment — the start of this namedtuple definition
          # (its name and earlier fields) is not visible in this view.
          ('cell_xpos', 'int[]'),
          ('cell_xspans', 'int[]'),
          ('cell_ypos', 'int[]'),
          ('cell_yspans', 'int[]')])

# This defines the output Feature object (table id, relation id, feature string)
Feature = collections.namedtuple('Feature', [
            'table_id',
            'relation_id',
            'feature'])

def get_features(row):
  """Generate one Feature tuple per generic table feature for *row*.

  Builds a tablelib Table from the row's flattened cell arrays, wraps the
  gene and pheno cells as CellSpan objects, and stamps each extracted
  feature string into a copy of the base Feature.
  """
  base = Feature(row.table_id, row.relation_id, None)

  # Form a tablelib Table object from the parallel cell arrays.
  table = tablelib.Table(row.cell_ids, row.cell_words, row.cell_types,
                         row.cell_attributes, row.cell_xpos, row.cell_xspans,
                         row.cell_ypos, row.cell_yspans)

  # Form tablelib CellSpan objects using the table + cell ids.
  gene_span = tablelib.CellSpan(table.cells[row.gene_cell_id],
                                row.gene_word_idxs)
  pheno_span = tablelib.CellSpan(table.cells[row.pheno_cell_id],
                                 row.pheno_word_idxs)

  # One output Feature per generic feature emitted by tablelib.
  generic = tablelib.get_features(table, gene_span, pheno_span)
  return [base._replace(feature=feat) for feat in generic]
  
if __name__ == '__main__':
  # Stream rows from stdin and print one TSV line per extracted feature.
  for raw_line in sys.stdin:
    parsed = parser.parse_tsv_row(raw_line)
    for feat in get_features(parsed):
      util.print_tsv_output(feat)
Code example #7
0
  # NOTE(review): fragment — the enclosing function's `def` line is not
  # visible; `r`, `row`, `gp_dict` and `d` come from its signature/scope.
  # Only consider SAME ROW
  if row.gene_cell_ypos != row.pheno_cell_ypos:
    return None

  # Random negative supervision
  if row.gene_is_correct == False or row.pheno_is_correct == False:
    # Subsample ~10% of bad-mention pairs as negatives, but only while the
    # running positive/negative balance `d` is positive.
    if random.random() < 0.1 and d > 0:
      return r._replace(type='RAND_NEG', is_correct=False)
    else:
      return None

  # Charite supervision- basic
  # Entities may be '|'-joined id lists; any (gene, pheno) id pair present
  # in the Charite dictionary marks the relation as a positive.
  for gid in row.gene_entity.split('|'):
    for pid in row.pheno_entity.split('|'):
      if (gid, pid) in gp_dict:
        return r._replace(type='CHARITE_SUP', is_correct=True)
  return r
  
if __name__ == '__main__':
  # Supervise each stdin row, tracking the positive/negative imbalance in
  # `balance` so random-negative sampling stays roughly even.
  GP_DICT = dutil.load_gp_supervision()
  balance = 0
  for raw_line in sys.stdin:
    rel = supervise_relation(parser.parse_tsv_row(raw_line), GP_DICT, balance)
    if rel is None:
      continue
    # is_correct may be True, False, or None; None leaves balance alone.
    if rel.is_correct:
      balance += 1
    elif rel.is_correct == False:
      balance -= 1
    util.print_tsv_output(rel)
Code example #8
0
          # NOTE(review): fragment — this is the tail of a call/loop whose
          # opening is not visible; `m`, `word`, `mentions`, `gene_dict` and
          # `d` are bound earlier in the enclosing function.
          is_correct=None,
          id=None)
    
    # Strip of any leading/trailing non-alphanumeric characters
    # TODO: Do better tokenization early on so this is unnecessary!
    word = re.sub(r'^[^a-z0-9]+|[^a-z0-9]+$', '', word, flags=re.I)

    # Exact matches
    # Require length > 3 to avoid spurious short-symbol matches.
    if len(word) > 3 and word in gene_dict:
      mentions.append(
        m._replace(
          entity='|'.join(list(gene_dict[word])),
          type="EXACT_MATCH",
          is_correct=True))

    # Random negatives
    # ~10% of non-matching words become negatives while the positive/
    # negative balance `d` is positive.
    elif random.random() < 0.1 and d > 0:
      d -= 1
      mentions.append(m._replace(type="RAND_NEG", is_correct=False))
  return mentions

if __name__ == '__main__':
  # Extract candidate gene mentions from stdin rows, keeping a running
  # positive/negative balance `d` that extract_candidate_mentions uses to
  # throttle random-negative sampling.
  gene_dict = dutil.gene_symbol_to_ensembl_id_map(include_lowercase=False, constrain_to=['CANONICAL_SYMBOL'])
  d = 0
  for line in sys.stdin:
    row = parser.parse_tsv_row(line)
    mentions = extract_candidate_mentions(row, gene_dict, d)
    # Was len(filter(lambda ...)): filter() returns an iterator in
    # Python 3, which has no len(). sum() over a generator counts the
    # same truthy/falsy mentions in both Python 2 and 3.
    d += sum(1 for m in mentions if m.is_correct) \
         - sum(1 for m in mentions if not m.is_correct)
    for mention in mentions:
      util.print_tsv_output(mention)
Code example #9
0
      # NOTE(review): fragment — this sits inside a loop (presumably over
      # row.pheno_wordidxs) whose header is not visible; `ners`, `words_ner`,
      # `lemmas_ner` and `m` are bound earlier in the enclosing function.
      pheno_supertype = row.pheno_supertypes[i]
      # Skip phenotype spans labeled random-negative, bad, or outside ('O').
      if re.findall('RAND_NEG', pheno_supertype) or \
          re.findall('BAD', pheno_supertype) or pheno_supertype == 'O':
        continue
      ners[wordidxs[0]] = 'NERPHENO'
      for wordidx in wordidxs:
        words_ner[wordidx] = 'NERPHENO'
        lemmas_ner[wordidx] = 'nerpheno'
    for i, wordidxs in enumerate(row.gene_wordidxs):
      gene_supertype = row.gene_supertypes[i]
      # Skip gene spans whose supervision type marks them as not real genes.
      if gene_supertype == 'BAD_GENE' or gene_supertype == 'MANUAL_BAD' or gene_supertype == 'RAND_WORD_NOT_GENE_SYMBOL' \
          or gene_supertype == 'ABBREVIATION' or gene_supertype == 'ALL_UPPER_NOT_GENE_SYMBOL' or gene_supertype == 'O':
        continue
      ners[wordidxs[0]] = 'NERGENE'
      for wordidx in wordidxs:
        # Phenotype tags win when a word is in both a gene and pheno span.
        if words_ner[wordidx] != 'NERPHENO':
          words_ner[wordidx] = 'NERGENE'
          lemmas_ner[wordidx] = 'nergene'
    # '|^|' is the array separator expected by the downstream TSV format.
    return m._replace(ners='|^|'.join(ners), words_ner='|^|'.join(words_ner), 
                      lemmas_ner='|^|'.join(lemmas_ner))

if __name__ == '__main__':
  # Stream rows from stdin, add NER tags, and print each row immediately
  # so memory stays flat.
  # Removed unused pos_count/neg_count counters and their stale comment —
  # they were copy-paste residue from a supervision-balancing loop and
  # were never read or updated here.
  for line in sys.stdin:
    row = parser.parse_tsv_row(line)
    util.print_tsv_output(create_ners(row))