Example #1
    def _decode(mention_json):
        """
        Decode a json string of a sentence.
        e.g.,  {"senid":40,
                "mentions":[{"start":0,"end":2,"labels":["/person"]},
                            {"start":6,"end":8,"labels":["/location/city","/location"]}],
                "tokens":["Raymond","Jung",",","51",",","of","Federal","Way",";",
                         "accused","of","leasing","apartments","where","the","women",
                         "were","housed","."],
                "fileid":""}
        :param mention_json: string
        :return: a sentence instance with all mentions appearing in this sentence
        """
        if mention_json == '':
            return None
        decoded = json.loads(mention_json)
        sentence = Sentence(decoded['fileid'], decoded['senid'], decoded['tokens'])
        for m in decoded['mentions']:
            sentence.add_mention(Mention(int(m['start']), int(m['end']), m['labels'],
                                         " ".join(decoded['tokens'][m['start']:m['end']])))
        if 'pos' in decoded:
            sentence.pos = decoded['pos']
        if 'dep' in decoded:
            for dep in decoded['dep']:
                sentence.dep.append((dep['type'], dep['gov'], dep['dep']))
        return sentence
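A minimal usage sketch, assuming _decode is reachable as a module-level helper (or staticmethod) and that the input file holds one JSON-encoded sentence per line; attribute names on the returned objects are assumptions, not confirmed by the example:

def read_sentences(path):
    # Hypothetical driver: yield one decoded Sentence per non-empty line.
    with open(path) as fin:
        for line in fin:
            sentence = _decode(line.strip())
            if sentence is not None:
                yield sentence

for sent in read_sentences('sentences.json'):
    # 'tokens' and 'mentions' attribute names are assumed from the code above.
    print('%s\t%d' % (' '.join(sent.tokens), len(sent.mentions)))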
Example #2
def clean_ner_result(result_file):
    ord_mention_list = list()
    med_mention_list = list()

    fin = open(result_file, 'rb')
    for line in fin:
        line = line.strip()
        if len(line) == 0:
            continue

        vals = line.strip().split('\t')
        # TODO
        if vals[3] == 'Disease' or vals[3] == 'Chemical':
            span = (int(vals[0]), int(vals[1]) - 1)
        else:
            span = (int(vals[0]), int(vals[1]))
        mention = Mention()
        mention.span = span
        mention.mtype = vals[3]
        if len(vals) == 4:
            ord_mention_list.append(mention)
        else:
            if vals[4].startswith('MESH'):
                mention.mesh_id = vals[4][5:]
            elif vals[4].startswith('CHEBI'):
                mention.chebi_id = int(vals[4][6:])
            med_mention_list.append(mention)
    fin.close()

    merged_mention_list = list()
    Mention.merge_mention_list(med_mention_list, merged_mention_list)
    Mention.merge_mention_list(ord_mention_list, merged_mention_list)

    return merged_mention_list
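The parser above implies a tab-separated layout per line, roughly start / end / surface / type, with a fifth MESH: or CHEBI: column present only for normalized medical mentions; the sample below (column meanings and identifiers are illustrative guesses) writes a tiny file in that shape:

sample_lines = [
    '0\t7\tAspirin\tChemical\tMESH:D001241',    # Chemical/Disease spans use end - 1
    '23\t32\theadaches\tDisease\tMESH:D006261',
    '40\t46\tTurner\tPER',                      # no id column -> ordinary mention
]
with open('ner_result.tsv', 'w') as fout:
    fout.write('\n'.join(sample_lines) + '\n')
# clean_ner_result('ner_result.tsv') would then build, merge and return the mentions.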
Example #3
    def collect_mentions(self):
        mention_ids = defaultdict(list)

        def get_start_ids(cr):
            return [
                int(x.replace(')', '').replace('(', '')) for x in cr.split('|')
                if x.startswith('(')
            ]

        def get_end_ids(cr):
            return [
                int(x.replace(')', '').replace('(', '')) for x in cr.split('|')
                if x.endswith(')')
            ]

        starts = [(i, t) for (i, t) in enumerate(self.tokens)
                  if t.coref.find('(') > -1]
        starts.reverse()
        ends = [(i, t) for (i, t) in enumerate(self.tokens)
                if t.coref.find(')') > -1]

        for s in starts:
            ids = get_start_ids(s[1].coref)
            for i in ids:
                mention_ids[i].append(s)

        for e in ends:
            ids = get_end_ids(e[1].coref)
            for i in ids:
                s = mention_ids[i].pop()
                self.mentions.append(
                    Mention(self.tokens[s[0]:e[0] + 1], self.sentenceID,
                            (s[0], e[0]), i))
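As a standalone illustration of the two helpers above, a CoNLL-style coref cell such as '(12|(5)' marks chains 12 and 5 opening at that token and chain 5 also closing there:

def get_start_ids(cr):
    # ids whose mention opens at this token
    return [int(x.replace(')', '').replace('(', '')) for x in cr.split('|')
            if x.startswith('(')]

def get_end_ids(cr):
    # ids whose mention closes at this token
    return [int(x.replace(')', '').replace('(', '')) for x in cr.split('|')
            if x.endswith(')')]

print(get_start_ids('(12|(5)'))  # [12, 5]
print(get_end_ids('(12|(5)'))    # [5]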
Example #4
def link():
    print 'beg init'
    med_link = init_model()
    curtext = '“That\'s a growth rate of 6,000 times over three years,” touts Turner.'
    m = Mention(span=(0, 4), mtype='PER')
    mentions = [m]
    lr = med_link.link_mentions(mentions, curtext)
    print __mentions_to_dict_list(lr)
Example #5
 def end(self, tag):
     self.tag = ''
     if tag == 'sentences':
         if self.parse_sent:
             self.parse_sent = False
     elif tag == 'sentence':
         if self.parse_sent:
             if self.sent is not None:
                 self.sents.append(deepcopy(self.sent))
                 self.sent = None
     elif tag == 'token':
         # map CoreNLP NER tags to coarse-grained NER tags
         token = Token(self.word,
                       self.lemma,
                       self.pos,
                       ner=convert_corenlp_ner_tag(self.ner))
         self.sent.add_token(deepcopy(token))
         self.word = ''
         self.lemma = ''
         self.pos = ''
         self.ner = ''
     elif tag == 'dependencies':
         if self.parse_dep:
             self.parse_dep = False
     elif tag == 'dep':
         if self.parse_dep:
             if not self.copied_dep:
                 if self.dep_label != 'root':
                     dep = Dependency(self.dep_label, self.gov_idx,
                                      self.dep_idx, self.extra)
                     self.sent.add_dep(deepcopy(dep))
             else:
                 self.copied_dep = False
             self.dep_label = ''
             self.gov_idx = -1
             self.dep_idx = -1
             self.extra = False
     elif tag == 'coreference':
         if self.parse_coref:
             if self.coref is not None:
                 self.corefs.append(deepcopy(self.coref))
                 self.coref = None
             else:
                 self.parse_coref = False
     elif tag == 'mention':
         mention = Mention(self.sent_idx,
                           self.start_token_idx,
                           self.end_token_idx,
                           head_token_idx=self.head_token_idx,
                           rep=self.rep,
                           text=self.text.encode('ascii', 'ignore'))
         self.coref.add_mention(deepcopy(mention))
         self.sent_idx = -1
         self.start_token_idx = -1
         self.end_token_idx = -1
         self.head_token_idx = -1
         self.rep = False
         self.text = ''
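The start/end/data handler methods in this example match the shape of lxml's target-parser protocol; the project's actual handler class is not shown, so the sketch below uses a stand-in target purely to illustrate how such a handler gets driven:

from lxml import etree

class EndTagLogger(object):
    # Stand-in target: lxml calls start/end/data during parsing and
    # returns close()'s value from etree.fromstring/etree.parse.
    def start(self, tag, attrib):
        pass
    def data(self, text):
        pass
    def end(self, tag):
        print('closed <%s>' % tag)
    def close(self):
        return 'done'

parser = etree.XMLParser(target=EndTagLogger())
print(etree.fromstring('<sentences><sentence/></sentences>', parser))
# prints: closed <sentence>, closed <sentences>, then 'done'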
Example #6
def get_cand_mentions(corpus, limit=5, check=False):
    """
    :param corpus: 1D: n_doc, 2D: n_sents, 3D: n_words; elem=(doc_id, part_id, word, tag, syn, ne, coref_id)
    :return: cand: 1D: n_doc, 2D: n_sents, 3D: n_mentions; elem=Mention
    """
    cand_ments = []
    count = 0.
    max_span_len = -1
    total_span_len = 0.

    for doc_i, doc in enumerate(corpus):
        doc_ments = []

        for sent_i, sent in enumerate(doc):
            mention_spans = []
            """ Extracting NP, Pro-Nom, NE mentions """
            mention_spans.extend(get_np(sent))
            mention_spans.extend(get_pronominals(sent))
            mention_spans.extend(get_ne(sent))
            """ Removing duplicates, and sorting """
            mention_spans = list(set(mention_spans))
            mention_spans.sort()

            tmp_ments = []
            for span in mention_spans:
                span_len = span[1] - span[0] + 1

                if span_len <= limit:
                    tmp_ments.append(Mention(doc_i, sent_i, span))

                    if span_len > max_span_len:
                        max_span_len = span_len
                    total_span_len += span_len

            doc_ments.append(tmp_ments)
            count += len(tmp_ments)

        cand_ments.append(doc_ments)

    print 'Cand Mentions: %d  Max Span Length: %d  Avg. Span Length: %f' % (
        count, max_span_len, total_span_len / count)

    if check:
        with open('cand_mentions.txt', 'w') as f:
            for doc, doc_ments in zip(corpus, cand_ments):
                for sent, sent_ments in zip(doc, doc_ments):
                    for ment in sent_ments:
                        print >> f, '%s' % str(ment.span)
                    print >> f

                    for sent_i, w in enumerate(sent):
                        print >> f, '%d\t%s\t%s' % (sent_i,
                                                    w[2].encode('utf-8'),
                                                    w[-1].encode('utf-8'))
                    print >> f

    return cand_ments
Example #7
 def __find_mesh_mentions(self, text):
     mesh_spans, mesh_ids = self.mesh_match.find_all_terms(text)
     mention_list = list()
     for mesh_span, mesh_id in izip(mesh_spans, mesh_ids):
         mention = Mention()
         mention.span = mesh_span
         mention.mtype = 'MISC'
         mention.mesh_id = mesh_id
         mention_list.append(mention)
     return mention_list
Example #8
def get_gold_ments(doc_i, sent_i, sent):
    """
    :param sent: 1D: n_words; elem=(doc_id, part_id, word, tag, syn, ne, coref)
    :return: ments: 1D: n_mentions: elem=Mention
    """

    ments = []
    prev = []

    for i, w in enumerate(sent):
        mentions = w[6].split('|')

        for mention in mentions:
            if mention.startswith('('):
                if mention.endswith(')'):
                    span = (i, i)
                    coref_id = int(mention[1:-1])
                    ments.append(Mention(doc_i, sent_i, span, coref_id))
                else:
                    coref_id = int(mention[1:])
                    prev.append(((i, i), coref_id))
            else:
                if mention.endswith(')'):
                    coref_id = int(mention[:-1])

                    for j, p in enumerate(prev):
                        if coref_id == p[1]:
                            span = (p[0][0], i)
                            ments.append(Mention(doc_i, sent_i, span,
                                                 coref_id))
                            prev.pop(j)
                            break
                    else:
                        print 'Error at extract_mentions(): %s' % str(sent)
                        exit()

    assert len(prev) == 0
    return ments
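For reference, the coref column (w[6]) follows the CoNLL-2012 bracket convention; with the toy rows below (the other columns are placeholders), get_gold_ments(0, 0, toy_sent) would return a single Mention spanning tokens 0-1 in chain 7:

# (doc_id, part_id, word, tag, syn, ne, coref)
toy_sent = [
    ('doc0', 0, 'Raymond', 'NNP', '*', 'PERSON', '(7'),
    ('doc0', 0, 'Jung',    'NNP', '*', 'PERSON', '7)'),
    ('doc0', 0, 'smiled',  'VBD', '*', '*',      '-'),
]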
Example #9
def edl_api():
    doc_text = ''
    if 'text' in request.values:
        doc_text = request.values['text']
        # print doc_text
        # print type(doc_text)
    else:
        abort(400)

    json_result = '[]'
    try:
        mentions_list = list()
        mentions_dict = mention_extraction_web(doc_text)
        for result_type, mentions in mentions_dict.items():
            entity_type = 'MISC'
            if result_type == 'results_Disease':
                entity_type = 'Disease'
            elif result_type == 'results_Chemical':
                entity_type = 'Chemical'

            for dict_mention in mentions:
                beg_pos = dict_mention['startChar']
                end_pos = dict_mention['endChar']
                meshid = None
                specified_type = dict_mention.get('label', None)
                if specified_type:
                    entity_type = specified_type
                # print dict_mention
                # print beg_pos, end_pos, entity_type, meshid
                m = Mention(span=(beg_pos, end_pos),
                            mtype=entity_type,
                            mesh_id=meshid)
                mentions_list.append(m)
        # linked_mentions = med_link.link_mentions(mentions_list, doc_text.decode('utf-8'))
        linked_mentions = med_link.link_mentions(mentions_list, doc_text)
        json_result = json.dumps(__mentions_to_dict_list(linked_mentions))
    except:
        print 'except'
    print json_result + '\n'
    return json_result
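Assuming edl_api() is registered as a Flask route (the path and port below are placeholders, not taken from the example), a client call might look like:

import requests

resp = requests.post('http://localhost:5000/edl',
                     data={'text': 'Aspirin is used to treat headaches.'})
print(resp.text)  # JSON list of linked mentions, or '[]' if linking failed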
Example #10
 def produce_mention(self, serif_doc, serif_mention):
     mention = Mention(serif_mention.entity_type,
                       serif_mention.mention_type.value,
                       serif_mention.text,
                       serif_mention.head.text,
                       serif_doc.docid,
                       serif_mention.syn_node.start_char,
                       serif_mention.syn_node.end_char,
                       serif_mention.head.start_char,
                       serif_mention.head.end_char,
                       serif_mention.sent_no)
     return mention