def _decode(mention_json):
    """
    Decode a json string of a sentence. e.g.,
    {"senid":40,
     "mentions":[{"start":0,"end":2,"labels":["/person"]},
                 {"start":6,"end":8,"labels":["/location/city","/location"]}],
     "tokens":["Raymond","Jung",",","51",",","of","Federal","Way",";",
               "accused","of","leasing","apartments","where","the","women",
               "were","housed","."],
     "fileid":""}
    :param mention_json: string
    :return: a sentence instance with all mentions appearing in this sentence
    """
    if mention_json == '':
        return None

    data = json.loads(mention_json)
    tokens = data['tokens']
    sentence = Sentence(data['fileid'], data['senid'], tokens)

    # Attach every annotated mention, rebuilding its surface text
    # from the token span (slice uses the raw start/end values).
    for m in data['mentions']:
        surface = " ".join(tokens[m['start']:m['end']])
        sentence.add_mention(
            Mention(int(m['start']), int(m['end']), m['labels'], surface))

    # POS tags and dependency arcs are optional in the payload.
    if 'pos' in data:
        sentence.pos = data['pos']
    for dep in data.get('dep', []):
        sentence.dep.append((dep['type'], dep['gov'], dep['dep']))

    return sentence
def clean_ner_result(result_file):
    """
    Parse a tab-separated NER result file and merge the mentions.

    Each non-empty line is: start<TAB>end<TAB>text<TAB>type[<TAB>concept_id]
    where concept_id, when present, is a 'MESH:...' or 'CHEBI:...' reference.

    :param result_file: path to the NER result file
    :return: list of Mention objects, medical mentions merged before ordinary
    """
    ord_mention_list = list()
    med_mention_list = list()
    # 'with' guarantees the handle is closed even when a line fails to
    # parse; the original leaked the file on the exception path.
    with open(result_file, 'rb') as fin:
        for line in fin:
            line = line.strip()
            if len(line) == 0:
                continue
            vals = line.split('\t')
            # TODO (from original): Disease/Chemical spans are shifted by
            # one at the end position — presumably end-exclusive input
            # being normalized to inclusive; confirm against the producer.
            if vals[3] == 'Disease' or vals[3] == 'Chemical':
                span = (int(vals[0]), int(vals[1]) - 1)
            else:
                span = (int(vals[0]), int(vals[1]))
            mention = Mention()
            mention.span = span
            mention.mtype = vals[3]
            if len(vals) == 4:
                ord_mention_list.append(mention)
            else:
                # Optional 5th column carries a normalized concept id.
                if vals[4].startswith('MESH'):
                    mention.mesh_id = vals[4][5:]
                elif vals[4].startswith('CHEBI'):
                    mention.chebi_id = int(vals[4][6:])
                med_mention_list.append(mention)

    merged_mention_list = list()
    Mention.merge_mention_list(med_mention_list, merged_mention_list)
    Mention.merge_mention_list(ord_mention_list, merged_mention_list)
    return merged_mention_list
def collect_mentions(self):
    """
    Build Mention objects from the bracketed coref column of self.tokens.

    An opening '(ID' marks a mention start and 'ID)' its end; each id keeps
    its own stack of pending starts so mentions with the same id can nest.
    Appends the resulting Mention objects to self.mentions.
    """
    pending_starts = defaultdict(list)

    def bracket_ids(coref, predicate):
        # ids appear as '(12', '12)' or '(12)' separated by '|'
        return [int(chunk.replace(')', '').replace('(', ''))
                for chunk in coref.split('|')
                if predicate(chunk)]

    openers = [(idx, tok) for idx, tok in enumerate(self.tokens)
               if tok.coref.find('(') > -1]
    closers = [(idx, tok) for idx, tok in enumerate(self.tokens)
               if tok.coref.find(')') > -1]

    # Push starts in reverse token order, so pop() later yields the
    # earliest still-open start for a given id.
    for opener in reversed(openers):
        for cid in bracket_ids(opener[1].coref, lambda c: c.startswith('(')):
            pending_starts[cid].append(opener)

    for closer in closers:
        for cid in bracket_ids(closer[1].coref, lambda c: c.endswith(')')):
            opener = pending_starts[cid].pop()
            self.mentions.append(
                Mention(self.tokens[opener[0]:closer[0] + 1],
                        self.sentenceID,
                        (opener[0], closer[0]),
                        cid))
def link():
    """Smoke-test the medical linker on one hard-coded mention."""
    print('beg init')
    med_link = init_model()
    curtext = '“That\'s a growth rate of 6,000 times over three years,” touts Turner.'
    # Single PER mention covering the first five characters of the text.
    mentions = [Mention(span=(0, 4), mtype='PER')]
    lr = med_link.link_mentions(mentions, curtext)
    print(__mentions_to_dict_list(lr))
def end(self, tag):
    """
    Close-tag handler for streaming CoreNLP XML output.

    Flushes accumulated scratch state (token fields, dependency fields,
    mention fields) into the sentence/coref chain being built, then resets
    the scratch fields for the next element of the same kind.
    """
    self.tag = ''
    if tag == 'sentences':
        if self.parse_sent:
            self.parse_sent = False
    elif tag == 'sentence':
        if self.parse_sent:
            if self.sent is not None:
                # Deep-copy so later mutation of self.sent cannot alias.
                self.sents.append(deepcopy(self.sent))
                self.sent = None
    elif tag == 'token':
        # Map CoreNLP NER tags to coarse-grained NER tags.
        token = Token(self.word, self.lemma, self.pos,
                      ner=convert_corenlp_ner_tag(self.ner))
        self.sent.add_token(deepcopy(token))
        # Reset per-token scratch fields.
        self.word = ''
        self.lemma = ''
        self.pos = ''
        self.ner = ''
    elif tag == 'dependencies':
        if self.parse_dep:
            self.parse_dep = False
    elif tag == 'dep':
        if self.parse_dep:
            if not self.copied_dep:
                # Skip the artificial 'root' relation.
                if self.dep_label != 'root':
                    dep = Dependency(self.dep_label, self.gov_idx,
                                     self.dep_idx, self.extra)
                    self.sent.add_dep(deepcopy(dep))
            else:
                self.copied_dep = False
            # Reset per-dependency scratch fields.
            self.dep_label = ''
            self.gov_idx = -1
            self.dep_idx = -1
            self.extra = False
    elif tag == 'coreference':
        # NOTE(review): CoreNLP nests <coreference> elements; assumed here
        # that a close with self.coref set ends one chain, while a close
        # with self.coref unset ends the outer container — confirm.
        if self.parse_coref:
            if self.coref is not None:
                self.corefs.append(deepcopy(self.coref))
                self.coref = None
            else:
                self.parse_coref = False
    elif tag == 'mention':
        mention = Mention(self.sent_idx, self.start_token_idx,
                          self.end_token_idx,
                          head_token_idx=self.head_token_idx,
                          rep=self.rep,
                          text=self.text.encode('ascii', 'ignore'))
        self.coref.add_mention(deepcopy(mention))
        # Reset per-mention scratch fields.
        self.sent_idx = -1
        self.start_token_idx = -1
        self.end_token_idx = -1
        self.head_token_idx = -1
        self.rep = False
        self.text = ''
def get_cand_mentions(corpus, limit=5, check=False):
    """
    Extract candidate mentions (NPs, pronominals, NEs) from every sentence.

    :param corpus: 1D: n_doc, 2D: n_sents, 3D: n_words;
                   elem=(doc_id, part_id, word, tag, syn, ne, coref_id)
    :param limit: maximum candidate span length, in tokens
    :param check: if True, dump extracted spans to 'cand_mentions.txt'
    :return: cand: 1D: n_doc, 2D: n_sents, 3D: n_mentions; elem=Mention
    """
    cand_ments = []
    count = 0.
    max_span_len = -1
    total_span_len = 0.

    for doc_i, doc in enumerate(corpus):
        doc_ments = []
        for sent_i, sent in enumerate(doc):
            """ Extracting NP, Pro-Nom, NE mentions """
            mention_spans = []
            mention_spans.extend(get_np(sent))
            mention_spans.extend(get_pronominals(sent))
            mention_spans.extend(get_ne(sent))

            """ Removing duplicates, and sorting """
            mention_spans = sorted(set(mention_spans))

            tmp_ments = []
            for span in mention_spans:
                span_len = span[1] - span[0] + 1
                # Drop overly long spans; track max/total for the report.
                if span_len <= limit:
                    tmp_ments.append(Mention(doc_i, sent_i, span))
                    if span_len > max_span_len:
                        max_span_len = span_len
                    total_span_len += span_len

            doc_ments.append(tmp_ments)
            count += len(tmp_ments)
        cand_ments.append(doc_ments)

    # Guard against ZeroDivisionError when no candidate was extracted
    # (the original divided unconditionally).
    avg_span_len = total_span_len / count if count else 0.
    print('Cand Mentions: %d Max Span Length: %d Avg. Span Length: %f' % (
        count, max_span_len, avg_span_len))

    if check:
        with open('cand_mentions.txt', 'w') as f:
            for doc, doc_ments in zip(corpus, cand_ments):
                for sent, sent_ments in zip(doc, doc_ments):
                    for ment in sent_ments:
                        f.write('%s\n' % str(ment.span))
                    f.write('\n')
                    # tok_i (not sent_i) to avoid shadowing the outer index.
                    for tok_i, w in enumerate(sent):
                        f.write('%d\t%s\t%s\n' % (
                            tok_i, w[2].encode('utf-8'), w[-1].encode('utf-8')))
                    f.write('\n')

    return cand_ments
def __find_mesh_mentions(self, text):
    """Return a Mention for every MeSH dictionary term matched in *text*."""
    spans, ids = self.mesh_match.find_all_terms(text)
    mentions = list()
    for term_span, term_id in izip(spans, ids):
        m = Mention()
        m.span = term_span
        m.mtype = 'MISC'  # dictionary hits carry no specific entity type
        m.mesh_id = term_id
        mentions.append(m)
    return mentions
def get_gold_ments(doc_i, sent_i, sent):
    """
    Extract gold mentions from the bracketed coref column of one sentence.

    :param doc_i: document index
    :param sent_i: sentence index
    :param sent: 1D: n_words; elem=(doc_id, part_id, word, tag, syn, ne, coref)
    :return: ments: 1D: n_mentions: elem=Mention
    """
    ments = []
    # Open (not yet closed) mentions: elem=((start, start), coref_id)
    prev = []
    for i, w in enumerate(sent):
        mentions = w[6].split('|')
        for mention in mentions:
            if mention.startswith('('):
                if mention.endswith(')'):
                    # '(id)': single-token mention, closed immediately.
                    span = (i, i)
                    coref_id = int(mention[1:-1])
                    ments.append(Mention(doc_i, sent_i, span, coref_id))
                else:
                    # '(id': mention opens here; remember until it closes.
                    coref_id = int(mention[1:])
                    prev.append(((i, i), coref_id))
            else:
                if mention.endswith(')'):
                    # 'id)': close the matching open mention.
                    coref_id = int(mention[:-1])
                    for j, p in enumerate(prev):
                        if coref_id == p[1]:
                            span = (p[0][0], i)
                            ments.append(Mention(doc_i, sent_i, span, coref_id))
                            prev.pop(j)
                            break
                    else:
                        # NOTE(review): assumed for-else from the flattened
                        # source — fires only when no open bracket matches
                        # this closing id; confirm intended indentation.
                        print 'Error at extract_mentions(): %s' % str(sent)
                        exit()
    # Every opened mention must have been closed within the sentence.
    assert len(prev) == 0
    return ments
def edl_api():
    """
    Flask endpoint: extract and link biomedical entity mentions.

    Expects a 'text' value in the request; responds 400 when it is missing.
    Returns a JSON list of linked mentions, or '[]' on any processing error
    (best-effort: internal failures are logged, never propagated).
    """
    doc_text = ''
    if 'text' in request.values:
        doc_text = request.values['text']
    else:
        abort(400)

    json_result = '[]'
    try:
        mentions_list = list()
        mentions_dict = mention_extraction_web(doc_text)
        for result_type, mentions in mentions_dict.items():
            # Default entity type derived from the result bucket name.
            default_type = 'MISC'
            if result_type == 'results_Disease':
                default_type = 'Disease'
            elif result_type == 'results_Chemical':
                default_type = 'Chemical'
            for dict_mention in mentions:
                beg_pos = dict_mention['startChar']
                end_pos = dict_mention['endChar']
                meshid = None
                # Per-mention 'label' overrides the bucket default; the
                # original let one mention's label leak into subsequent
                # unlabeled mentions of the same bucket.
                specified_type = dict_mention.get('label', None)
                entity_type = specified_type if specified_type else default_type
                m = Mention(span=(beg_pos, end_pos), mtype=entity_type,
                            mesh_id=meshid)
                mentions_list.append(m)
        linked_mentions = med_link.link_mentions(mentions_list, doc_text)
        json_result = json.dumps(__mentions_to_dict_list(linked_mentions))
    except Exception:
        # Best-effort fallback to '[]'; narrowed from a bare 'except:'
        # (which also swallowed SystemExit/KeyboardInterrupt) and log
        # the traceback instead of discarding it.
        import traceback
        traceback.print_exc()
        print('except')
    print(json_result + '\n')
    return json_result
def produce_mention(self, serif_doc, serif_mention):
    """Adapt a Serif mention into the pipeline's Mention representation."""
    head = serif_mention.head
    syn_node = serif_mention.syn_node
    return Mention(serif_mention.entity_type,
                   serif_mention.mention_type.value,
                   serif_mention.text,
                   head.text,
                   serif_doc.docid,
                   syn_node.start_char,
                   syn_node.end_char,
                   head.start_char,
                   head.end_char,
                   serif_mention.sent_no)