import math
import logging

import pandas as pd
from tqdm import tqdm

import my_utils    # project-local helpers (my_tokenize, featureCapital, ...)
import my_utils1   # project-local helpers (normalizeWord, ...)
import utils.functions
from utils.functions import normalize_word

# relationConstraint_chapman and getEntitiesBetween are assumed to be defined
# elsewhere in this module.


def build_re_feature_alphabets(self, tokens, entities, relations):
    """Populate the relation-extraction feature alphabets (entity type, entity word,
    relation type, token/entity counts and relative positions) from the training
    documents. Intended as a method of the Data class; `tokens`, `entities` and
    `relations` are parallel lists of per-document pandas DataFrames."""
    entity_type_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_TYPE]']]
    entity_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY]']]
    relation_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[RELATION]']]
    token_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[TOKEN_NUM]']]
    entity_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_NUM]']]
    position_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[POSITION]']]

    for i, doc_token in enumerate(tokens):
        doc_entity = entities[i]
        doc_relation = relations[i]

        # walk the document sentence by sentence
        sent_idx = 0
        sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]
        while sentence.shape[0] != 0:
            entities_in_sentence = doc_entity[(doc_entity['sent_idx'] == sent_idx)]
            for _, entity in entities_in_sentence.iterrows():
                entity_type_alphabet.add(entity['type'])
                tk_idx = entity['tf_start']
                while tk_idx <= entity['tf_end']:
                    # assumes the 'text' column is at position 0
                    entity_alphabet.add(my_utils1.normalizeWord(sentence.iloc[tk_idx, 0]))
                    tk_idx += 1
            sent_idx += 1
            sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

        for _, relation in doc_relation.iterrows():
            relation_alphabet.add(relation['type'])

    # counts and relative positions are bounded by the maximum sequence length
    for i in range(self.max_seq_len):
        token_num_alphabet.add(i)
        entity_num_alphabet.add(i)
        position_alphabet.add(i)
        position_alphabet.add(-i)

    for idx in range(self.re_feature_num):
        self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[idx].size()
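# Expected input layout (inferred from the column accesses in this file): `tokens`,
# `entities` and `relations` are parallel lists holding one pandas DataFrame per
# document. Token frames carry 'text' (at column 0), 'sent_idx' and 'postag';
# entity frames carry 'type', 'sent_idx', token-level offsets 'tf_start'/'tf_end'
# and character offsets 'start'/'end'; relation frames carry 'type' plus the
# entity id/text columns used by the instance builders below.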
def getRelationInstanceForOneDoc(doc_token, entities, doc_name, data):
    """Build relation candidates for a single document at prediction time.

    `entities` is a list of entity objects (attribute access: .sent_idx, .type,
    .tf_start, ...). Returns (X, other): X is a list of feature dicts, one per
    candidate pair that passes the sentence-window and type constraints, and
    `other` holds the corresponding (former, latter) entity pairs."""
    X = []
    other = []

    row_num = len(entities)

    for latter_idx in range(row_num):
        for former_idx in range(row_num):
            if former_idx < latter_idx:
                former = entities[former_idx]
                latter = entities[latter_idx]

                if math.fabs(latter.sent_idx - former.sent_idx) >= data.sent_window:
                    continue

                # for double annotation, we don't generate instances
                if former.start == latter.start and former.end == latter.end:
                    continue

                # type_constraint = relationConstraint(former['type'], latter['type'])
                type_constraint = relationConstraint_chapman(former.type, latter.type)
                if type_constraint == 0:
                    continue

                # retrieve all the sentences spanning the two entities: the sentence of
                # former, any sentences in between, and the sentence of latter
                sent_idx = former.sent_idx
                context_token = pd.DataFrame(columns=doc_token.columns)
                base = 0
                former_tf_start, former_tf_end = -1, -1
                latter_tf_start, latter_tf_end = -1, -1
                while sent_idx <= latter.sent_idx:
                    sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

                    if former.sent_idx == sent_idx:
                        former_tf_start, former_tf_end = base + former.tf_start, base + former.tf_end
                    if latter.sent_idx == sent_idx:
                        latter_tf_start, latter_tf_end = base + latter.tf_start, base + latter.tf_end

                    # pd.concat replaces the removed DataFrame.append
                    context_token = pd.concat([context_token, sentence], ignore_index=True)
                    base += sentence.shape[0]  # number of tokens in this sentence
                    sent_idx += 1

                if context_token.shape[0] > data.max_seq_len:
                    # truncate
                    logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                    context_token = context_token.iloc[:data.max_seq_len]

                words = []
                postags = []
                cap = []
                chars = []
                positions1 = []
                positions2 = []
                former_token = []
                latter_token = []
                i = 0
                for _, token in context_token.iterrows():
                    if data.number_normalized:
                        word = utils.functions.normalize_word(token['text'])
                    else:
                        word = token['text']
                    entity_word = my_utils1.normalizeWord(token['text'])
                    words.append(data.word_alphabet.get_index(word))
                    postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                    cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(
                        str(my_utils.featureCapital(token['text']))))
                    char_for1word = []
                    for char in word:
                        char_for1word.append(data.char_alphabet.get_index(char))
                    chars.append(char_for1word)

                    # signed distance to the former entity span; 0 inside the span
                    if i < former_tf_start:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            former_tf_start - i))
                    elif i > former_tf_end:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            former_tf_end - i))
                    else:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                        former_token.append(
                            data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                    # signed distance to the latter entity span; 0 inside the span
                    if i < latter_tf_start:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            latter_tf_start - i))
                    elif i > latter_tf_end:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            latter_tf_end - i))
                    else:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                        latter_token.append(
                            data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                    i += 1

                if len(former_token) == 0:
                    # the truncated context no longer contains the entity, so fall back to the entity text itself
                    # splitted = re.split(r"\s+| +|[\(\)\[\]\-_,]+", former['text'])
                    splitted = my_utils.my_tokenize(former.text)
                    for s in splitted:
                        s = s.strip()
                        if s != "":
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(
                                my_utils1.normalizeWord(s)))
                if len(latter_token) == 0:
                    # splitted = re.split(r"\s+| +|[\(\)\[\]\-_,]+", latter['text'])
                    splitted = my_utils.my_tokenize(latter.text)
                    for s in splitted:
                        s = s.strip()
                        if s != "":
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(
                                my_utils1.normalizeWord(s)))

                assert len(former_token) > 0
                assert len(latter_token) > 0

                features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars,
                            'positions1': positions1, 'positions2': positions2}
                if type_constraint == 1:
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former.type)
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter.type)
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token
                else:
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter.type)
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former.type)
                    features['e1_token'] = latter_token
                    features['e2_token'] = former_token

                features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(
                    latter.tf_start - former.tf_end)

                entity_between = getEntitiesBetween(former, latter, entities)
                features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(len(entity_between))

                X.append(features)

                other.append((former, latter))

    return X, other
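# Illustrative sketch (not called by the pipeline): the relative-position feature used
# above maps every context token to a signed distance from an entity span
# [tf_start, tf_end] -- positive before the span, negative after it, 0 inside it.
# The real code feeds these distances through the [POSITION] alphabet; this hypothetical
# helper just returns the raw offsets to make the scheme explicit.
def _relative_position_sketch(seq_len, tf_start, tf_end):
    offsets = []
    for i in range(seq_len):
        if i < tf_start:
            offsets.append(tf_start - i)   # tokens before the entity: positive distance
        elif i > tf_end:
            offsets.append(tf_end - i)     # tokens after the entity: negative distance
        else:
            offsets.append(0)              # tokens inside the entity span
    return offsets

# _relative_position_sketch(6, 2, 3) -> [2, 1, 0, 0, -1, -2]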
def getRelationInstance2(tokens, entities, relations, names, data):
    """Build training instances using a sentence window around each candidate entity pair.

    Candidate pairs are filtered by data.sent_window and relationConstraint_chapman; gold
    labels are looked up by entity id. Returns (X, Y), where Y holds relation-alphabet
    indices ('</unk>' for negative pairs)."""
    X = []
    Y = []
    cnt_neg = 0

    for doc_idx in tqdm(range(len(relations))):
        doc_relation = relations[doc_idx]
        doc_token = tokens[doc_idx]
        doc_entity = entities[doc_idx]  # entities are sorted by start offset
        doc_name = names[doc_idx]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):
            for former_idx in range(row_num):
                if former_idx < latter_idx:
                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]

                    if math.fabs(latter['sent_idx'] - former['sent_idx']) >= data.sent_window:
                        continue

                    # for double annotation, we don't generate instances
                    if former['start'] == latter['start'] and former['end'] == latter['end']:
                        continue

                    # type_constraint = relationConstraint(former['type'], latter['type'])
                    type_constraint = relationConstraint_chapman(former['type'], latter['type'])
                    if type_constraint == 0:
                        continue

                    gold_relations = doc_relation[
                        (
                            ((doc_relation['entity1_id'] == former['id']) & (doc_relation['entity2_id'] == latter['id']))
                            | ((doc_relation['entity1_id'] == latter['id']) & (doc_relation['entity2_id'] == former['id']))
                        )
                    ]
                    if gold_relations.shape[0] > 1:
                        # raise RuntimeError("the same entity pair has more than one relation")
                        logging.debug("entities {} and {} have more than one relation".format(former['id'], latter['id']))
                        continue

                    # retrieve all the sentences spanning the two entities: the sentence of
                    # former, any sentences in between, and the sentence of latter
                    sent_idx = former['sent_idx']
                    context_token = pd.DataFrame(columns=doc_token.columns)
                    base = 0
                    former_tf_start, former_tf_end = -1, -1
                    latter_tf_start, latter_tf_end = -1, -1
                    while sent_idx <= latter['sent_idx']:
                        sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

                        if former['sent_idx'] == sent_idx:
                            former_tf_start, former_tf_end = base + former['tf_start'], base + former['tf_end']
                        if latter['sent_idx'] == sent_idx:
                            latter_tf_start, latter_tf_end = base + latter['tf_start'], base + latter['tf_end']

                        # pd.concat replaces the removed DataFrame.append
                        context_token = pd.concat([context_token, sentence], ignore_index=True)
                        base += sentence.shape[0]  # number of tokens in this sentence
                        sent_idx += 1

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]

                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    i = 0
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = utils.functions.normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                        cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(
                            str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        if i < former_tf_start:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(former_tf_start - i))
                        elif i > former_tf_end:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(former_tf_end - i))
                        else:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        if i < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(latter_tf_start - i))
                        elif i > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(latter_tf_end - i))
                        else:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        i += 1

                    if len(former_token) == 0:
                        # the truncated context no longer contains the entity, so fall back to the entity text itself
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))

                    assert len(former_token) > 0
                    assert len(latter_token) > 0

                    features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars,
                                'positions1': positions1, 'positions2': positions2}
                    if type_constraint == 1:
                        features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                        features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                        features['e1_token'] = former_token
                        features['e2_token'] = latter_token
                    else:
                        features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                        features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                        features['e1_token'] = latter_token
                        features['e2_token'] = former_token

                    features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(
                        latter['tf_start'] - former['tf_end'])

                    entity_between = doc_entity[((doc_entity['start'] >= former['end']) & (doc_entity['end'] <= latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(entity_between.shape[0])

                    X.append(features)

                    if gold_relations.shape[0] == 0:
                        Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                        cnt_neg += 1
                    else:
                        gold_answer = gold_relations.iloc[0]['type']
                        Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))

    neg = 100.0 * cnt_neg / len(Y)
    logging.info("positive instances {}%, negative instances {}%".format(100 - neg, neg))

    return X, Y
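# Illustrative sketch (not called by the pipeline): how sentence-local token offsets
# (tf_start/tf_end) are rebased onto the concatenated multi-sentence context built
# above via the running `base` counter. The sentence lengths and offsets here are
# hypothetical; the real code derives them from the doc_token/doc_entity DataFrames.
def _rebase_offsets_sketch(sent_lengths, entity_sent_idx, tf_start, tf_end, first_sent_idx):
    base = 0
    for sent_idx in range(first_sent_idx, entity_sent_idx):
        base += sent_lengths[sent_idx]  # tokens of the sentences preceding the entity's sentence
    return base + tf_start, base + tf_end

# Entity in sentence 2 at local tokens 1..2, context starting at sentence 0 with
# sentence lengths [5, 4, 6] -> context-level offsets (10, 11):
# _rebase_offsets_sketch([5, 4, 6], 2, 1, 2, 0) -> (10, 11)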
def getRelationInstance(tokens, entities, relations, names, data):
    """Build training instances using the whole document as context for every entity pair.

    Gold labels are looked up by entity text; pairs without a non-'None' gold relation are
    labelled '</unk>' and counted as negatives. Returns (X, Y)."""
    X = []
    Y = []
    cnt_neg = 0

    for doc_idx in tqdm(range(len(relations))):
        doc_relation = relations[doc_idx]
        doc_token = tokens[doc_idx]
        doc_entity = entities[doc_idx]  # entities are sorted by start offset
        doc_name = names[doc_idx]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):
            for former_idx in range(row_num):
                if former_idx < latter_idx:
                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]

                    # skip pairs with identical surface text
                    if former['text'] == latter['text']:
                        continue

                    gold_relations = doc_relation[(
                        ((doc_relation['entity1_text'] == former['text']) & (doc_relation['entity2_text'] == latter['text']))
                        | ((doc_relation['entity1_text'] == latter['text']) & (doc_relation['entity2_text'] == former['text'])))]
                    # if gold_relations.shape[0] == 0:
                    #     raise RuntimeError("{}: entity {} and {} has strange relations".format(doc_name, former['id'], latter['id']))

                    # the whole document is used as context
                    context_token = doc_token
                    former_tf_start, former_tf_end = former['tf_start'], former['tf_end']
                    latter_tf_start, latter_tf_end = latter['tf_start'], latter['tf_end']

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]

                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    i = 0
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                        cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(
                            str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        if i < former_tf_start:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(former_tf_start - i))
                        elif i > former_tf_end:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(former_tf_end - i))
                        else:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        if i < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(latter_tf_start - i))
                        elif i > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(latter_tf_end - i))
                        else:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        i += 1

                    if len(former_token) == 0:
                        # the truncated context no longer contains the entity, so fall back to the entity text itself
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))

                    assert len(former_token) > 0
                    assert len(latter_token) > 0

                    features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars,
                                'positions1': positions1, 'positions2': positions2}
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token

                    features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(
                        latter['tf_start'] - former['tf_end'])

                    entity_between = doc_entity[((doc_entity['start'] >= former['end']) & (doc_entity['end'] <= latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(entity_between.shape[0])

                    X.append(features)

                    # take the first non-'None' gold relation as the label, otherwise '</unk>'
                    gold_answer = '</unk>'
                    for _, gold_relation in gold_relations.iterrows():
                        if gold_relation['type'] != 'None':
                            gold_answer = gold_relation['type']
                            break
                    Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))
                    if gold_answer == '</unk>':
                        cnt_neg += 1

                    # if gold_relations.iloc[0]['type'] == 'None' and gold_relations.iloc[1]['type'] == 'None':
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                    #     cnt_neg += 1
                    # else:
                    #     gold_answer = gold_relations.iloc[0]['type'] if gold_relations.iloc[0]['type'] != 'None' else gold_relations.iloc[1]['type']
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))

    neg = 100.0 * cnt_neg / len(Y)
    logging.info("positive instances {}%, negative instances {}%".format(100 - neg, neg))

    return X, Y
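# Note on the three instance builders above:
# - getRelationInstance uses the whole document as context for every entity pair and
#   matches gold relations by entity *text* (entity1_text/entity2_text).
# - getRelationInstance2 restricts candidates to pairs within data.sent_window sentences,
#   applies relationConstraint_chapman, uses only the sentences spanning the pair as
#   context, and matches gold relations by entity *id* (entity1_id/entity2_id).
# - getRelationInstanceForOneDoc mirrors getRelationInstance2 for a single document at
#   prediction time: it returns the feature dicts plus the (former, latter) pairs
#   instead of gold labels.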