Example #1

# Imports required by the code below. The first function is a method of the
# project's data class (shown here outside its class body); the project-local
# helpers it relies on (my_utils, my_utils1, utils.functions,
# relationConstraint_chapman, getEntitiesBetween, normalize_word) are assumed
# to be importable from the surrounding package.
import math
import logging

import pandas as pd
from tqdm import tqdm

    def build_re_feature_alphabets(self, tokens, entities, relations):
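        """Populate the relation-extraction (RE) feature alphabets: entity
        types, normalized entity surface tokens, relation types, and the
        bucketed token/entity counts and signed position offsets, then record
        the resulting alphabet sizes.
        """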

        entity_type_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY_TYPE]']]
        entity_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY]']]
        relation_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[RELATION]']]
        token_num_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[TOKEN_NUM]']]
        entity_num_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY_NUM]']]
        position_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[POSITION]']]

        for i, doc_token in enumerate(tokens):

            doc_entity = entities[i]
            doc_relation = relations[i]

            sent_idx = 0
            sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]
            while sentence.shape[0] != 0:

                entities_in_sentence = doc_entity[(
                    doc_entity['sent_idx'] == sent_idx)]
                for _, entity in entities_in_sentence.iterrows():
                    entity_type_alphabet.add(entity['type'])
                    tk_idx = entity['tf_start']
                    while tk_idx <= entity['tf_end']:
                        entity_alphabet.add(
                            my_utils1.normalizeWord(
                                sentence.iloc[tk_idx]['text']))  # access the 'text' column by name instead of assuming position 0
                        tk_idx += 1

                sent_idx += 1
                sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

            for _, relation in doc_relation.iterrows():
                relation_alphabet.add(relation['type'])

        # NOTE: the original referenced an undefined name `data` here; assuming the
        # maximum sequence length lives on the instance.
        for i in range(self.max_seq_len):
            token_num_alphabet.add(i)
            entity_num_alphabet.add(i)
            position_alphabet.add(i)
            position_alphabet.add(-i)

        for idx in range(self.re_feature_num):
            self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[
                idx].size()


def getRelationInstanceForOneDoc(doc_token, entities, doc_name, data):
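    """Build unlabeled RE candidate instances for a single document. Entity
    pairs are filtered by the sentence window, duplicate spans, and the
    Chapman type constraint; returns the feature dicts X and the corresponding
    (former, latter) entity pairs in `other`.
    """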
    X = []
    other = []

    row_num = len(entities)

    for latter_idx in range(row_num):

        for former_idx in range(row_num):

            if former_idx < latter_idx:

                former = entities[former_idx]
                latter = entities[latter_idx]


                if math.fabs(latter.sent_idx - former.sent_idx) >= data.sent_window:
                    continue

                # skip duplicate annotations of the same span
                if former.start == latter.start and former.end == latter.end:
                    continue

                type_constraint = relationConstraint_chapman(former.type, latter.type)
                if type_constraint == 0:
                    continue

                # gather the context: every sentence from the former entity's sentence
                # through the latter entity's sentence (inclusive)
                sent_idx = former.sent_idx
                context_token = pd.DataFrame(columns=doc_token.columns)
                base = 0
                former_tf_start, former_tf_end = -1, -1
                latter_tf_start, latter_tf_end = -1, -1
                while sent_idx <= latter.sent_idx:
                    sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

                    if former.sent_idx == sent_idx:
                        former_tf_start, former_tf_end = base+former.tf_start, base+former.tf_end
                    if latter.sent_idx == sent_idx:
                        latter_tf_start, latter_tf_end = base+latter.tf_start, base+latter.tf_end

                    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
                    context_token = pd.concat([context_token, sentence], ignore_index=True)

                    base += len(sentence)
                    sent_idx += 1

                if context_token.shape[0] > data.max_seq_len:
                    # truncate
                    logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                    context_token = context_token.iloc[:data.max_seq_len]


                words = []
                postags = []
                cap = []
                chars = []
                positions1 = []
                positions2 = []
                former_token = []
                latter_token = []
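                # positions1/positions2 hold each token's signed offset to the former/latter
                # entity span (0 marks tokens inside the span); tokens inside a span also
                # feed its entity-token feature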
                i = 0
                for _, token in context_token.iterrows():
                    if data.number_normalized:
                        word = utils.functions.normalize_word(token['text'])
                    else:
                        word = token['text']
                    entity_word = my_utils1.normalizeWord(token['text'])
                    words.append(data.word_alphabet.get_index(word))
                    postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                    cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(
                        str(my_utils.featureCapital(token['text']))))
                    char_for1word = []
                    for char in word:
                        char_for1word.append(data.char_alphabet.get_index(char))
                    chars.append(char_for1word)

                    if i < former_tf_start:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            former_tf_start - i))

                    elif i > former_tf_end:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            former_tf_end - i))
                    else:
                        positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                        former_token.append(
                            data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                    if i < latter_tf_start:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            latter_tf_start - i))
                    elif i > latter_tf_end:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(
                            latter_tf_end - i))
                    else:
                        positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                        latter_token.append(
                            data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                    i += 1

                if len(former_token) == 0:  # the entity fell in the truncated part, so fall back to the entity text
                    splitted = my_utils.my_tokenize(former.text)
                    for s in splitted:
                        s = s.strip()
                        if s != "":
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))
                if len(latter_token) == 0:
                    splitted = my_utils.my_tokenize(latter.text)
                    for s in splitted:
                        s = s.strip()
                        if s != "":
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))

                assert len(former_token)>0
                assert len(latter_token)>0


                features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars, 'positions1': positions1, 'positions2': positions2}
                if type_constraint == 1:
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former.type)
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter.type)
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token
                else:
                    features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter.type)
                    features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former.type)
                    features['e1_token'] = latter_token
                    features['e2_token'] = former_token

                features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(latter.tf_start-former.tf_end)

                entity_between = getEntitiesBetween(former, latter, entities)
                features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(len(entity_between))

                X.append(features)

                other.append((former, latter))

    return X, other


def getRelationInstance2(tokens, entities, relations, names, data):
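    """Build labeled RE instances for a corpus. Candidate entity pairs are
    filtered as in getRelationInstanceForOneDoc; each instance is labeled with
    its gold relation type, or '</unk>' when no gold relation exists (counted
    as a negative instance).
    """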
    X = []
    Y = []
    cnt_neg = 0

    for i in tqdm(range(len(relations))):

        doc_relation = relations[i]
        doc_token = tokens[i]
        doc_entity = entities[i]  # entities are sorted by start offset
        doc_name = names[i]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):

            for former_idx in range(row_num):

                if former_idx < latter_idx:

                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]


                    if math.fabs(latter['sent_idx']-former['sent_idx']) >= data.sent_window:
                        continue

                    # skip duplicate annotations of the same span
                    if former['start'] == latter['start'] and former['end'] == latter['end']:
                        continue

                    type_constraint = relationConstraint_chapman(former['type'], latter['type'])
                    if type_constraint == 0:
                        continue

                    gold_relations = doc_relation[
                        (
                                ((doc_relation['entity1_id'] == former['id']) & (
                                            doc_relation['entity2_id'] == latter['id']))
                                |
                                ((doc_relation['entity1_id'] == latter['id']) & (
                                            doc_relation['entity2_id'] == former['id']))
                        )
                    ]
                    if gold_relations.shape[0] > 1:
                        # an entity pair is expected to have at most one gold relation; skip otherwise
                        logging.debug("entities {} and {} have more than one relation".format(former['id'], latter['id']))
                        continue

                    # gather the context: every sentence from the former entity's sentence
                    # through the latter entity's sentence (inclusive)
                    sent_idx = former['sent_idx']
                    context_token = pd.DataFrame(columns=doc_token.columns)
                    base = 0
                    former_tf_start, former_tf_end = -1, -1
                    latter_tf_start, latter_tf_end = -1, -1
                    while sent_idx <= latter['sent_idx']:
                        sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

                        if former['sent_idx'] == sent_idx:
                            former_tf_start, former_tf_end = base+former['tf_start'], base+former['tf_end']
                        if latter['sent_idx'] == sent_idx:
                            latter_tf_start, latter_tf_end = base+latter['tf_start'], base+latter['tf_end']

                        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
                        context_token = pd.concat([context_token, sentence], ignore_index=True)

                        base += len(sentence)
                        sent_idx += 1

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]


                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    i = 0
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = utils.functions.normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                        cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        if i < former_tf_start:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(former_tf_start - i))

                        elif i > former_tf_end:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(former_tf_end - i))
                        else:
                            positions1.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        if i < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(latter_tf_start - i))
                        elif i > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(latter_tf_end - i))
                        else:
                            positions2.append(data.re_feature_alphabets[data.re_feature_name2id['[POSITION]']].get_index(0))
                            latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        i += 1

                    if len(former_token) == 0:  # the entity fell in the truncated part, so fall back to the entity text
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))

                    assert len(former_token)>0
                    assert len(latter_token)>0


                    features = {'tokens': words, 'postag': postags, 'cap': cap, 'char': chars, 'positions1': positions1, 'positions2': positions2}
                    if type_constraint == 1:
                        features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                        features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                        features['e1_token'] = former_token
                        features['e2_token'] = latter_token
                    else:
                        features['e1_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                        features['e2_type'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                        features['e1_token'] = latter_token
                        features['e2_token'] = former_token

                    features['tok_num_betw'] = data.re_feature_alphabets[data.re_feature_name2id['[TOKEN_NUM]']].get_index(latter['tf_start']-former['tf_end'])

                    entity_between = doc_entity[((doc_entity['start']>=former['end']) & (doc_entity['end']<=latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[data.re_feature_name2id['[ENTITY_NUM]']].get_index(entity_between.shape[0])

                    X.append(features)

                    if gold_relations.shape[0] == 0:
                        Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                        cnt_neg += 1
                    else:
                        gold_answer = gold_relations.iloc[0]['type']
                        Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))


    neg = 100.0*cnt_neg/len(Y)

    logging.info("positive instance {}%, negative instance {}%".format(100-neg, neg))
    return X, Y


def getRelationInstance(tokens, entities, relations, names, data):
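    """Variant of getRelationInstance2 that matches gold relations by entity
    text rather than entity id, uses the whole document as context, and does
    not apply the sentence-window or type constraints.
    """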
    X = []
    Y = []
    cnt_neg = 0

    for i in tqdm(range(len(relations))):

        doc_relation = relations[i]
        doc_token = tokens[i]
        doc_entity = entities[i]  # entities are sorted by start offset
        doc_name = names[i]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):

            for former_idx in range(row_num):

                if former_idx < latter_idx:

                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]

                    if former['text'] == latter['text']:
                        continue

                    gold_relations = doc_relation[(
                        ((doc_relation['entity1_text'] == former['text']) &
                         (doc_relation['entity2_text'] == latter['text']))
                        | ((doc_relation['entity1_text'] == latter['text']) &
                           (doc_relation['entity2_text'] == former['text'])))]
                    # if gold_relations.shape[0] == 0:
                    #     raise RuntimeError("{}: entity {} and {} has strange relations".format(doc_name, former['id'], latter['id']))

                    context_token = doc_token
                    former_tf_start, former_tf_end = former['tf_start'], former['tf_end']
                    latter_tf_start, latter_tf_end = latter['tf_start'], latter['tf_end']

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(
                            doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]

                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    i = 0
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[
                            data.feature_name2id['[POS]']].get_index(token['postag']))
                        cap.append(data.feature_alphabets[
                            data.feature_name2id['[Cap]']].get_index(str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        if i < former_tf_start:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(former_tf_start - i))
                        elif i > former_tf_end:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(former_tf_end - i))
                        else:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(0))
                            former_token.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        if i < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(latter_tf_start - i))
                        elif i > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(latter_tf_end - i))
                        else:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(0))
                            latter_token.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        i += 1

                    if len(former_token) == 0:  # the entity fell in the truncated part, so fall back to the entity text
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[
                                    data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[
                                    data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))

                    assert len(former_token) > 0
                    assert len(latter_token) > 0

                    features = {
                        'tokens': words,
                        'postag': postags,
                        'cap': cap,
                        'char': chars,
                        'positions1': positions1,
                        'positions2': positions2
                    }
                    features['e1_type'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_TYPE]']].get_index(
                            former['type'])
                    features['e2_type'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_TYPE]']].get_index(
                            latter['type'])
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token

                    features['tok_num_betw'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[TOKEN_NUM]']].get_index(
                            latter['tf_start'] - former['tf_end'])

                    entity_between = doc_entity[(
                        (doc_entity['start'] >= former['end']) &
                        (doc_entity['end'] <= latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_NUM]']].get_index(
                            entity_between.shape[0])

                    X.append(features)

                    gold_answer = '</unk>'
                    for _, gold_relation in gold_relations.iterrows():
                        if gold_relation['type'] != 'None':
                            gold_answer = gold_relation['type']
                            break

                    Y.append(data.re_feature_alphabets[data.re_feature_name2id[
                        '[RELATION]']].get_index(gold_answer))
                    if gold_answer == '</unk>':
                        cnt_neg += 1

                    # if gold_relations.iloc[0]['type']=='None' and gold_relations.iloc[1]['type']=='None':
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                    #     cnt_neg += 1
                    # else:
                    #     gold_answer = gold_relations.iloc[0]['type'] if gold_relations.iloc[0]['type']!='None' else gold_relations.iloc[1]['type']
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))

    neg = 100.0 * cnt_neg / len(Y)

    logging.info("positive instance {}%, negative instance {}%".format(
        100 - neg, neg))
    return X, Y
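
For orientation, a minimal usage sketch follows. It is an illustration rather than part of the original file: it assumes `data` is the project's data object that exposes the alphabets and `build_re_feature_alphabets`, and that `tokens`, `entities`, `relations`, and `names` are the parallel per-document lists these functions expect.

# Hypothetical driver; only the function names come from the code above.
# tokens, entities, relations, names = ...   # loaded by the project's own corpus reader
data.build_re_feature_alphabets(tokens, entities, relations)   # populate the RE feature alphabets
X_train, Y_train = getRelationInstance2(tokens, entities, relations, names, data)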