Example #1
    def add_alphabet(self, input_file):
        with open(input_file, 'r') as rf:
            for i, line in enumerate(rf):
                if len(line) > 2:
                    pairs = line.strip().split()
                    word = pairs[0]
                    word = normalize_word(word)  # a word made up only of digits is mapped to '0'
                    feat = pairs[1].replace('[Lexi]', '')
                    label = pairs[-1]
                    self.words.append(word)
                    self.feats.append(feat)
                    self.labels.append(label)
                    for char in word:
                        self.chars.append(char)
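
The example above calls a normalize_word helper that is not shown on this page. A minimal sketch of what the comment describes (every digit collapsed to '0'), written here as an assumption rather than the project's actual code:

def normalize_word(word):
    # Replace every digit with '0' so that all numbers share one surface form;
    # this is only a sketch of what the comments in these examples describe.
    new_word = ""
    for char in word:
        new_word += '0' if char.isdigit() else char
    return new_word
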
Example #2
    def build_alphabet(self, alphabet_path):
        for line in self.corpus:
            line = ast.literal_eval(line)
            char, char_label, seg_list, intent = line['char'], line[
                'char_label'], line['word'], line['intent']
            for word in seg_list:
                # lexicon
                lexi_feat = []
                for lexi_type, lb in self.trees.lexi_trees.items():
                    lexi_feat.append(lb.search(word))
                for n in range(len(lexi_feat)):
                    if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                        lexi_feat[n] = 0
                    else:
                        lexi_feat[n] = 1
                lexi_feat = ''.join([str(i) for i in lexi_feat])
                # abstract the lexicon feature string into a single character
                self.char_alphabet.add(lexi_feat)
            # char
            for c in char:
                self.char_alphabet.add(normalize_word(c))
            # intent
            self.intent_alphabet.add(intent)
            # label
            for label in char_label:
                self.label_alphabet.add(label)
        # alphabet_size
        self.char_alphabet_size = self.char_alphabet.size()
        self.intent_alphabet_size = self.intent_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        # close alphabet
        self.fix_alphabet()

        # write alphabet:
        if not os.path.exists(alphabet_path):
            with open(alphabet_path, 'wb') as wbf:
                pickle.dump(self.char_alphabet.instance2index, wbf)
                pickle.dump(self.intent_alphabet.instance2index, wbf)
                pickle.dump(self.label_alphabet.instance2index, wbf)
                pickle.dump(self.label_alphabet.instances, wbf)
                pickle.dump(self.char_alphabet_size, wbf)
                pickle.dump(self.intent_alphabet_size, wbf)
                pickle.dump(self.label_alphabet_size, wbf)
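
build_alphabet relies on an Alphabet container with add, get_index, size, instance2index, and a fix_alphabet/close step. The project's own class is not shown here; a minimal sketch inferred from how it is used in these examples:

class Alphabet:
    """Minimal instance-to-index mapping, inferred from its usage in these examples."""

    def __init__(self, name='alphabet'):
        self.name = name
        self.instance2index = {}
        self.instances = []
        self.keep_growing = True

    def add(self, instance):
        # Register an instance the first time it is seen; index 0 is reserved for unknowns.
        if self.keep_growing and instance not in self.instance2index:
            self.instances.append(instance)
            self.instance2index[instance] = len(self.instances)

    def get_index(self, instance):
        if instance in self.instance2index:
            return self.instance2index[instance]
        if self.keep_growing:
            self.add(instance)
            return self.instance2index[instance]
        return 0  # unknown

    def size(self):
        return len(self.instances) + 1

    def close(self):
        # Corresponds to fix_alphabet() in the example: stop adding new instances.
        self.keep_growing = False
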
Example #3
    def read_instance(self, input_file):
        chars, words, feats, labels = [], [], [], []
        char_ids, word_ids, feat_ids, label_ids = [], [], [], []
        name = input_file.split('/')[-1].split('_')[0]
        with open(input_file, 'r') as rf:
            for line in rf:
                if len(line) > 3:
                    pairs = line.strip().split()

                    word = pairs[0]
                    word = normalize_word(word)  # if the word consists only of digits, convert it to '0'
                    feat = pairs[1].split(']', 1)[-1]
                    label = pairs[-1]

                    words.append(word)
                    feats.append(feat)
                    labels.append(label)

                    word_id = self.word_alphabet[word]
                    feat_id = self.feat_alphabet[feat]
                    label_id = self.label_alphabet[label]

                    word_ids.append(word_id)
                    feat_ids.append(feat_id)
                    label_ids.append(label_id)

                    char_list = []
                    char_id = []
                    for char in word:
                        char_list.append(char)
                        char_id.append(self.char_alphabet[char])
                    chars.append(char_list)
                    char_ids.append(char_id)

                else:
                    if name == 'train' and len(words) > 0 and (
                            len(words) < config.max_sent_length):
                        self.train_texts.append([chars, words, feats, labels])
                        self.train_ids.append(
                            [char_ids, word_ids, feat_ids, label_ids])
                    elif name == 'dev' and len(words) > 0 and (
                            len(words) < config.max_sent_length):
                        self.dev_texts.append([chars, words, feats, labels])
                        self.dev_ids.append(
                            [char_ids, word_ids, feat_ids, label_ids])
                    elif name == 'test' and len(words) > 0 and (
                            len(words) < config.max_sent_length):
                        self.test_texts.append([chars, words, feats, labels])
                        self.test_ids.append(
                            [char_ids, word_ids, feat_ids, label_ids])

                    chars, words, feats, labels = [], [], [], []
                    char_ids, word_ids, feat_ids, label_ids = [], [], [], []
            # make sure the last sentence in the file is not dropped
            if len(words) > 0 and (len(words) < config.max_sent_length):
                if name == 'train':
                    self.train_texts.append([chars, words, feats, labels])
                    self.train_ids.append(
                        [char_ids, word_ids, feat_ids, label_ids])
                elif name == 'dev':
                    self.dev_texts.append([chars, words, feats, labels])
                    self.dev_ids.append(
                        [char_ids, word_ids, feat_ids, label_ids])
                elif name == 'test':
                    self.test_texts.append([chars, words, feats, labels])
                    self.test_ids.append(
                        [char_ids, word_ids, feat_ids, label_ids])
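
read_instance expects a whitespace-separated, CoNLL-style file: word, a bracketed feature column such as [Lexi]..., and the label, with a blank line between sentences; the file name prefix (train_, dev_, test_) decides which split a sentence goes to. A tiny parsing check on a made-up line (the content is invented purely for illustration):

# A made-up input line in the layout the parser above expects:
sample_line = "空调 [Lexi]B-device B-device\n"
pairs = sample_line.strip().split()
word = pairs[0]
feat = pairs[1].split(']', 1)[-1]   # strip the bracketed feature name, keep its value
label = pairs[-1]
print(word, feat, label)            # -> 空调 B-device B-device
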
Example #4
def getRelationInstance(tokens, entities, relations, names, data):
    X = []
    Y = []
    cnt_neg = 0

    for i in tqdm(range(len(relations))):

        doc_relation = relations[i]
        doc_token = tokens[i]
        doc_entity = entities[i]  # entities are sorted by start offset
        doc_name = names[i]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):

            for former_idx in range(row_num):

                if former_idx < latter_idx:

                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]

                    if former['text'] == latter['text']:
                        continue

                    gold_relations = doc_relation[(
                        ((doc_relation['entity1_text'] == former['text']) &
                         (doc_relation['entity2_text'] == latter['text']))
                        | ((doc_relation['entity1_text'] == latter['text']) &
                           (doc_relation['entity2_text'] == former['text'])))]
                    # if gold_relations.shape[0] == 0:
                    #     raise RuntimeError("{}: entity {} and {} has strange relations".format(doc_name, former['id'], latter['id']))

                    context_token = doc_token
                    former_tf_start, former_tf_end = former[
                        'tf_start'], former['tf_end']
                    latter_tf_start, latter_tf_end = latter[
                        'tf_start'], latter['tf_end']

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(
                            doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]

                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    i = 0
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[
                            data.feature_name2id['[POS]']].get_index(
                                token['postag']))
                        cap.append(data.feature_alphabets[
                            data.feature_name2id['[Cap]']].get_index(
                                str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(
                                data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        if i < former_tf_start:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(former_tf_start - i))

                        elif i > former_tf_end:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(former_tf_end - i))

                        else:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(0))
                            former_token.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[ENTITY]']].get_index(
                                    entity_word))

                        if i < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(latter_tf_start - i))

                        elif i > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(latter_tf_end - i))

                        else:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].
                                              get_index(0))
                            latter_token.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[ENTITY]']].get_index(
                                    entity_word))

                        i += 1

                    if len(former_token) == 0:
                        # the truncated-away part contained the former entity,
                        # so fall back to the text stored in doc_entity
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[
                                    data.
                                    re_feature_name2id['[ENTITY]']].get_index(
                                        my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[
                                    data.
                                    re_feature_name2id['[ENTITY]']].get_index(
                                        my_utils1.normalizeWord(s)))

                    assert len(former_token) > 0
                    assert len(latter_token) > 0

                    features = {
                        'tokens': words,
                        'postag': postags,
                        'cap': cap,
                        'char': chars,
                        'positions1': positions1,
                        'positions2': positions2
                    }
                    features['e1_type'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_TYPE]']].get_index(
                            former['type'])
                    features['e2_type'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_TYPE]']].get_index(
                            latter['type'])
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token

                    features['tok_num_betw'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[TOKEN_NUM]']].get_index(
                            latter['tf_start'] - former['tf_end'])

                    entity_between = doc_entity[(
                        (doc_entity['start'] >= former['end']) &
                        (doc_entity['end'] <= latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_NUM]']].get_index(
                            entity_between.shape[0])

                    X.append(features)

                    gold_answer = '</unk>'
                    for _, gold_relation in gold_relations.iterrows():
                        if gold_relation['type'] != 'None':
                            gold_answer = gold_relation['type']
                            break

                    Y.append(data.re_feature_alphabets[data.re_feature_name2id[
                        '[RELATION]']].get_index(gold_answer))
                    if gold_answer == '</unk>':
                        cnt_neg += 1

                    # if gold_relations.iloc[0]['type']=='None' and gold_relations.iloc[1]['type']=='None':
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                    #     cnt_neg += 1
                    # else:
                    #     gold_answer = gold_relations.iloc[0]['type'] if gold_relations.iloc[0]['type']!='None' else gold_relations.iloc[1]['type']
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))

    neg = 100.0 * cnt_neg / len(Y)

    logging.info("positive instance {}%, negative instance {}%".format(
        100 - neg, neg))
    return X, Y
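
The positions1/positions2 features above encode each token's signed distance to an entity span: positive before the span, 0 inside it, negative after it. The same logic, factored into a standalone sketch that returns the raw distances instead of alphabet indices:

def relative_positions(seq_len, span_start, span_end):
    # Signed distance of each token index to the [span_start, span_end] span,
    # mirroring the positions1/positions2 loops in the example above.
    positions = []
    for i in range(seq_len):
        if i < span_start:
            positions.append(span_start - i)
        elif i > span_end:
            positions.append(span_end - i)
        else:
            positions.append(0)
    return positions

# e.g. relative_positions(6, 2, 3) -> [2, 1, 0, 0, -1, -2]
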
Example #5
def read_instanceFromBuffer(in_lines,
                            word_alphabet,
                            char_alphabet,
                            feature_alphabets,
                            label_alphabet,
                            number_normalized,
                            max_sent_length,
                            char_padding_size=-1,
                            char_padding_symbol='</pad>'):
    feature_num = len(feature_alphabets)
    instence_texts = []
    instence_Ids = []
    words = []
    features = []
    chars = []
    labels = []
    word_Ids = []
    feature_Ids = []
    char_Ids = []
    label_Ids = []
    for line in in_lines:
        if len(line) > 2:
            pairs = line.strip().split()
            word = pairs[0]  # lines in the buffer are assumed to be str (no decode needed on Python 3)
            if number_normalized:
                word = normalize_word(word)
            label = pairs[-1]
            words.append(word)
            labels.append(label)
            word_Ids.append(word_alphabet.get_index(word))
            label_Ids.append(label_alphabet.get_index(label))
            ## get features
            feat_list = []
            feat_Id = []
            for idx in range(feature_num):
                feat_idx = pairs[idx + 1].split(']', 1)[-1]
                feat_list.append(feat_idx)
                feat_Id.append(feature_alphabets[idx].get_index(feat_idx))
            features.append(feat_list)
            feature_Ids.append(feat_Id)
            ## get char
            char_list = []
            char_Id = []
            for char in word:
                char_list.append(char)
            if char_padding_size > 0:
                char_number = len(char_list)
                if char_number < char_padding_size:
                    char_list = char_list + [char_padding_symbol] * (
                        char_padding_size - char_number)
                assert (len(char_list) == char_padding_size)
            else:
                ### not padding
                pass
            for char in char_list:
                char_Id.append(char_alphabet.get_index(char))
            chars.append(char_list)
            char_Ids.append(char_Id)
        else:
            if (max_sent_length < 0) or (len(words) < max_sent_length):
                instence_texts.append([words, features, chars, labels])
                instence_Ids.append(
                    [word_Ids, feature_Ids, char_Ids, label_Ids])
            words = []
            features = []
            chars = []
            labels = []
            word_Ids = []
            feature_Ids = []
            char_Ids = []
            label_Ids = []
    return instence_texts, instence_Ids
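
A hypothetical call to read_instanceFromBuffer, reusing the minimal Alphabet sketch from earlier and a made-up two-sentence buffer, just to show the expected call shape (the data and the feature column are invented):

in_lines = [
    "Beijing [POS]NNP B-LOC\n",
    "hosted [POS]VBD O\n",
    "\n",
    "2008 [POS]CD O\n",
    "\n",
]
word_alphabet = Alphabet('word')
char_alphabet = Alphabet('char')
feature_alphabets = [Alphabet('[POS]')]
label_alphabet = Alphabet('label')
texts, ids = read_instanceFromBuffer(in_lines, word_alphabet, char_alphabet,
                                     feature_alphabets, label_alphabet,
                                     number_normalized=True, max_sent_length=250)
# texts[0] == [['Beijing', 'hosted'], [['NNP'], ['VBD']],
#              [list('Beijing'), list('hosted')], ['B-LOC', 'O']]
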
Example #6
    def inference(self,
                  text,
                  intent,
                  session_keep,
                  previous_intent,
                  trace_id=''):
        """

		:param text:
		:param intent: 当前意图
		:param session_keep:
		:param previous_intent: 上一轮意图
		:param trace_id:
		:return:
		"""
        # in a multi-turn session, if the current intent is empty, use the intent of the previous turn
        if session_keep and intent is None:
            intent = previous_intent
        if intent is None:  # None in label_alphabet is stored as the string 'None'
            intent = 'None'
        instance, instance_ids = [], []
        # process the current utterance
        new_char, seq_char, seq_char_id_list, seq_label, seq_label_id_list = [], [], [], [], []
        char, seg_list = list(text), self.process(self.stub, text, trace_id)
        # store the one-hot style lexicon features
        lexicons = []
        # word level
        # record the [start, end) character index range of each word
        word_indices = []
        start = 0
        for word in seg_list:
            end = start + len(word)
            lexi_feat = []
            for lexi_type, lb in self.data.trees.lexi_trees.items():
                lexi_feat.append(lb.search(word))
            for n in range(len(lexi_feat)):
                if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                    lexi_feat[n] = 0
                else:
                    lexi_feat[n] = 1
            lexi_feat = ''.join([str(i) for i in lexi_feat])
            lexicons.append(lexi_feat)
            word_indices.append([start, end])

            # char
            # '0010000'
            if '1' in lexi_feat:
                seq_char.append(lexi_feat)
                seq_char_id_list.append(
                    self.data.char_alphabet.get_index(lexi_feat))
                new_char.append(''.join(char[start:end]))
            else:  # '0000000'
                for c in word:
                    seq_char.append(c)
                    seq_char_id_list.append(
                        self.data.char_alphabet.get_index(normalize_word(c)))
                    new_char.append(c)
            start = end
        # intent
        intent_id = self.data.intent_alphabet.get_index(intent)
        instance.append([seq_char, [intent]])
        instance_ids.append([seq_char_id_list, [intent_id]])
        # instance process
        batch_char, batch_intent, batch_char_len, mask, batch_char_recover, _ = \
            batch_char_sequence_labeling_process(instance_ids, self.gpu, self.char_max_length, False)
        tag_seq = self.model(batch_char, batch_intent, batch_char_len, mask)
        # label recover
        pred_result = self.predict_recover_label(tag_seq, mask,
                                                 self.data.label_alphabet)
        pred_result = list(np.array(pred_result).reshape(len(seq_char), ))
        result = self.slot_concat(new_char, pred_result)

        return result
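
inference finishes by calling self.slot_concat, which is not shown on this page. A hypothetical sketch of merging the character sequence and its BIO predictions into slots (the name, signature and output format are assumptions, not the project's code):

def slot_concat(chars, labels):
    # Hypothetical reconstruction: merge characters and BIO tags into
    # {slot_type: [span_text, ...]}; the real slot_concat may differ.
    slots = {}
    current_type, current_chars = None, []
    for ch, label in zip(chars, labels):
        if label.startswith('B-'):
            if current_type is not None:
                slots.setdefault(current_type, []).append(''.join(current_chars))
            current_type, current_chars = label[2:], [ch]
        elif label.startswith('I-') and current_type == label[2:]:
            current_chars.append(ch)
        else:
            if current_type is not None:
                slots.setdefault(current_type, []).append(''.join(current_chars))
            current_type, current_chars = None, []
    if current_type is not None:
        slots.setdefault(current_type, []).append(''.join(current_chars))
    return slots

# e.g. slot_concat(list('打开卧室灯'), ['O', 'O', 'B-room', 'I-room', 'B-device'])
#      -> {'room': ['卧室'], 'device': ['灯']}
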
Example #7
    def read_instance(self):
        """
		这里读取完整读数据,不做截断,functions.py中做截断
		:return:
		"""
        texts, ids = [], []
        for idx, line in enumerate(self.corpus):
            line = ast.literal_eval(line)
            intent_id_list = []
            # word: a lexicon hit such as '0010000' is merged into a single label
            seq_char, seq_char_id_list, seq_label, seq_label_id_list = [], [], [], []
            char, char_label, seg_list, intent = line['char'], line[
                'char_label'], line['word'], line['intent']
            # store the one-hot style lexicon features
            lexicons = []
            # record the [start, end) character index range of each word
            word_indices = []
            start = 0
            flag = True  # set to False to skip the rest of this sample
            for word in seg_list:
                if flag is True:
                    end = start + len(word)
                    lexi_feat = []
                    for lexi_type, lb in self.trees.lexi_trees.items():
                        lexi_feat.append(lb.search(word))
                    for n in range(len(lexi_feat)):
                        if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                            lexi_feat[n] = 0
                        else:
                            lexi_feat[n] = 1
                    lexi_feat = ''.join([str(i) for i in lexi_feat])
                    lexicons.append(lexi_feat)
                    word_indices.append([start, end])

                    # char
                    # '0010000'
                    if '1' in lexi_feat:
                        seq_char.append(lexi_feat)
                        seq_char_id_list.append(
                            self.char_alphabet.get_index(lexi_feat))
                        # ["B-room", "I-room", "I-room"]
                        specific_word_label = char_label[start:end]
                        tmp_label = [
                            swl.split('-')[-1] for swl in specific_word_label
                        ]
                        if len(set(tmp_label)) > 1:
                            # decide whether to filter out this sample
                            print('Be filtered: %s' % line['text'], word,
                                  tmp_label)
                            flag = False
                        else:
                            assert len(set(tmp_label)) == 1
                            if tmp_label[0] == 'O':
                                tmp_label = 'O'
                            else:
                                tmp_label = 'B' + '-' + tmp_label[0]
                            seq_label += [tmp_label]
                            seq_label_id_list += [
                                self.label_alphabet.get_index(tmp_label)
                            ]
                    # '0000000'
                    else:
                        for c in word:
                            seq_char.append(c)
                            seq_char_id_list.append(
                                self.char_alphabet.get_index(
                                    normalize_word(c)))
                        seq_label += char_label[start:end]
                        seq_label_id_list += [
                            self.label_alphabet.get_index(cl)
                            for cl in char_label[start:end]
                        ]

                    start = end
                else:
                    break  # skip to the next corpus line

            intent_id_list.append(self.intent_alphabet.get_index(intent))

            if idx % 10000 == 0:
                logger.info('read instance : %s' % idx)

            if flag is True:
                # text, char, intent, sequence_label
                texts.append([line['text'], seq_char, intent, seq_label])
                ids.append(
                    [seq_char_id_list, intent_id_list, seq_label_id_list])

        # save the corpus in its new format to make debugging easier
        output_path = self.data_config['data']['output']
        with open(output_path, 'w') as wf:
            for text in texts:
                line_data = dict()
                line_data['text'] = text[0]
                line_data['char'] = text[1]
                line_data['intent'] = text[2]
                line_data['char_label'] = text[-1]
                wf.write(json.dumps(line_data, ensure_ascii=False) + '\n')

        return texts, ids
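
The lexicon-feature construction that is repeated in build_alphabet, inference and read_instance above can be read as one small function. A sketch under the assumption that every lexicon tree exposes a search(word) that returns None, '_STEM_', or a hit:

def lexicon_feature(word, lexi_trees):
    # One bit per lexicon: '1' if the word is found in that lexicon, '0' otherwise
    # (both None and '_STEM_' count as a miss), mirroring the loops above.
    bits = []
    for lexi_type, tree in lexi_trees.items():
        hit = tree.search(word)
        bits.append('0' if hit is None or hit == '_STEM_' else '1')
    return ''.join(bits)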