Beispiel #1
0
    def _create_dataset(self, set_type, tokenizer):
        """Build training examples for aspect/opinion/triplet extraction.

        Reads '<set_type>.pair' from ``self.data_dir``. The file alternates a
        sentence line with a pair line, e.g.::

            But the staff was so horrible to us .
            [(2,2),(5,5),'NEG'];[(3,3),(4,4),'POS']

        Returns a list of dicts holding token indices, aspect/opinion BIO tag
        ids, a (seq_len, seq_len) polarity matrix, and the raw span/triplet
        lists.
        """
        import ast  # local import: safe literal parsing (replaces eval)

        all_data = []

        filename = os.path.join(self.data_dir, '%s.pair' % set_type)
        # 'with' guarantees the handle is closed even if parsing raises.
        with open(filename, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()

        # Lines come in (sentence, pairs) couples.
        for i in range(0, len(lines), 2):
            text = lines[i].strip()
            pairs = lines[i+1].strip().split(';')

            text_indices = tokenizer.text_to_sequence(text)
            seq_len = len(text_indices)
            ap_tags = ['O'] * seq_len     # aspect-term tags (OT scheme)
            op_tags = ['O'] * seq_len     # opinion-term tags (OT scheme)
            ap_op_tags = ['O'] * seq_len  # aspect + opinion tagged in one sequence

            # triplet_indices[ap_end, op_end] = polarity id (0 = no triplet)
            triplet_indices = np.zeros((seq_len, seq_len), dtype=np.int64)
            ap_spans = []
            op_spans = []
            triplets = []
            for pair in pairs:
                # pair string looks like "[(ap_beg, ap_end), (op_beg, op_end), polarity]".
                # literal_eval only accepts Python literals, unlike eval.
                pair = ast.literal_eval(pair)
                ap_beg, ap_end = pair[0]
                op_beg, op_end = pair[1]
                polarity_str = pair[2]
                ap_tags[ap_beg:ap_end+1] = ['T'] * (ap_end-ap_beg+1)
                op_tags[op_beg:op_end+1] = ['T'] * (op_end-op_beg+1)
                ap_op_tags[ap_beg:ap_end+1] = ['T-AP'] * (ap_end-ap_beg+1)
                ap_op_tags[op_beg:op_end+1] = ['T-OP'] * (op_end-op_beg+1)
                polarity = self.polarity_map[polarity_str]
                triplet_indices[ap_end, op_end] = polarity
                # De-duplicate spans: one span may take part in several triplets.
                if (ap_beg, ap_end) not in ap_spans:
                    ap_spans.append((ap_beg, ap_end))
                if (op_beg, op_end) not in op_spans:
                    op_spans.append((op_beg, op_end))
                triplets.append((ap_beg, ap_end, op_beg, op_end, polarity))

            # Convert from OT tagging to BIO tagging.
            ap_tags = to2bio(ap_tags)
            op_tags = to2bio(op_tags)
            ap_op_tags = to2bio(ap_op_tags)
            # Convert tags to ids.
            ap_indices = [self.tag_map[tag] for tag in ap_tags]
            op_indices = [self.tag_map[tag] for tag in op_tags]

            data = {
                'text_indices': text_indices,
                'ap_indices': ap_indices,
                'op_indices': op_indices,
                'triplet_indices': triplet_indices,
                'ap_spans': ap_spans,
                'op_spans': op_spans,
                'triplets': triplets,
            }
            all_data.append(data)

        return all_data
Beispiel #2
0
    def _create_dataset(self, set_type, tokenizer):
        """Build training examples for aspect/opinion/triplet extraction.

        Reads '<set_type>.pair' from ``self.data_dir``; the file alternates a
        sentence line with a ';'-separated pair line of the form
        ``[(ap_beg,ap_end),(op_beg,op_end),polarity]``.

        Returns a list of dicts with token indices, aspect/opinion BIO tag
        ids, the (seq_len, seq_len) polarity matrix, and span/triplet lists.
        """
        import ast  # local import: safe literal parsing (replaces eval)

        all_data = []

        filename = os.path.join(self.data_dir, '%s.pair' % set_type)
        # 'with' guarantees the handle is closed even if parsing raises.
        with open(filename, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()

        # Lines come in (sentence, pairs) couples.
        for i in range(0, len(lines), 2):
            text = lines[i].strip()
            pairs = lines[i+1].strip().split(';')

            text_indices = tokenizer.text_to_sequence(text)
            seq_len = len(text_indices)
            ap_tags = ['O'] * seq_len     # aspect-term tags (OT scheme)
            op_tags = ['O'] * seq_len     # opinion-term tags (OT scheme)
            ap_op_tags = ['O'] * seq_len  # aspect + opinion tagged in one sequence

            # triplet_indices[ap_end, op_end] = polarity id (0 = no triplet)
            triplet_indices = np.zeros((seq_len, seq_len), dtype=np.int64)
            ap_spans = []
            op_spans = []
            triplets = []
            for pair in pairs:
                # literal_eval only accepts Python literals, unlike eval.
                pair = ast.literal_eval(pair)
                ap_beg, ap_end = pair[0]
                op_beg, op_end = pair[1]
                polarity_str = pair[2]
                ap_tags[ap_beg:ap_end+1] = ['T'] * (ap_end-ap_beg+1)
                op_tags[op_beg:op_end+1] = ['T'] * (op_end-op_beg+1)
                ap_op_tags[ap_beg:ap_end+1] = ['T-AP'] * (ap_end-ap_beg+1)
                ap_op_tags[op_beg:op_end+1] = ['T-OP'] * (op_end-op_beg+1)
                polarity = self.polarity_map[polarity_str]
                triplet_indices[ap_end, op_end] = polarity
                # De-duplicate spans: one span may take part in several triplets.
                if (ap_beg, ap_end) not in ap_spans:
                    ap_spans.append((ap_beg, ap_end))
                if (op_beg, op_end) not in op_spans:
                    op_spans.append((op_beg, op_end))
                triplets.append((ap_beg, ap_end, op_beg, op_end, polarity))

            # Convert from OT tagging to BIO tagging.
            ap_tags = to2bio(ap_tags)
            op_tags = to2bio(op_tags)
            ap_op_tags = to2bio(ap_op_tags)
            # Convert tags to ids.
            ap_indices = [self.tag_map[tag] for tag in ap_tags]
            op_indices = [self.tag_map[tag] for tag in op_tags]

            data = {
                'text_indices': text_indices,
                'ap_indices': ap_indices,
                'op_indices': op_indices,
                'triplet_indices': triplet_indices,
                'ap_spans': ap_spans,
                'op_spans': op_spans,
                'triplets': triplets,
            }
            all_data.append(data)

        return all_data
Beispiel #3
0
    def _create_dataset(self, set_type, tokenizer):
        """Build training examples with BERT alignment and dependency graphs.

        Reads '<set_type>_triplets.txt' from ``self.data_dir`` (one
        ``sentence####pairs`` line per example) plus a pickled per-sentence
        dependency graph from the sibling '.graph' file.

        Returns a list of dicts with naive/BERT token indices and their
        alignment, POS-tag indices, aspect/opinion BIO tag ids, the
        (seq_len, seq_len) polarity matrix, span/triplet lists, and the
        dependency graph.
        """
        import ast  # local import: safe literal parsing (replaces eval)

        all_data = []

        filename = os.path.join(self.data_dir, '%s_triplets.txt' % set_type)
        # 'with' guarantees each handle is closed even if parsing raises.
        with open(filename, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()

        # SECURITY NOTE(review): pickle.load executes arbitrary code while
        # unpickling — only load '.graph' files produced by this project.
        with open(filename + '.graph', 'rb') as fp:
            idx2graph = pickle.load(fp)

        for i, line in enumerate(lines):
            text, pairs = line.strip().split('####')
            text_indices, text_indices_bert, position_bert_in_naive = tokenizer.text_to_sequence(text)
            postag_indices = tokenizer.text_to_sequence_postags(text)
            seq_len = len(text_indices)
            ap_tags = ['O'] * seq_len     # aspect-term tags (OT scheme)
            op_tags = ['O'] * seq_len     # opinion-term tags (OT scheme)
            ap_op_tags = ['O'] * seq_len  # aspect + opinion tagged in one sequence

            # triplet_indices[ap_end, op_end] = polarity id (0 = no triplet)
            triplet_indices = np.zeros((seq_len, seq_len), dtype=np.int64)
            ap_spans = []
            op_spans = []
            triplets = []
            # Each pair is (aspect token positions, opinion token positions, polarity);
            # literal_eval only accepts Python literals, unlike eval.
            for pair in ast.literal_eval(pairs):
                ap_beg, ap_end = pair[0][0], pair[0][-1]
                op_beg, op_end = pair[1][0], pair[1][-1]
                polarity_str = pair[2]
                ap_tags[ap_beg:ap_end+1] = ['T'] * (ap_end-ap_beg+1)
                op_tags[op_beg:op_end+1] = ['T'] * (op_end-op_beg+1)
                ap_op_tags[ap_beg:ap_end+1] = ['T-AP'] * (ap_end-ap_beg+1)
                ap_op_tags[op_beg:op_end+1] = ['T-OP'] * (op_end-op_beg+1)
                polarity = self.polarity_map[polarity_str]
                triplet_indices[ap_end, op_end] = polarity
                # De-duplicate spans: one span may take part in several triplets.
                if (ap_beg, ap_end) not in ap_spans:
                    ap_spans.append((ap_beg, ap_end))
                if (op_beg, op_end) not in op_spans:
                    op_spans.append((op_beg, op_end))
                triplets.append((ap_beg, ap_end, op_beg, op_end, polarity))

            # Convert from OT tagging to BIO tagging.
            ap_tags = to2bio(ap_tags)
            op_tags = to2bio(op_tags)
            ap_op_tags = to2bio(ap_op_tags)

            # Convert tags to ids.
            ap_indices = [self.tag_map[tag] for tag in ap_tags]
            op_indices = [self.tag_map[tag] for tag in op_tags]

            # Dependency graph for this sentence (indexed by line number).
            dependency_graph = idx2graph[i]

            data = {
                'text_indices': text_indices,
                'ap_indices': ap_indices,
                'op_indices': op_indices,
                'triplet_indices': triplet_indices,
                'ap_spans': ap_spans,
                'op_spans': op_spans,
                'triplets': triplets,
                'text_indices_bert': text_indices_bert,
                'position_bert_in_naive': position_bert_in_naive,
                'postag_indices': postag_indices,
                'dependency_graph': dependency_graph,
            }
            all_data.append(data)

        return all_data