def _create_dataset(self, set_type, tokenizer):
    """Load '<set_type>.pair' and build tagging + triplet examples.

    The file alternates sentence / annotation lines, e.g.
        'But the staff was so horrible to us .'
        "[(2,2),(5,5),'NEG'];[(3,3),(4,4),'POS']"

    Args:
        set_type: dataset split name; resolves to '<set_type>.pair'
            under ``self.data_dir``.
        tokenizer: object providing ``text_to_sequence(text)``.

    Returns:
        list of dicts with keys 'text_indices', 'ap_indices',
        'op_indices', 'triplet_indices', 'ap_spans', 'op_spans',
        'triplets'.
    """
    import ast

    all_data = []
    filename = os.path.join(self.data_dir, '%s.pair' % set_type)
    # 'with' guarantees the handle is closed even if a later line raises.
    with open(filename, 'r', encoding='utf-8') as fp:
        lines = fp.readlines()
    for i in range(0, len(lines), 2):
        text = lines[i].strip()
        pairs = lines[i + 1].strip().split(';')
        text_indices = tokenizer.text_to_sequence(text)
        seq_len = len(text_indices)
        ap_tags = ['O'] * seq_len
        op_tags = ['O'] * seq_len
        # Joint sequence marking both aspect and opinion terms.
        ap_op_tags = ['O'] * seq_len
        # triplet_indices[ap_end, op_end] = polarity id (0 = no triplet).
        triplet_indices = np.zeros((seq_len, seq_len), dtype=np.int64)
        ap_spans = []
        op_spans = []
        triplets = []
        for pair in pairs:
            # pair text: [(ap_beg, ap_end), (op_beg, op_end), polarity].
            # literal_eval parses the literal safely; unlike eval() it
            # cannot execute arbitrary code from the data file.
            pair = ast.literal_eval(pair)
            ap_beg, ap_end = pair[0]
            op_beg, op_end = pair[1]
            polarity_str = pair[2]
            ap_tags[ap_beg:ap_end + 1] = ['T'] * (ap_end - ap_beg + 1)
            op_tags[op_beg:op_end + 1] = ['T'] * (op_end - op_beg + 1)
            ap_op_tags[ap_beg:ap_end + 1] = ['T-AP'] * (ap_end - ap_beg + 1)
            ap_op_tags[op_beg:op_end + 1] = ['T-OP'] * (op_end - op_beg + 1)
            polarity = self.polarity_map[polarity_str]
            triplet_indices[ap_end, op_end] = polarity
            if (ap_beg, ap_end) not in ap_spans:
                ap_spans.append((ap_beg, ap_end))
            if (op_beg, op_end) not in op_spans:
                op_spans.append((op_beg, op_end))
            triplets.append((ap_beg, ap_end, op_beg, op_end, polarity))
        # Convert OT-style tags to the BIO scheme.
        ap_tags = to2bio(ap_tags)
        op_tags = to2bio(op_tags)
        # NOTE(review): ap_op_tags is converted but never stored in
        # `data` below — kept for parity with the other loaders.
        ap_op_tags = to2bio(ap_op_tags)
        # Map tag strings to integer ids.
        ap_indices = [self.tag_map[tag] for tag in ap_tags]
        op_indices = [self.tag_map[tag] for tag in op_tags]
        data = {
            'text_indices': text_indices,
            'ap_indices': ap_indices,
            'op_indices': op_indices,
            'triplet_indices': triplet_indices,
            'ap_spans': ap_spans,
            'op_spans': op_spans,
            'triplets': triplets,
        }
        all_data.append(data)
    return all_data
def _create_dataset(self, set_type, tokenizer):
    """Load '<set_type>.pair' and build tagging + triplet examples.

    Each example occupies two consecutive lines: the sentence, then a
    ';'-separated list of annotation literals of the form
    ``[(ap_beg, ap_end), (op_beg, op_end), polarity]``.

    Args:
        set_type: dataset split name; resolves to '<set_type>.pair'
            under ``self.data_dir``.
        tokenizer: object providing ``text_to_sequence(text)``.

    Returns:
        list of dicts with keys 'text_indices', 'ap_indices',
        'op_indices', 'triplet_indices', 'ap_spans', 'op_spans',
        'triplets'.
    """
    import ast

    all_data = []
    filename = os.path.join(self.data_dir, '%s.pair' % set_type)
    # 'with' guarantees the handle is closed even if a later line raises.
    with open(filename, 'r', encoding='utf-8') as fp:
        lines = fp.readlines()
    for i in range(0, len(lines), 2):
        text = lines[i].strip()
        pairs = lines[i + 1].strip().split(';')
        text_indices = tokenizer.text_to_sequence(text)
        seq_len = len(text_indices)
        ap_tags = ['O'] * seq_len
        op_tags = ['O'] * seq_len
        # Joint sequence marking both aspect and opinion terms.
        ap_op_tags = ['O'] * seq_len
        # triplet_indices[ap_end, op_end] = polarity id (0 = no triplet).
        triplet_indices = np.zeros((seq_len, seq_len), dtype=np.int64)
        ap_spans = []
        op_spans = []
        triplets = []
        for pair in pairs:
            # literal_eval parses the annotation literal safely; unlike
            # eval() it cannot execute arbitrary code from the data file.
            pair = ast.literal_eval(pair)
            ap_beg, ap_end = pair[0]
            op_beg, op_end = pair[1]
            polarity_str = pair[2]
            ap_tags[ap_beg:ap_end + 1] = ['T'] * (ap_end - ap_beg + 1)
            op_tags[op_beg:op_end + 1] = ['T'] * (op_end - op_beg + 1)
            ap_op_tags[ap_beg:ap_end + 1] = ['T-AP'] * (ap_end - ap_beg + 1)
            ap_op_tags[op_beg:op_end + 1] = ['T-OP'] * (op_end - op_beg + 1)
            polarity = self.polarity_map[polarity_str]
            triplet_indices[ap_end, op_end] = polarity
            if (ap_beg, ap_end) not in ap_spans:
                ap_spans.append((ap_beg, ap_end))
            if (op_beg, op_end) not in op_spans:
                op_spans.append((op_beg, op_end))
            triplets.append((ap_beg, ap_end, op_beg, op_end, polarity))
        # Convert OT-style tags to the BIO scheme.
        ap_tags = to2bio(ap_tags)
        op_tags = to2bio(op_tags)
        # NOTE(review): ap_op_tags is converted but never stored in
        # `data` below — kept for parity with the other loaders.
        ap_op_tags = to2bio(ap_op_tags)
        # Map tag strings to integer ids.
        ap_indices = [self.tag_map[tag] for tag in ap_tags]
        op_indices = [self.tag_map[tag] for tag in op_tags]
        data = {
            'text_indices': text_indices,
            'ap_indices': ap_indices,
            'op_indices': op_indices,
            'triplet_indices': triplet_indices,
            'ap_spans': ap_spans,
            'op_spans': op_spans,
            'triplets': triplets,
        }
        all_data.append(data)
    return all_data
def _create_dataset(self, set_type, tokenizer):
    """Load '<set_type>_triplets.txt' (plus its '.graph' pickle) and
    build tagging + triplet examples with BERT/POS/graph extras.

    Each line is ``<sentence>####<list of (ap_idxs, op_idxs, polarity)>``;
    the companion '<filename>.graph' pickle maps line index ->
    dependency-graph matrix.

    Args:
        set_type: dataset split name; resolves to
            '<set_type>_triplets.txt' under ``self.data_dir``.
        tokenizer: object providing ``text_to_sequence(text)`` (returns
            naive indices, BERT indices, and the BERT->naive position
            map) and ``text_to_sequence_postags(text)``.

    Returns:
        list of dicts with keys 'text_indices', 'ap_indices',
        'op_indices', 'triplet_indices', 'ap_spans', 'op_spans',
        'triplets', 'text_indices_bert', 'position_bert_in_naive',
        'postag_indices', 'dependency_graph'.
    """
    import ast

    all_data = []
    filename = os.path.join(self.data_dir, '%s_triplets.txt' % set_type)
    # 'with' guarantees both handles are closed even on error.
    with open(filename, 'r', encoding='utf-8') as fp:
        lines = fp.readlines()
    # SECURITY: pickle.load executes arbitrary code if the .graph file
    # is tampered with — only load graphs you generated yourself.
    with open(filename + '.graph', 'rb') as fp:
        idx2graph = pickle.load(fp)
    for i in range(len(lines)):
        text, pairs = lines[i].strip().split('####')
        text_indices, text_indices_bert, position_bert_in_naive = \
            tokenizer.text_to_sequence(text)
        postag_indices = tokenizer.text_to_sequence_postags(text)
        seq_len = len(text_indices)
        ap_tags = ['O'] * seq_len
        op_tags = ['O'] * seq_len
        # Joint sequence marking both aspect and opinion terms.
        ap_op_tags = ['O'] * seq_len
        # triplet_indices[ap_end, op_end] = polarity id (0 = no triplet).
        triplet_indices = np.zeros((seq_len, seq_len), dtype=np.int64)
        ap_spans = []
        op_spans = []
        triplets = []
        # literal_eval parses the annotation literal safely; unlike
        # eval() it cannot execute arbitrary code from the data file.
        for pair in ast.literal_eval(pairs):
            # pair: (ap_token_idxs, op_token_idxs, polarity); spans are
            # taken from the first and last token index of each list.
            ap_beg, ap_end = pair[0][0], pair[0][-1]
            op_beg, op_end = pair[1][0], pair[1][-1]
            polarity_str = pair[2]
            ap_tags[ap_beg:ap_end + 1] = ['T'] * (ap_end - ap_beg + 1)
            op_tags[op_beg:op_end + 1] = ['T'] * (op_end - op_beg + 1)
            ap_op_tags[ap_beg:ap_end + 1] = ['T-AP'] * (ap_end - ap_beg + 1)
            ap_op_tags[op_beg:op_end + 1] = ['T-OP'] * (op_end - op_beg + 1)
            polarity = self.polarity_map[polarity_str]
            triplet_indices[ap_end, op_end] = polarity
            if (ap_beg, ap_end) not in ap_spans:
                ap_spans.append((ap_beg, ap_end))
            if (op_beg, op_end) not in op_spans:
                op_spans.append((op_beg, op_end))
            triplets.append((ap_beg, ap_end, op_beg, op_end, polarity))
        # Convert OT-style tags to the BIO scheme.
        ap_tags = to2bio(ap_tags)
        op_tags = to2bio(op_tags)
        # NOTE(review): ap_op_tags is converted but never stored in
        # `data` below — kept for parity with the other loaders.
        ap_op_tags = to2bio(ap_op_tags)
        # Map tag strings to integer ids.
        ap_indices = [self.tag_map[tag] for tag in ap_tags]
        op_indices = [self.tag_map[tag] for tag in op_tags]
        # Dependency graph for this sentence (by line index).
        dependency_graph = idx2graph[i]
        data = {
            'text_indices': text_indices,
            'ap_indices': ap_indices,
            'op_indices': op_indices,
            'triplet_indices': triplet_indices,
            'ap_spans': ap_spans,
            'op_spans': op_spans,
            'triplets': triplets,
            'text_indices_bert': text_indices_bert,
            'position_bert_in_naive': position_bert_in_naive,
            'postag_indices': postag_indices,
            'dependency_graph': dependency_graph,
        }
        all_data.append(data)
    return all_data