def load_data(filename): """加载数据 单条格式:(文本1 ids, 文本2 ids, 标签id) """ D = [] with open(filename) as f: for l in f: aa = l.strip().split("|,|")[1] bb = l.strip().split("|,|")[2] a = [] for d in aa.split(' '): if not d == '': a.append(d) b = [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5] for l in bb.split(' '): if not l == '': index = ((int)(l) - 1) b[index] = b[index] + 1 truncate_sequences(maxlen, -1, a) D.append((a, a, b)) # train_x = data.get_description(filename) # train_y = data.get_label(filename) # D.append((train_x, train_y)) return D
def load_data(filename): """加载数据 单条格式:(文本1 ids, 文本2 ids, 标签id) """ D = [] with open(filename) as f: for l in f: l = l.strip().split('\t') if len(l) == 3: a, b, c = l[0], l[1], int(l[2]) else: a, b, c = l[0], l[1], -5 # 未标注数据,标签为-5 a = [int(i) for i in a.split(' ')] b = [int(i) for i in b.split(' ')] truncate_sequences(maxlen, -1, a, b) D.append((a, b, c)) return D
def tokenize(self, text, maxlen=None): """分词函数 """ tokens = [ self._token_translate.get(token) or token for token in self._tokenize(text) ] if self._token_start is not None: tokens.insert(0, self._token_start) if self._token_end is not None: tokens.append(self._token_end) if maxlen is not None: index = int(self._token_end is not None) + 1 truncate_sequences(maxlen, -index, tokens) return tokens
def encode(
    self,
    first_text,
    second_text=None,
    maxlen=None,
    pattern='S*E*E',
    truncate_from='right'
):
    """Return the token ids and segment ids for the text(s).
    """
    if is_string(first_text):
        first_tokens = self.tokenize(first_text)
    else:
        first_tokens = first_text

    if second_text is None:
        second_tokens = None
    elif is_string(second_text):
        second_tokens = self.tokenize(second_text)
    else:
        second_tokens = second_text

    if maxlen is not None:
        if truncate_from == 'right':
            index = -int(self._token_end is not None) - 1
        elif truncate_from == 'left':
            index = int(self._token_start is not None)
        else:
            index = truncate_from
        if second_text is not None and pattern == 'S*E*E':
            # The second text will drop its start token, so allow one extra slot.
            maxlen += 1
        truncate_sequences(maxlen, index, first_tokens, second_tokens)

    first_token_ids = self.tokens_to_ids(first_tokens)
    first_segment_ids = [0] * len(first_token_ids)

    if second_text is not None:
        if pattern == 'S*E*E':
            idx = int(bool(self._token_start))
            second_tokens = second_tokens[idx:]
        second_token_ids = self.tokens_to_ids(second_tokens)
        second_segment_ids = [1] * len(second_token_ids)
        first_token_ids.extend(second_token_ids)
        first_segment_ids.extend(second_segment_ids)

    return first_token_ids, first_segment_ids
def encode(self, first_text, second_text=None, maxlen=None, pattern='S*E*E'):
    """Return the token ids and segment ids for the text(s).
    """
    if is_string(first_text):
        first_tokens = self.tokenize(first_text)
    else:
        first_tokens = first_text

    if second_text is None:
        second_tokens = None
    elif is_string(second_text):
        if pattern == 'S*E*E':
            # Drop the second text's start token so the pair reads S A E B E.
            idx = int(bool(self._token_start))
            second_tokens = self.tokenize(second_text)[idx:]
        elif pattern == 'S*ES*E':
            second_tokens = self.tokenize(second_text)
    else:
        second_tokens = second_text

    if maxlen is not None:
        index = int(self._token_end is not None) + 1
        truncate_sequences(maxlen, -index, first_tokens, second_tokens)

    first_token_ids = self.tokens_to_ids(first_tokens)
    first_segment_ids = [0] * len(first_token_ids)

    if second_text is not None:
        second_token_ids = self.tokens_to_ids(second_tokens)
        second_segment_ids = [1] * len(second_token_ids)
        first_token_ids.extend(second_token_ids)
        first_segment_ids.extend(second_segment_ids)

    return first_token_ids, first_segment_ids
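# A hedged usage sketch for either encode variant (the sentences and maxlen are
# assumptions for illustration). With pattern 'S*E*E' the second text drops its
# leading start token, so a pair is packed as [CLS] A [SEP] B [SEP], with
# segment id 0 for the first part and 1 for the second:
#
#   token_ids, segment_ids = tokenizer.encode(u'你好', u'世界', maxlen=16)
#   # token_ids   -> ids of: [CLS] 你 好 [SEP] 世 界 [SEP]
#   # segment_ids -> [0, 0, 0, 0, 1, 1, 1]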