import codecs
import collections
import sys

import numpy as np


def init_data(path, feature_names, vocs, max_len, word_len, word_id, model='train', sep='\t'):
    """
    Load data.
    Args:
        path: str, path to the data file
        feature_names: list of str, feature names
        vocs: list of dict
        max_len: int, maximum sentence length
        word_len: int, maximum word length
        word_id: int, 1-based index of the feature column used for char features
        model: str, in ('train', 'test')
        sep: str, separator between feature columns, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')
    file_r = codecs.open(path, 'r', encoding='utf-8')
    sentences = file_r.read().strip().split('\n\n')
    sentence_count = len(sentences)
    feature_count = len(feature_names)
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros((sentence_count, max_len), dtype='int32')
    data_dict['char'] = np.zeros((sentence_count, max_len, word_len), dtype='int32')
    if model == 'train':
        data_dict['label'] = np.zeros((sentence_count, max_len), dtype='int32')
    for index, sentence in enumerate(sentences):
        items = sentence.split('\n')
        # one list per feature column, plus one for the label column
        one_instance_items = [[] for _ in range(len(feature_names) + 1)]
        # one list of characters per token
        char_instance_item = [[] for _ in range(len(items))]
        for item_num, item in enumerate(items):
            feature_tokens = item.split(sep)
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
                if j == word_id - 1:
                    # collect at most word_len characters of the word feature
                    for num, w in enumerate(feature_tokens[j]):
                        if num == word_len:
                            break
                        char_instance_item[item_num].append(w)
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        for i in range(len(feature_names)):
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        # guard against sentences longer than max_len
        for i in range(min(len(items), max_len)):
            data_dict['char'][index][i] = map_item2id(
                char_instance_item[i], vocs[-1], word_len)
        if model == 'train':
            # the char voc is last in vocs, so the label voc is second to last
            data_dict['label'][index, :] = map_item2id(
                one_instance_items[-1], vocs[-2], max_len)
    file_r.close()
    return data_dict
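# `map_item2id` is called throughout this module but is not defined in this
# section. Below is a minimal sketch of the assumed behavior, under the
# assumption that each voc is a dict mapping item -> positive int id with 0
# reserved for padding/unknown; the real implementation may differ (e.g. in
# how it pads or handles OOV items).
def map_item2id(items, voc, max_len):
    """Map a sequence of items to a fixed-length int32 id array (assumed)."""
    arr = np.zeros((max_len,), dtype='int32')
    for i, item in enumerate(items[:max_len]):  # truncate to max_len
        arr[i] = voc.get(item, 0)  # unknown items fall back to id 0
    return arr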
def init_data(path, feature_names, vocs, max_len, model='train',
              use_char_feature=False, word_len=None, sep='\t'):
    """
    Load data (to be optimized; currently reads the whole dataset at once).
    Args:
        path: str, path to the data file
        feature_names: list of str, feature names
        vocs: list of dict
        max_len: int, maximum sentence length
        model: str, in ('train', 'test')
        use_char_feature: bool, whether to use char features
        word_len: None or int, maximum word length
        sep: str, separator between feature columns, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')
    file_r = codecs.open(path, 'r', encoding='utf-8')
    # this variant expects Windows-style line endings
    sentences = file_r.read().strip().split('\r\n\r\n')
    sentence_count = len(sentences)
    feature_count = len(feature_names)
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros((sentence_count, max_len), dtype='int32')
    # char feature
    if use_char_feature:
        data_dict['char'] = np.zeros(
            (sentence_count, max_len, word_len), dtype='int32')
        # note: pop mutates the caller's voc list
        char_voc = vocs.pop(0)
    if model == 'train':
        data_dict['label'] = np.zeros((sentence_count, max_len), dtype='int32')
    for index, sentence in enumerate(sentences):
        items = sentence.split('\r\n')
        # one list per feature column, plus one for the label column
        one_instance_items = [[] for _ in range(len(feature_names) + 1)]
        for item in items:
            feature_tokens = item.split(sep)
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        for i in range(len(feature_names)):
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        if use_char_feature:
            for i, word in enumerate(one_instance_items[0]):
                if i >= max_len:
                    break
                data_dict['char'][index][i, :] = map_item2id(
                    word, char_voc, word_len)
        if model == 'train':
            data_dict['label'][index, :] = map_item2id(
                one_instance_items[-1], vocs[-1], max_len)
        sys.stdout.write('loading data: %d\r' % index)
    file_r.close()
    return data_dict
def init_data(path, feature_names, vocs, max_len, model='train', sep='\t'):
    """
    Load data (to be optimized; currently reads the whole dataset at once).
    Args:
        path: str, path to the data file
        feature_names: list of str, feature names
        vocs: list of dict
        max_len: int, maximum sentence length
        model: str, in ('train', 'test')
        sep: str, separator between feature columns, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')
    file_r = codecs.open(path, 'r', encoding='utf-8')
    sentences = file_r.read().strip().split('\n\n')
    sentence_count = len(sentences)
    feature_count = len(feature_names)
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros((sentence_count, max_len), dtype='int32')
    if model == 'train':
        data_dict['label'] = np.zeros((sentence_count, max_len), dtype='int32')
    for index, sentence in enumerate(sentences):
        items = sentence.split('\n')
        # one list per feature column, plus one for the label column
        one_instance_items = [[] for _ in range(len(feature_names) + 1)]
        for item in items:
            feature_tokens = item.split(sep)
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        for i in range(len(feature_names)):
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        if model == 'train':
            data_dict['label'][index, :] = map_item2id(
                one_instance_items[-1], vocs[-1], max_len)
    file_r.close()
    return data_dict
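# Hedged usage sketch for the variant directly above. It assumes a
# CoNLL-style file in which each line holds one token's feature columns
# (plus a trailing label column in train mode) separated by `sep`, and
# sentences are separated by blank lines. The path, feature names, and
# vocabularies below are all hypothetical.
sample = ('word1\tpos1\tB-ENT\n'
          'word2\tpos2\tO\n'
          '\n'
          'word1\tpos1\tO\n')
with codecs.open('train.txt', 'w', encoding='utf-8') as fw:
    fw.write(sample)
word_voc = {'word1': 1, 'word2': 2}
pos_voc = {'pos1': 1, 'pos2': 2}
label_voc = {'O': 1, 'B-ENT': 2}
data = init_data('train.txt', ['word', 'pos'],
                 [word_voc, pos_voc, label_voc], max_len=50, model='train')
print(data['word'].shape)   # (2, 50)
print(data['label'].shape)  # (2, 50)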
def load_session_infer_data(path, feature_names, vocs, max_len, model='test'):
    """Load session data for inference; blocks are turns, grouped by session id."""
    assert model in ['train', 'test']
    assert model == 'test'  # this loader is inference-only
    fr = open(path, 'r', encoding='utf-8')
    samples = fr.read().strip().split('\n\n')
    print('number of samples', len(samples))
    data_dict = collections.OrderedDict()
    for sample in samples:
        sentences = sample.split('\n')
        # header line: session id plus two more tab-separated fields
        ss = sentences[0].split('\t')
        assert len(ss) == 3
        sid = ss[0]
        intent = None  # no intent label at inference time
        feat_dict = {}
        for feature_name in feature_names:
            feat_dict[feature_name] = []
        slot = []
        for sentence in sentences[1:]:
            ss = sentence.split('\t')
            for i, feat_name in enumerate(feature_names):
                feat_dict[feat_name].append(ss[i])
            if model == 'train':  # unreachable here; kept for symmetry
                slot += [ss[-1]]
        if sid not in data_dict:
            data_dict[sid] = []
        data_dict[sid].append((intent, slot, feat_dict))
    # index all features
    max_turn = max([len(data_dict[x]) for x in data_dict])
    print('number of sessions', len(data_dict))
    print('max turn of sessions', max_turn)
    idx_dict = collections.OrderedDict()
    for sid in data_dict:
        session_list = data_dict[sid]
        session_x = []
        for label, slot, feat_dict in session_list:
            feat_idx_dict = dict()
            for i, feat_name in enumerate(feature_names):
                feat_idx_dict[feat_name] = map_item2id(
                    feat_dict[feat_name], vocs[i], max_len)
            # label and slot ids are unknown at inference time
            session_x += [[feat_idx_dict, None, None]]
        idx_dict[sid] = session_x
    fr.close()
    return idx_dict
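# Hedged usage sketch for the inference loader above. The file format implied
# by the parsing: each blank-line-separated block is one turn, opened by a
# three-field header `sid\t<field>\t<field>` and followed by one line of
# feature columns per token. The contents below are hypothetical (reusing the
# hypothetical vocs from the earlier sketch).
sample_infer = ('s001\tturn\t0\n'
                'word1\tpos1\n'
                'word2\tpos2\n'
                '\n'
                's001\tturn\t1\n'
                'word1\tpos2\n')
with open('infer.txt', 'w', encoding='utf-8') as fw:
    fw.write(sample_infer)
idx = load_session_infer_data('infer.txt', ['word', 'pos'],
                              [word_voc, pos_voc], max_len=50)
# idx maps sid -> list of turns, each turn being [feat_idx_dict, None, None]
print(len(idx['s001']))  # 2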
def init_data(feature_names, vocs, max_len, model='train', path=None,
              test_sens=None, use_char_feature=False, word_len=None, sep='\t'):
    """
    Load data (to be optimized; currently reads the whole dataset at once).
    Args:
        path: str, path to the data file
        test_sens: list, e.g. [[[u'白带常规', u'ni', u'S_ex_name'],
            [u':', u'w', u'O'], ...], ...]
        feature_names: list of str, feature names
        vocs: list of dict
        max_len: int, maximum sentence length
        model: str, in ('train', 'test')
        use_char_feature: bool, whether to use char features
        word_len: None or int, maximum word length
        sep: str, separator between feature columns, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')
    if model == 'train':
        with codecs.open(path, 'r', encoding='utf8') as file_r:
            sentences = file_r.read().strip().split('\n\n')
        sentences = [[j.split(sep) for j in sen.split('\n')] for sen in sentences]
    else:
        if not test_sens:
            raise ValueError('The test corpus must be non-empty!')
        sentences = test_sens
    sentences_count = len(sentences)
    print('sentences_count:', sentences_count)
    feature_count = len(feature_names)
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros((sentences_count, max_len), dtype='int32')
    # char feature
    if use_char_feature:
        data_dict['char'] = np.zeros((sentences_count, max_len, word_len), dtype='int32')
        # note: pop mutates the caller's voc list
        char_voc = vocs.pop(0)
    if model == 'train':
        data_dict['label'] = np.zeros((sentences_count, max_len), dtype='int32')
    for index, items in enumerate(sentences):
        # one list per feature column, plus one for the label column
        one_instance_items = [[] for _ in range(len(feature_names) + 1)]
        for feature_tokens in items:
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        for i in range(len(feature_names)):
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        if use_char_feature:
            for i, word in enumerate(one_instance_items[0]):
                if i >= max_len:
                    break
                data_dict['char'][index][i, :] = map_item2id(
                    word, char_voc, word_len)
        if model == 'train':
            data_dict['label'][index, :] = map_item2id(
                one_instance_items[-1], vocs[-1], max_len)
        sys.stdout.write('loading data: %d\r' % index)
    return data_dict
def predict(self, query):
    """Predict a label for a raw query string (method of a wrapper class)."""
    items = [c for c in query]  # split the query into characters
    data_dict = {'f1': np.zeros((1, self.max_len))}
    data_dict['f1'][0, :] = map_item2id(items, self.vocs[0], self.max_len)
    res = self.model.predict(data_dict)
    # offset by 1: id 0 is assumed to be reserved for padding in label_voc
    return self.label_voc[res[0][0] + 1]
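# A hedged, self-contained illustration of how `predict` is wired up, using a
# stub in place of the trained network; the real `self.model` is presumably a
# model object whose `predict` returns per-sentence label ids. Every name and
# value below is hypothetical.
import types

class _StubModel:
    def predict(self, data_dict):
        return np.array([[1]])  # pretend the network chose label id 1

tagger = types.SimpleNamespace(
    max_len=50,
    vocs=[{'a': 1, 'b': 2}],
    label_voc={1: '<pad>', 2: 'greeting'},  # id -> label name (assumed layout)
    model=_StubModel(),
)
print(predict(tagger, 'ab'))  # -> 'greeting'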
def init_data(path, feature_names, vocs, max_len, model='train',
              use_char_feature=False, word_len=None, sep='\t'):
    """
    Load data (to be optimized; currently reads the whole dataset at once).
    Args:
        path: str, path to the data file
        feature_names: list of str, feature names
        vocs: list of dict
        max_len: int, maximum sentence length
        model: str, in ('train', 'test')
        use_char_feature: bool, whether to use char features
        word_len: None or int, maximum word length
        sep: str, separator between feature columns, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')  # restrict to train or test set
    file_r = codecs.open(path, 'r', encoding='utf-8')
    # split the corpus into a list of sentences
    sentences = file_r.read().strip().split('\n\n')
    sentence_count = len(sentences)
    print("sentence number:", sentence_count)
    feature_count = len(feature_names)
    # data_dict holds one matrix per feature, plus the train-set labels
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros((sentence_count, max_len), dtype='int32')
    # char feature
    if use_char_feature:
        data_dict['char'] = np.zeros(
            (sentence_count, max_len, word_len), dtype='int32')
        char_voc = vocs.pop(0)
    if model == 'train':  # the train set carries labels
        data_dict['label'] = np.zeros((sentence_count, max_len), dtype='int32')
    for index, sentence in enumerate(sentences):
        items = sentence.split('\n')  # one (features, label) line per token
        # separate lists for the feature columns and the label column
        one_instance_items = [[] for _ in range(len(feature_names) + 1)]
        for item in items:
            if item == u"":  # skip empty lines
                continue
            feature_tokens = item.split(sep)  # split features and label on sep
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        # map each feature column (and the labels) to int ids
        for i in range(len(feature_names)):
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        if use_char_feature:
            for i, word in enumerate(one_instance_items[0]):
                if i >= max_len:
                    break
                data_dict['char'][index][i, :] = map_item2id(
                    word, char_voc, word_len)
        if model == 'train':
            data_dict['label'][index, :] = map_item2id(
                one_instance_items[-1], vocs[-1], max_len)
        sys.stdout.write('loading data: %d\r' % index)
    file_r.close()
    return data_dict
def load_session_data(path, feature_names, vocs, max_len, model='train'):
    """Load session data; each session id keeps only its last turn."""
    assert model in ['train', 'test']
    fr = open(path, 'r', encoding='utf-8')
    samples = fr.read().strip().split('\n\n')
    print('number of samples', len(samples))
    data_dict = dict()
    for sample in samples:
        sentences = sample.split('\n')
        ss = sentences[0].split('\t')
        # the train-mode header carries a trailing intent label
        if model == 'train':
            assert len(ss) == 4
        else:
            assert len(ss) == 3
        sid = ss[0]
        intent = None
        if model == 'train':
            intent = ss[3]
        feat_dict = {}
        for feature_name in feature_names:
            feat_dict[feature_name] = []
        slot = []
        for sentence in sentences[1:]:
            ss = sentence.split('\t')
            for i, feat_name in enumerate(feature_names):
                feat_dict[feat_name].append(ss[i])
            if model == 'train':
                slot += [ss[-1]]
        # direct assignment: a later block with the same sid overwrites
        # earlier ones, so only the last turn per session is kept
        data_dict[sid] = (intent, slot, feat_dict)
    # index all features
    print('number of sessions', len(data_dict))
    idx_dict = dict()
    for sid in data_dict:
        label, slot, feat_dict = data_dict[sid]
        # INTENT_DIC is a module-level dict mapping intent name -> id
        label_idx = INTENT_DIC.get(label, 0)
        slot_idx = map_item2id(slot, vocs[-1], max_len)
        feat_idx_dict = dict()
        for i, feat_name in enumerate(feature_names):
            feat_idx_dict[feat_name] = map_item2id(
                feat_dict[feat_name], vocs[i], max_len)
        idx_dict[sid] = [feat_idx_dict, label_idx, slot_idx]
    fr.close()
    return idx_dict
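# Hedged usage sketch for the training-mode loader above. The header line of
# each block is `sid\t<field>\t<field>\t<intent>`; token lines carry the
# feature columns plus a trailing slot label. INTENT_DIC is assumed to live
# elsewhere in the real module and is defined here only so the sketch runs;
# all data below is hypothetical (reusing the hypothetical vocs from the
# earlier sketch).
INTENT_DIC = {'book_flight': 1}
sample_train = ('s001\tx\ty\tbook_flight\n'
                'word1\tpos1\tB-DEST\n'
                'word2\tpos2\tO\n')
with open('train_session.txt', 'w', encoding='utf-8') as fw:
    fw.write(sample_train)
idx = load_session_data('train_session.txt', ['word', 'pos'],
                        [word_voc, pos_voc, {'B-DEST': 1, 'O': 2}],
                        max_len=50)
feat_idx, intent_id, slot_ids = idx['s001']
print(intent_id)     # 1
print(slot_ids[:3])  # [1 2 0]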