def __init__(self):
    self.history_savedir = None
    self.detail_savedir = None
    self.logger = None
    self.user = self.create_user()
    self.rule_policy = RulePolicy()
    self.dst = DialogStateTracker(UserPersonal, FLAGS.print, self.logger)
    self.data_manager = DataManager(os.path.join(BASE_DIR, 'data/tmp'))
    self.nlu_manager = NLUManager(NLU_save_path_dict)
    # self.nlg_template = NLG_template
    self.turn_num = 1
    self.dialog_history = []
def feature_processing(self, filename, trip=1, workers=8):
    """Read the data records and create data features."""
    accelerometer_data = dm.trip_data_to_df(filename, trip=trip)
    segments = self.split_segments(accelerometer_data)

    # Extract features for each segment chunk in parallel.
    dim = ceil(len(segments) / workers)
    chunks = (segments[k:k + dim] for k in range(0, len(segments), dim))
    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(self.features, chunk) for chunk in chunks]
        segment_features = [future.result() for future in futures]
    segments_df = pd.concat(segment_features, ignore_index=True)
    segments_df = segments_df.sort_values('Time')

    # Average features to 1 minute intervals.
    segments = self.split_segments(
        segments_df, time_intervals=self.AVG_FEATURES_INTERVALS)
    dim = ceil(len(segments) / workers)
    chunks = (segments[k:k + dim] for k in range(0, len(segments), dim))
    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(self.average_features, chunk) for chunk in chunks
        ]
        average_features = [future.result() for future in futures]
    avg_features_df = pd.concat(average_features, ignore_index=True)
    avg_features_df = avg_features_df.sort_values('Time')
    return avg_features_df
class OnlineStoreApp(App):
    dataManager = DataManager()
    ordersList = ListProperty([])

    def app_func(self):
        '''Wrapper function for the async processes.'''
        async def run_wrapper():
            # Run the Kivy UI; async_run is a coroutine and must be awaited.
            await self.async_run()
            exit(0)

        return asyncio.gather(run_wrapper())

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self):
        return OrderScreen()
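A minimal entry-point sketch for driving app_func() with the standard asyncio event loop; this `__main__` block is an assumption for illustration and is not part of the original code:

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    # app_func() returns an asyncio.gather() future wrapping the Kivy UI coroutine.
    loop.run_until_complete(OnlineStoreApp().app_func())
    loop.close()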
def get_joined_trips_data(self, trips):
    joineds = []
    for trip in trips:
        print(f"processing trip {trip}")
        files = ['Pixel_accelerometer', f'Pixel_gyro_{trip}']
        results = []
        for file in files:
            result = self.feature_processing(file, trip=trip)
            file = self.clean_file_name(file)
            result.rename(columns={
                'msm': f'msm_{file}',
                'variance': f'variance_{file}'
            }, inplace=True)
            results.append(result)
        # results = []
        # for future in futures:
        #     results.append(future.result())
        pre_result = pd.concat(results, axis=1)
        result = self.remove_dup_columns(pre_result)
        # print(result)

        data = dm.trip_data_to_df("Pixel_activity", trip=trip)
        # Average activity classifications to 1 minute intervals.
        data_segments = self.split_segments(
            data, time_intervals=self.AVG_FEATURES_INTERVALS)
        avg_classification_df = self.average_classification(data_segments)

        joined_df = self.join_sensors_classification_data(
            result, avg_classification_df)
        joineds.append(joined_df)
    return joineds
def setUp(self):
    removeLeftOverDbFiles()
    DataManagerUnitTest.dm = DataManager(
        configObj=DataManagerUnitTest.testConfig)
    if 'chatting' in SysAct.keys():
        strategy = random.choice([0, 1, 2])
        if strategy == 0:
            nl += '您好,我是中国移动业务咨询机器人,可以帮助您进行套餐查找、业务咨询、个人信息查询等,请问您需要什么帮助吗?'
        elif strategy == 1:
            nl += '抱歉,您说的这些我不太理解,但我可以在套餐查找、业务咨询、个人信息查询等方面为您提供帮助的~'
        else:
            nl += '请不要再调戏我了,问一些套餐查找、业务咨询、个人信息查询方面的问题吧'
    # if 'repeat' in SysAct.keys():
    #     nl += "抱歉,能再重述一遍么?"
    return nl


if __name__ == '__main__':
    data_manager = DataManager('../data/tmp')
    offer_entity = data_manager.SearchingByConstraints('套餐', {"功能费": [700, 900]})[0]
    compared_entities = data_manager.SearchingByConstraints('套餐', {"功能费": [700, 900]})
    domain = '套餐'
    SysAct = {
        'ex0': {
            'offer': offer_entity,
            'inform': ['产品介绍', '套餐内容_国内主叫', '套餐内容_国内流量',
                       '套餐内容_国内短信', '结转规则', '是否包含港澳台地区']
        },
        'ex1': {  # self.UsrAct == "告知"
            'offer': offer_entity,
            'inform': ["产品介绍"],
            'reqmore': None,  # None because reqmore takes no argument
            'domain': domain
        },
        'ex2': {
            'request': ["套餐内容_国内流量", "套餐内容_国内主叫"],
            'domain': domain
        },
def train_requestable(data_tmp_path):
    """Train the requestable-slot models; they must be trained and saved before use.
    Training uses early stopping.

    :param data_tmp_path: path to the data tmp folder
    """
    print('载入数据管理器...')
    data_manager = DataManager(data_tmp_path)
    print('载入训练数据...')
    informable_slot_datasets, requestable_slot_datasets = generate_dataset(
        data_manager.DialogData)
    print('载入 requestable slot detector...')
    init_learning_rate = 0.005
    graph = tf.Graph()
    with graph.as_default():
        requestable_slots_models = {}
        for k, v in All_requestable_slots_order.items():
            requestable_slots_models[k] = RequestableSlotDector(
                str(v), learning_rate=init_learning_rate)
    with tf.Session(graph=graph,
                    config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.group(tf.global_variables_initializer()))
        saver = tf.train.Saver()
        # saver.restore(sess, "./ckpt/requestable/model.ckpt")

        # Train the requestable slots.
        requestable_slots_F1s = {}
        for slot, model in requestable_slots_models.items():
            average_loss = 0
            learning_rate = init_learning_rate
            best_F1 = 0
            tolerance = 30
            tolerance_count = 0
            display_step = 10
            for step in range(5000):
                step += 1
                batch_data, batch_output = requestable_slot_datasets[
                    slot].next_batch()
                char_emb_matrix, word_emb_matrix, seqlen = data_manager.sent2num(
                    batch_data)
                _, training_loss = sess.run(
                    [model.train_op, model.final_loss],
                    feed_dict={
                        model.char_emb_matrix: char_emb_matrix,
                        model.word_emb_matrix: word_emb_matrix,
                        model.output: batch_output
                    })
                average_loss += training_loss / display_step
                if step % display_step == 0:
                    batch_data, batch_output = requestable_slot_datasets[
                        slot].get_testset()
                    char_emb_matrix, word_emb_matrix, seqlen = data_manager.sent2num(
                        batch_data)
                    pred, accu = sess.run(
                        [model.predict, model.accuracy],
                        feed_dict={
                            model.char_emb_matrix: char_emb_matrix,
                            model.word_emb_matrix: word_emb_matrix,
                            model.output: batch_output
                        })
                    F1 = get_F1score(batch_output, pred.tolist())
                    if best_F1 < F1:
                        best_F1 = F1
                        tolerance_count = 0
                        if not os.path.exists("./ckpt/requestable/"):
                            os.makedirs("./ckpt/requestable/")
                        saver.save(sess, "./ckpt/requestable/model.ckpt")
                    if tolerance_count == tolerance:
                        break
                    print("%s, step % 4d, loss %0.4f, F1 %0.4f, accu %0.4f" %
                          (slot, step, average_loss, F1, accu))
                    average_loss = 0
                    tolerance_count += 1
                    learning_rate = max(learning_rate * 0.98, 0.001)
                    sess.run(model.update_lr,
                             feed_dict={model.new_lr: learning_rate})
            print("requestable slot: %s, best F1 %0.4f" % (slot, best_F1))
            requestable_slots_F1s[slot] = best_F1
        print(requestable_slots_F1s)
        print(sum(requestable_slots_F1s.values()) /
              len(requestable_slots_F1s.values()))
def train_informable(data_tmp_path):
    """Train the informable-slot models; they must be trained and saved before use.
    Training uses early stopping.

    :param data_tmp_path: path to the data tmp folder
    """
    print('载入数据管理器...')
    data_manager = DataManager(data_tmp_path)
    print('载入训练数据...')
    informable_slot_datasets, requestable_slot_datasets = generate_dataset(
        data_manager.DialogData)
    print('载入 informable slot detector ...')
    init_learning_rate = 0.005
    informable_batch_ratios = {  # minibatch ratios for the different slots
        "通话时长": [2, 8, 8, 8],
        "流量": [4, 8, 8, 8],
        "功能费": [4, 8, 8, 8]
    }
    graph = tf.Graph()
    with graph.as_default():
        informable_slots_models = {
            "功能费": InformableSlotDector('cost', learning_rate=init_learning_rate),
            "流量": InformableSlotDector('data', learning_rate=init_learning_rate),
            "通话时长": InformableSlotDector('time', learning_rate=init_learning_rate),
        }
    with tf.Session(graph=graph,
                    config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.group(tf.global_variables_initializer()))
        saver = tf.train.Saver()
        # saver.restore(sess, "./ckpt/informable/model.ckpt")

        # Train the informable slots.
        informable_slots_accus = []
        for slot, model in informable_slots_models.items():
            learning_rate = init_learning_rate
            average_loss = 0
            best_accu = 0
            tolerance = 20
            tolerance_count = 0
            display_step = 10
            for step in range(5000):
                step += 1
                batch_data, batch_output = informable_slot_datasets[
                    slot].next_batch(informable_batch_ratios[slot])
                char_emb_matrix, word_emb_matrix, seqlen = data_manager.sent2num(
                    batch_data)
                _, training_loss = sess.run(
                    [model.train_op, model.final_loss],
                    feed_dict={
                        model.char_emb_matrix: char_emb_matrix,
                        model.word_emb_matrix: word_emb_matrix,
                        model.output: batch_output
                    })
                average_loss += training_loss / display_step
                if step % display_step == 0:
                    batch_data, batch_output = informable_slot_datasets[
                        slot].get_testset()
                    char_emb_matrix, word_emb_matrix, seqlen = data_manager.sent2num(
                        batch_data)
                    pred, accu = sess.run(
                        [model.predict, model.accuracy],
                        feed_dict={
                            model.char_emb_matrix: char_emb_matrix,
                            model.word_emb_matrix: word_emb_matrix,
                            model.output: batch_output
                        })
                    if best_accu < accu:
                        best_accu = accu
                        tolerance_count = 0
                        if not os.path.exists("./ckpt/informable/"):
                            os.makedirs("./ckpt/informable/")
                        saver.save(sess, "./ckpt/informable/model.ckpt")
                    if tolerance_count == tolerance:
                        break
                    print("%s, step % 4d, loss %0.4f, accu %0.4f" %
                          (slot, step, average_loss, accu))
                    average_loss = 0
                    tolerance_count += 1
                    learning_rate = max(learning_rate * 0.95, 0.0001)
                    sess.run(model.update_lr,
                             feed_dict={model.new_lr: learning_rate})
            print("informable slot: %s, best accu %0.4f" % (slot, best_accu))
            informable_slots_accus.append(best_accu)
        batch_output = []
        domain_id = domains.index(domain)
        batch_output += [domain_id] * len(batch_data)
        return batch_data, batch_output, end_flag


def generate_domain_dataset(DialogData, domain_data_ids):
    dialog_data = copy.deepcopy(DialogData)
    domain_datas = {}
    for domain, data_ids in domain_data_ids.items():
        domain_datas[domain] = []
        for data_id in data_ids:
            domain_datas[domain] += dialog_data[data_id]["用户回复示例"]
    domain_dataset = DomainDataset(domain_datas['个人'], domain_datas['套餐'],
                                   domain_datas['流量'], domain_datas['WLAN'],
                                   domain_datas['号卡'], domain_datas['国际港澳台'],
                                   domain_datas['家庭多终端'])
    return domain_dataset


if __name__ == '__main__':
    data_manager = DataManager('../../../data/tmp')
    domain_dataset = generate_domain_dataset(data_manager.DialogData,
                                             domain_data_ids)
    binput, boutput = domain_dataset.next_batch()
    pprint.pprint(domain_dataset.next_batch())
    binput, boutput = domain_dataset.next_batch()
    pprint.pprint(domain_dataset.next_batch())
from datetime import datetime

from data.DataManager import DataManager
from structures.NeuralNetwork import NeuralNetwork
from structures.functions import sigmoid, d_sigmoid, same, d_same
from structures.layers.DenseLayer import DenseLayer


def classify(arr):
    # Return the index of the largest activation.
    a, b = max((x, y) for y, x in enumerate(arr))
    return b


if __name__ == '__main__':
    dm = DataManager('mnist.pkl.gz')

    nn = NeuralNetwork()
    nn.add_layer(DenseLayer(784, 400, sigmoid, d_sigmoid))
    nn.add_layer(DenseLayer(400, 150, sigmoid, d_sigmoid))
    nn.add_layer(DenseLayer(150, 50, sigmoid, d_sigmoid))
    nn.add_layer(DenseLayer(50, 10, same, d_same))

    curtime = datetime.now()
    nn.train(dm.train_set, epochs=30, epoch_range=1)
    t = datetime.now() - curtime
    print('{} seconds for training'.format(t.seconds))
    print('correct for {}% on train set'.format(nn.calculate_correct(dm.test_set)))

    results = []
    for x, y in dm.test_set[:75]:
        res = nn.feed_forward(x)
        res = dm.classify(res)
        self.batch_id[user_act] += test_batchsize
        batch_output = []
        user_act_id = user_acts.index(user_act)
        batch_output += [user_act_id] * len(batch_data)
        return batch_data, batch_output, end_flag


def generate_user_act_dataset(DialogData, user_act_data_ids):
    dialog_data = copy.deepcopy(DialogData)
    user_act_datas = {}
    for user_act, data_ids in user_act_data_ids.items():
        if user_act in ['问询费用选项', '问询通话时长选项', '问询流量选项']:
            user_act = '问询说明'
        if user_act not in user_act_datas.keys():
            user_act_datas[user_act] = []
        for data_id in data_ids:
            user_act_datas[user_act] += dialog_data[data_id]["用户回复示例"]
    user_act_dataset = UserActDataset(user_act_datas)
    return user_act_dataset


if __name__ == '__main__':
    data_manager = DataManager(os.path.join(BASE_DIR, '../../../data/tmp'))
    user_act_dataset = generate_user_act_dataset(data_manager.DialogData,
                                                 user_act_data_ids)
    for user_act, data in user_act_dataset.data.items():
        print(user_act)
        print(len(data))
class PassageFetcher(object):
    DATA_MANAGER = DataManager()
    LINE_CACHE_SIZE = 4
    LOOK_FORWARD_SIZE = 4
    TAG_LIMIT = 4
    NEGATORY_QUOTE = "'Remember not only to say the right thing in the right place, but far more difficult still, to leave unsaid the wrong thing at the tempting moment.' - Benjamin Franklin"

    @staticmethod
    def FetchAndCompilePassages(tags):
        print(tags)
        tags = [str(tag) for tag in tags]
        # Validate tags and cap them at TAG_LIMIT.
        valid_tags = PassageFetcher.ValidateTags(tags)
        print(valid_tags)
        feed = (valid_tags if len(valid_tags) < PassageFetcher.TAG_LIMIT
                else valid_tags[0:PassageFetcher.TAG_LIMIT])
        authors = []
        # Get passages
        result = "\n\n".join(PassageFetcher.FetchPassages(feed, authors))
        if result is None or result.isspace():
            return PassageFetcher.NEGATORY_QUOTE
        else:
            return tuple([result, feed, authors])

    @staticmethod
    def FetchPassages(tags, authors):
        return [PassageFetcher.PassagePipe(tag, authors) for tag in tags]

    @staticmethod
    def PassagePipe(tag, authors):
        locations = PassageFetcher.GetLocations(tag)
        if locations is None:
            return ""
        coordinates = PassageFetcher.GetRandomLocation(locations)
        try:
            authors.append(Novels[coordinates[0]])
        except Exception as e:
            print(e)
        raw_passage = PassageFetcher.ExtractPassage(coordinates)
        treated_passage = PassageFetcher.TreatPassage(raw_passage)
        return treated_passage

    @staticmethod
    def ValidateTags(tags):
        ### ADD THE SORT HERE ###
        valid_tags = []
        for tag in tags:
            if tag.lower() in PassageFetcher.DATA_MANAGER.DATA:
                valid_tags.append(tag)
            else:
                print(tag + " was not found.")
        return valid_tags

    @staticmethod
    def GetLocations(tag):
        try:
            locations = PassageFetcher.DATA_MANAGER.DATA[tag.lower()]
        except KeyError as e:
            print("Error:", e)
            return None
        return locations

    @staticmethod
    def GetRandomLocation(locations):
        return random.choice(locations)

    @staticmethod
    def ExtractPassage(coordinates):
        line_index = 0
        word_index = 0
        # Open up the book
        with open("./data/books/" + coordinates[0]) as file:
            line_cache = []
            found_flag = False
            local_timer = PassageFetcher.LOOK_FORWARD_SIZE
            # Find line
            for line in file:
                words = line.split()
                # Find word
                for word in words:
                    # If the word is found, compile the passage response
                    if word_index == coordinates[2]:
                        found_flag = True
                    word_index += 1
                line_index += 1
                if line != '\r\n' and line != '\n' and line != '\r':
                    line_cache.append(line)
                if len(line_cache) > PassageFetcher.LINE_CACHE_SIZE and not found_flag:
                    line_cache.pop(0)
                if found_flag:
                    local_timer -= 1
                    if local_timer <= 0:
                        break
        return "".join(line_cache)

    @staticmethod
    def TreatPassage(raw_passage):
        pattern = re.compile(r'([A-Z][^\.!?]*[\.!?])', re.M)
        sentences = pattern.findall(raw_passage)
        # Drop the first and last (likely partial) sentences when enough remain.
        if len(sentences) > 3:
            sentences.pop(0)
            sentences.pop(len(sentences) - 1)
        return " ".join(sentences)
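A hypothetical invocation sketch (the tag values are illustrative and not taken from the original): FetchAndCompilePassages returns either the fallback NEGATORY_QUOTE string or a (passages, used_tags, authors) tuple, so a caller needs to handle both shapes.

result = PassageFetcher.FetchAndCompilePassages(['courage', 'honour'])
if isinstance(result, tuple):
    passages, used_tags, authors = result
    print(passages)
else:
    print(result)  # fallback quote when no passage could be built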