def dump_latent(model, data_feed, config, dest_f, num_batch=1):
    """Run the model with teacher forcing and pickle its latent variables.

    Iterates over ``data_feed`` batches, collects the model's latent outputs
    (``log_qy``, decoder init state, ``y_ids``) together with the de-tokenized
    target strings and per-example metadata, then writes a single dict
    ``{'z': ..., 'labels': ..., 'metas': ...}`` to ``dest_f`` via pickle.

    :param model: model exposing ``eval()`` and ``__call__(batch, mode=TEACH_FORCE,
        return_latent=True)`` returning an object with ``log_qy``, ``y_ids``,
        ``dec_init_state`` attributes.
    :param data_feed: data source with ``epoch_init()`` / ``next_batch()`` / ``ptr``.
    :param config: run configuration forwarded to ``epoch_init``.
    :param dest_f: writable binary file object for the pickle output.
    :param num_batch: maximum number of batches to dump; ``None`` means all.
    """
    model.eval()
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, verbose=False, shuffle=False)
    logger.info("Dumping: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))
    all_zs = []
    all_labels = []
    all_metas = []
    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None and data_feed.ptr > num_batch):
            break
        results = model(batch, mode=TEACH_FORCE, return_latent=True)
        labels = batch.outputs
        metas = batch.metas
        log_qy = results.log_qy.cpu().squeeze(0).data
        y_ids = results.y_ids.cpu().data
        dec_init = results.dec_init_state.cpu().squeeze().data
        # Convert to numpy once per batch; the original repeated these
        # identical tensor->numpy conversions for every example in the batch.
        log_qy_np = log_qy.numpy()
        dec_init_np = dec_init.numpy()
        y_ids_np = y_ids.numpy()
        for b_id in range(labels.shape[0]):
            true_str, _ = engine.get_sent(model, de_tknize, labels, b_id)
            all_labels.append(true_str)
            all_metas.append(metas[b_id])
            # NOTE(review): the *whole-batch* arrays are appended once per
            # example, so every entry duplicates the full batch. Per-example
            # slices (e.g. log_qy_np[b_id]) may have been intended — confirm
            # against whatever consumes this pickle before changing.
            all_zs.append((log_qy_np, dec_init_np, y_ids_np))
    pickle.dump({'z': all_zs, 'labels': all_labels, "metas": all_metas}, dest_f)
    logger.info("Dumping Done")
def sweep(model, data_feed, config, num_batch=1, dest_f=None):
    """Decode with ``model.sweep`` and log predictions per latent code.

    For each batch, prints the first target ("Start"), one prediction per
    distinct latent code (deduplicated against the previous code), and the
    last target ("End"). Output goes to ``dest_f`` if given, else the logger.

    :param model: model exposing ``eval()`` and ``sweep(batch, gen_type=...)``.
    :param data_feed: data source with ``epoch_init()`` / ``next_batch()`` / ``ptr``.
    :param config: run configuration; ``batch_size`` and ``gen_type`` are read.
    :param num_batch: max batches to process; ``None`` means the whole feed.
    :param dest_f: optional writable text file; ``None`` logs instead.
    """
    model.eval()
    old_batch_size = config.batch_size
    # Use a small fixed batch size for the sweep pass; epoch_init reads it
    # from config, so it is restored immediately afterwards.
    if num_batch is not None:  # fixed: was `num_batch != None`
        config.batch_size = 10
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    config.batch_size = old_batch_size
    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))

    def write(msg):
        # Route output to the destination file when given, else the logger.
        if dest_f is None:
            logger.info(msg)
        else:
            dest_f.write(msg + '\n')

    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None and data_feed.ptr > num_batch):
            break
        outputs, labels, all_y_ids = model.sweep(batch, gen_type=config.gen_type)
        # move from GPU to CPU
        true_labels = labels.cpu().data.numpy()
        all_y_ids = all_y_ids.cpu().data.numpy()
        pred_labels = [
            t.cpu().data.numpy() for t in outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        # (time, batch, 1) -> (batch, time)
        pred_labels = np.array(pred_labels, dtype=int).squeeze(-1).swapaxes(0, 1)
        # get attention if possible
        pred_attns = None
        true_str, _ = engine.get_sent(model, de_tknize, true_labels, 0)
        write("Start: {}".format(true_str))
        prev_code = None
        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = engine.get_sent(model, de_tknize, pred_labels,
                                             b_id, attn=pred_attns)
            code = '-'.join(map(str, all_y_ids[b_id]))
            # Only print when the latent code changes between rows.
            if prev_code != code:
                write("Predict ({}): {}".format(code[:10], pred_str))
                prev_code = code
        true_str, _ = engine.get_sent(model, de_tknize, true_labels,
                                      true_labels.shape[0] - 1)
        write("End: {}\n".format(true_str))
    logger.info("Generation Done")
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    """Generate responses, log source/target/prediction, and report metrics.

    Runs the model in GEN mode batch by batch, feeds each (target, prediction)
    pair to ``evaluator``, and writes human-readable triples to ``dest_f`` or
    the logger. A final evaluator report is emitted at the end.

    :param model: model exposing ``eval()`` and ``__call__(batch, mode=GEN, ...)``.
    :param data_feed: data source with ``epoch_init()`` / ``next_batch()`` / ``ptr``.
    :param config: run configuration; ``batch_size``, ``gen_type``, ``use_attn`` read.
    :param evaluator: collects examples via ``add_example`` and produces ``get_report``.
    :param num_batch: max batches to process; ``None`` means the whole feed.
    :param dest_f: optional writable text file; ``None`` logs instead.
    """
    model.eval()
    old_batch_size = config.batch_size
    # Shrink the batch size for a quick sample; restored right after epoch_init.
    if num_batch is not None:  # fixed: was `num_batch != None`
        config.batch_size = 5
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    config.batch_size = old_batch_size
    evaluator.initialize()
    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))
    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None and data_feed.ptr > num_batch):
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)
        # move from GPU to CPU
        pred_labels = [t.cpu().data.numpy()
                       for t in outputs[DecoderRNN.KEY_SEQUENCE]]
        # (time, batch, 1) -> (batch, time)
        pred_labels = np.array(pred_labels, dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.cpu().data.numpy()
        # get attention if possible
        if config.use_attn:
            pred_attns = [t.cpu().data.numpy()
                          for t in outputs[DecoderRNN.KEY_ATTN_SCORE]]
            pred_attns = np.array(pred_attns, dtype=float).squeeze(2).swapaxes(0, 1)
        else:
            pred_attns = None
        ctx = batch.get('contexts')
        ctx_size = ctx.shape[1]
        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = engine.get_sent(model, de_tknize, pred_labels,
                                             b_id, attn=pred_attns)
            # Join all non-empty context turns into one source string.
            ctx_str = []
            for i in range(ctx_size):
                temp, _ = engine.get_sent(model, de_tknize, ctx[:, i, 1:], b_id)
                if temp:
                    ctx_str.append(temp)
            ctx_str = '<t>'.join(ctx_str)
            true_str, _ = engine.get_sent(model, de_tknize, true_labels, b_id)
            evaluator.add_example(true_str, pred_str)
            if dest_f is None:
                logger.info("Source: {}".format(ctx_str))
                logger.info("Target: {}".format(true_str))
                logger.info("Predict: {}\n".format(pred_str))
            else:
                dest_f.write("Source: {}\n".format(ctx_str))
                dest_f.write("Target: {}\n".format(true_str))
                dest_f.write("Predict: {}\n\n".format(pred_str))
    if dest_f is None:
        # fixed: was the root `logging.info`; use the module logger like the
        # rest of this file so records carry the right logger name/handlers.
        logger.info(evaluator.get_report(include_error=dest_f is not None))
    else:
        dest_f.write(evaluator.get_report(include_error=dest_f is not None))
    logger.info("Generation Done")
def selective_generate(model, data_feed, config, selected_clusters):
    """Generate responses, keeping only examples whose latent code is selected.

    Decodes the whole feed and returns a list of dicts
    ``{'context', 'target', 'predict', 'code'}`` for examples whose
    '-'-joined latent code appears in ``selected_clusters``.

    :param model: model exposing ``eval()`` and ``__call__(batch, mode=GEN, ...)``.
    :param data_feed: data source with ``epoch_init()`` / ``next_batch()``.
    :param config: run configuration; ``gen_type`` and ``y_size`` are read.
    :param selected_clusters: iterable of dicts each carrying a 'code' key.
    :return: list of kept example dicts.
    """
    model.eval()
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    # get all code
    codes = set([d['code'] for d in selected_clusters])
    logger.info("Generation: {} batches".format(data_feed.num_batch))
    data = []
    total_cnt = 0.0
    in_cnt = 0.0
    while True:
        batch = data_feed.next_batch()
        if batch is None:
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)
        # move from GPU to CPU
        pred_labels = [
            t.cpu().data.numpy() for t in outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        # (time, batch, 1) -> (batch, time)
        pred_labels = np.array(pred_labels, dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.cpu().data.numpy()
        y_ids = outputs[DecoderRNN.KEY_LATENT].cpu().data.numpy()
        y_ids = y_ids.reshape(-1, config.y_size)
        ctx = batch.get('contexts')
        ctx_size = ctx.shape[1]
        for b_id in range(pred_labels.shape[0]):
            code = '-'.join(map(str, y_ids[b_id]))
            total_cnt += 1
            if code in codes:
                pred_str, attn = engine.get_sent(model, de_tknize, pred_labels,
                                                 b_id, attn=None)
                ctx_str = []
                for i in range(ctx_size):
                    temp, _ = engine.get_sent(model, de_tknize, ctx[:, i, 1:], b_id)
                    ctx_str.append(temp)
                ctx_str = '<t>'.join(ctx_str)
                true_str, _ = engine.get_sent(model, de_tknize, true_labels, b_id)
                in_cnt += 1
                data.append({
                    'context': ctx_str,
                    'target': true_str,
                    'predict': pred_str,
                    'code': code
                })
    # fixed: guard against ZeroDivisionError when the feed yields no batches.
    if total_cnt > 0:
        logger.info("In rate {}".format(in_cnt / total_cnt))
    else:
        logger.info("In rate undefined: no examples seen")
    return data
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    """Generate responses and write target/prediction pairs as bytes.

    Like the other ``generate`` variants, but ``dest_f`` is expected to be a
    binary file handle (all writes are ``.encode()``-ed).

    :param model: model exposing ``eval()`` and ``__call__(batch, mode=GEN, ...)``.
    :param data_feed: data source with ``epoch_init()`` / ``next_batch()`` / ``ptr``.
    :param config: run configuration; ``batch_size`` and ``gen_type`` are read.
    :param evaluator: collects examples via ``add_example`` / ``get_report``.
    :param num_batch: max batches to process; ``None`` means the whole feed.
    :param dest_f: optional writable *binary* file; ``None`` logs instead.
    """
    model.eval()
    old_batch_size = config.batch_size
    # Shrink the batch size for a quick sample; restored right after epoch_init.
    if num_batch is not None:  # fixed: was `num_batch != None`
        config.batch_size = 3
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    config.batch_size = old_batch_size
    evaluator.initialize()
    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))
    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None and data_feed.ptr > num_batch):
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)
        # move from GPU to CPU
        labels = labels.cpu()
        pred_labels = [
            t.cpu().data.numpy() for t in outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        # (time, batch, 1) -> (batch, time)
        pred_labels = np.array(pred_labels, dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.data.numpy()
        # get attention if possible
        pred_attns = None
        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = engine.get_sent(model, de_tknize, pred_labels,
                                             b_id, attn=pred_attns)
            true_str, _ = engine.get_sent(model, de_tknize, true_labels, b_id)
            evaluator.add_example(true_str, pred_str)
            if dest_f is None:
                logger.info("Target: {}".format(true_str))
                logger.info("Predict: {}\n".format(pred_str))
            else:
                # dest_f is opened in binary mode by the caller.
                dest_f.write("Target: {}\n".format(true_str).encode())
                dest_f.write("Predict: {}\n\n".format(pred_str).encode())
    if dest_f is None:
        # fixed: was the root `logging.info`; use the module logger for
        # consistency with the rest of this file.
        logger.info(evaluator.get_report(include_error=dest_f is not None))
    else:
        dest_f.write(
            evaluator.get_report(include_error=dest_f is not None).encode())
    logger.info("Generation Done")
def gen_with_vae(model, data_feed, config, num_batch=1, dest_f=None):
    """Generate ``sample_n`` responses per input with both sampling strategies.

    For each example, writes the source context, the target, ``sample_n``
    greedy decodes ("Sample Z") and ``sample_n`` sampled decodes ("Sample W")
    to ``dest_f`` or the logger.

    :param model: model exposing ``eval()`` and
        ``__call__(batch, mode=GEN, gen_type=..., sample_n=...)``.
    :param data_feed: data source with ``epoch_init()`` / ``next_batch()`` / ``ptr``.
    :param config: run configuration; ``batch_size`` is read and temporarily
        overridden (also used for indexing below, hence restored only at the end).
    :param num_batch: max batches to process; ``None`` means the whole feed.
    :param dest_f: optional writable text file; ``None`` logs instead.
    """
    model.eval()
    old_batch_size = config.batch_size
    if num_batch is not None:  # fixed: was `num_batch != None`
        config.batch_size = 3
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))
    print_cnt = 0
    sample_n = 5

    def write(msg):
        # Route output to the destination file when given, else the logger.
        if dest_f is None:
            logger.info(msg)
        else:
            dest_f.write(msg + '\n')

    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None and data_feed.ptr > num_batch):
            break
        ctx = batch.get('contexts')
        ctx_size = ctx.shape[1]
        sample_outputs, _ = model(batch, mode=GEN, gen_type="sample",
                                  sample_n=sample_n)
        greedy_outputs, labels = model(batch, mode=GEN, gen_type="greedy",
                                       sample_n=sample_n)
        # move from GPU to CPU
        labels = labels.cpu()
        sample_labels = [
            t.cpu().data.numpy() for t in sample_outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        greedy_labels = [
            t.cpu().data.numpy() for t in greedy_outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        # (time, batch*sample_n, 1) -> (batch*sample_n, time)
        sample_labels = np.array(sample_labels, dtype=int).squeeze(-1).swapaxes(0, 1)
        greedy_labels = np.array(greedy_labels, dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.data.numpy()
        for b_id in range(true_labels.shape[0]):
            # Join all non-empty context turns into one source string.
            ctx_str = []
            for i in range(ctx_size):
                temp, _ = engine.get_sent(model, de_tknize, ctx[:, i, :], b_id)
                if temp:
                    ctx_str.append(temp)
            ctx_str = '<t>'.join(ctx_str)
            true_str, _ = engine.get_sent(model, de_tknize, true_labels, b_id)
            print_cnt += 1
            write("Source: {}".format(ctx_str))
            write("Target: {}".format(true_str))
            # Samples for example b_id are laid out strided by batch_size,
            # so config.batch_size must still hold the overridden value here.
            for n_id in range(sample_n):
                pred_str, attn = engine.get_sent(
                    model, de_tknize, greedy_labels,
                    b_id + config.batch_size * n_id)
                write("Sample Z: {}".format(pred_str))
            for n_id in range(sample_n):
                pred_str, attn = engine.get_sent(
                    model, de_tknize, sample_labels,
                    b_id + config.batch_size * n_id)
                write("Sample W: {}".format(pred_str))
            write('\n')
    config.batch_size = old_batch_size
    logger.info("Generation Done\n")
def find_mi(model, data_feed, config):
    """Measure how well the latent codes predict each metadata field.

    Runs the whole feed with teacher forcing, collects each example's latent
    assignment (``y_ids``) and metadata, then logs a homogeneity score
    between every metadata field and (a) the full code and (b) each latent
    dimension individually.
    """
    model.eval()
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, verbose=False, shuffle=False)
    logger.info("Find MI for: {} batches".format(data_feed.num_batch))
    all_codes = []
    all_metas = []
    meta_keys = set()

    def write(msg):
        # Indirection kept so output routing can be changed in one place.
        logger.info(msg)

    def code2id(code, base):
        # Interpret `code` as base-`base` digits (least-significant first)
        # and fold them into a single integer index.
        idx = 0
        for c_id, c in enumerate(code):
            idx += int(c) * np.power(base, c_id)
        return idx

    while True:
        batch = data_feed.next_batch()
        if batch is None:
            break
        results = model(batch, mode=TEACH_FORCE, return_latent=True)
        labels = batch.outputs
        metas = batch.metas
        # NOTE(review): only the first example's meta dict is consulted for
        # keys — assumes every example shares the same keys; confirm.
        for key in metas[0].keys():
            meta_keys.add(key)
        log_qy = results.log_qy.view(-1, config.y_size, config.k)
        qy = torch.exp(log_qy)
        qy = qy.cpu().data.numpy()
        y_ids = results.y_ids.cpu().data.numpy()
        for b_id in range(labels.shape[0]):
            true_str, _ = engine.get_sent(model, de_tknize, labels, b_id)
            # Argmax over k for each latent dimension, as a string digit.
            # `code` is computed but no longer used (see commented append).
            code = []
            for y_id in range(config.y_size):
                for k_id in range(config.k):
                    if qy[b_id, y_id, k_id] == np.max(qy[b_id, y_id]):
                        code.append(str(k_id))
                        break
            #all_codes.append(code)
            all_codes.append(y_ids[b_id])
            all_metas.append(metas[b_id])
    # NOTE(review): transpose(0, 1) keeps the axes in their original order,
    # so this is a no-op on a 2-D array — possibly a leftover; confirm the
    # intended orientation (rows = examples, cols = latent dims).
    vec_codes = np.array(all_codes).transpose(0, 1)
    vec_idxes = [code2id(c, config.k) for c in vec_codes]
    # Re-map raw code ids onto a dense 0..V-1 label space.
    vec_vocabs = list(set(vec_idxes))
    vec_idxes = [vec_vocabs.index(v) for v in vec_idxes]
    for key in meta_keys:
        # get all meta about this key
        meta_vals = []
        for m in all_metas:
            if type(m[key]) is list:
                meta_vals.append(" ".join(map(str, m[key])))
            elif type(m[key]) is dict:
                # NOTE(review): a dict value aborts collection mid-list,
                # which can leave meta_vals shorter than vec_idxes and make
                # homogeneity_score fail on a length mismatch — confirm that
                # dict-valued keys only ever occur for the first example.
                break
            else:
                meta_vals.append(m[key])
        if not meta_vals:
            continue
        # Re-map raw meta values onto dense integer labels.
        meta_vocab = list(set(meta_vals))
        meta_vals = [meta_vocab.index(v) for v in meta_vals]
        mi = metrics.homogeneity_score(meta_vals, vec_idxes)
        write("{} mi with ID is {}".format(key, mi))
        # individual dimension
        for y_id in range(config.y_size):
            mi = metrics.homogeneity_score(meta_vals, vec_codes[:, y_id])
            write("{} mi with dim {} is {}".format(key, y_id, mi))
def latent_cluster(model, data_feed, config, cluster_name_id=None,
                   action_count=0, num_batch=1, max_samples=5):
    """Group decoded examples by their discrete latent code and sample them.

    Runs the feed with teacher forcing, assigns every example the argmax code
    over its ``y_size`` latent dimensions, and collects per-code clusters.
    Each non-empty cluster is assigned a stable action id in
    ``cluster_name_id`` and a few example utterances are logged/sampled.

    :param model: model exposing ``eval()`` and
        ``__call__(batch, mode=TEACH_FORCE, return_latent=True)``.
    :param data_feed: data source with ``epoch_init()`` / ``next_batch()`` / ``ptr``.
    :param config: run configuration; ``k`` and ``y_size`` are read.
    :param cluster_name_id: optional existing code -> action-id mapping to
        extend; a fresh ``defaultdict(int)`` is created when ``None``.
    :param action_count: next action id to hand out.
    :param num_batch: max batches to process; ``None`` means the whole feed.
    :param max_samples: number of example utterances sampled per cluster.
    :return: ``(selected_clusters, index_cluster_id, cluster_name_id,
        action_count)``, or ``None`` when the code space is too large.
    """
    # The number of possible codes is k**y_size; bail out when enumerating
    # clusters would be meaningless.
    if np.power(config.k, config.y_size) > 2000:
        logger.info("Skip latent cluster too many states")
        return
    model.eval()
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, verbose=False, shuffle=False)
    logger.info("Find cluster for: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))
    all_clusters = defaultdict(list)
    cond_y_matrix = np.zeros((config.k, config.k))
    index_cluster_id = defaultdict(list)

    def write(msg):
        logger.info(msg)

    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None and data_feed.ptr > num_batch):
            break
        results = model(batch, mode=TEACH_FORCE, return_latent=True)
        labels = batch.outputs
        metas = batch.metas
        raw_index = batch.index
        log_qy = results.log_qy.view(-1, config.y_size, config.k)
        qy = torch.exp(log_qy)
        qy = qy.cpu().data.numpy()
        y_ids = results.y_ids.cpu().data.numpy()
        for b_id in range(labels.shape[0]):
            true_str, _ = engine.get_sent(model, de_tknize, labels, b_id)
            cond_y_matrix[y_ids[b_id]] += 1
            # Argmax over k for each latent dimension, joined as "a-b-c".
            code = []
            for y_id in range(config.y_size):
                for k_id in range(config.k):
                    if qy[b_id, y_id, k_id] == np.max(qy[b_id, y_id]):
                        code.append(str(k_id))
                        break
            code = '-'.join(code)
            index_cluster_id[str(raw_index[b_id])] = code
            all_clusters[code].append((true_str, metas[b_id]))
    # show clusters
    keys = all_clusters.keys()
    keys = sorted(keys)
    logger.info("Find {} clusters".format(len(keys)))
    selected_clusters = []
    if cluster_name_id is None:
        cluster_name_id = defaultdict(int)
    for symbol in keys:
        sents = all_clusters[symbol]
        if len(sents) < 1:
            write("Skip tiny cluster with {} utts - {}".format(len(sents), symbol))
            continue
        if symbol not in cluster_name_id:
            cluster_name_id[symbol] = action_count
            action_count += 1
        write("Symbol {}".format(symbol))
        if len(sents) < max_samples:
            print("Find small cluster with {} utts".format(len(sents)))
            # fixed: np.random.shuffle mutates in place, which raises
            # TypeError on an immutable `range` object in Python 3 —
            # materialize it as a list first.
            subset_ids = list(range(len(sents)))
            np.random.shuffle(subset_ids)
        else:
            subset_ids = np.random.choice(range(len(sents)), max_samples,
                                          replace=False)
        for s_id in subset_ids[0:5]:
            write(sents[s_id][0])
        write("")
        selected_clusters.append({'code': symbol,
                                  'meaning': '',
                                  'examples': [sents[idx][0] for idx in subset_ids]})
    logger.info("Find {} actions".format(action_count))
    # Replace each example's raw code with [code, action_id] when known.
    for sent in index_cluster_id.keys():
        cluster_name = index_cluster_id[sent]
        if cluster_name in cluster_name_id:
            index_cluster_id[sent] = [cluster_name, cluster_name_id[cluster_name]]
    return selected_clusters, index_cluster_id, cluster_name_id, action_count
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    """Generate previous/next utterances for each response and report metrics.

    The model returns a pair of decoder outputs (prev, next) per batch; both
    predictions are scored against their targets via ``evaluator`` and a
    human-readable 5-line record is written per example. ``dest_f`` is
    expected to be a binary file handle (writes are ``.encode()``-ed).

    :param model: model exposing ``eval()`` and ``__call__(batch, mode=GEN, ...)``
        returning ``((prev_outputs, next_outputs), (prev_labels, next_labels))``.
    :param data_feed: data source with ``epoch_init()`` / ``next_batch()`` / ``ptr``.
    :param config: run configuration; ``batch_size`` and ``gen_type`` are read.
    :param evaluator: collects examples via ``add_example`` / ``get_report``.
    :param num_batch: max batches to process; ``None`` means the whole feed.
    :param dest_f: optional writable *binary* file; ``None`` logs instead.
    """
    model.eval()
    old_batch_size = config.batch_size
    # Shrink the batch size for a quick sample; restored right after epoch_init.
    if num_batch is not None:  # fixed: was `num_batch != None`
        config.batch_size = 5
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    config.batch_size = old_batch_size
    evaluator.initialize()
    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))

    def write(msg):
        # dest_f is opened in binary mode by the caller, hence the encode.
        if dest_f is None:
            logger.info(msg)
        else:
            dest_f.write(str(msg + '\n').encode())

    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None and data_feed.ptr > num_batch):
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)
        prev_outputs, next_outputs = outputs
        prev_labels, next_labels = labels
        cur_labels = batch.get('outputs')
        prev_labels = prev_labels.cpu().data.numpy()
        next_labels = next_labels.cpu().data.numpy()
        prev_pred = [
            t.cpu().data.numpy() for t in prev_outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        # (time, batch, 1) -> (batch, time)
        prev_pred = np.array(prev_pred, dtype=int).squeeze(-1).swapaxes(0, 1)
        next_pred = [
            t.cpu().data.numpy() for t in next_outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        next_pred = np.array(next_pred, dtype=int).squeeze(-1).swapaxes(0, 1)
        for b_id in range(cur_labels.shape[0]):
            ctx_str, _ = engine.get_sent(model, de_tknize, cur_labels, b_id)
            prev_true_str, _ = engine.get_sent(model, de_tknize, prev_labels, b_id)
            next_true_str, _ = engine.get_sent(model, de_tknize, next_labels, b_id)
            pred_prev_str, _ = engine.get_sent(model, de_tknize, prev_pred, b_id)
            pred_next_str, _ = engine.get_sent(model, de_tknize, next_pred, b_id)
            evaluator.add_example(prev_true_str, pred_prev_str)
            evaluator.add_example(next_true_str, pred_next_str)
            write("Response: {}".format(ctx_str))
            write("Prev Target: {}".format(prev_true_str))
            write("Prev Predict: {}".format(pred_prev_str))
            write("Next Target: {}".format(next_true_str))
            write("Next Predict: {}\n".format(pred_next_str))
    if dest_f is None:
        # fixed: was the root `logging.info`; use the module logger for
        # consistency with the rest of this file.
        logger.info(evaluator.get_report(include_error=dest_f is not None))
    else:
        dest_f.write(
            evaluator.get_report(include_error=dest_f is not None).encode())
    logger.info("Generation Done")
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    """Domain-aware generation: score predictions and log compact records.

    Decodes the feed in GEN mode, feeds (target, prediction, domain) triples
    to ``evaluator``, and — for short runs — writes the last context turn plus
    a one-line "True ||| Pred" record per example. The evaluator report is
    always written at the end.
    """
    model.eval()
    de_tknize = get_dekenize()

    def write(msg):
        # Skip empty messages; route to dest_f when given, else the logger.
        if msg is None or msg == '':
            return
        if dest_f is None:
            logger.info(msg)
        else:
            dest_f.write(msg + '\n')

    # Shuffle only when sampling a limited number of batches.
    data_feed.epoch_init(config, shuffle=num_batch is not None, verbose=False)
    evaluator.initialize()
    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))
    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None and data_feed.ptr > num_batch):
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)
        # move from GPU to CPU
        labels = labels.cpu()
        pred_labels = [t.cpu().data.numpy()
                       for t in outputs[DecoderRNN.KEY_SEQUENCE]]
        # (time, batch, 1) -> (batch, time)
        pred_labels = np.array(pred_labels, dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.data.numpy()
        # get attention if possible
        if config.use_attn or config.use_ptr:
            pred_attns = [t.cpu().data.numpy()
                          for t in outputs[DecoderRNN.KEY_ATTN_SCORE]]
            pred_attns = np.array(pred_attns, dtype=float).squeeze(2).swapaxes(0, 1)
        else:
            pred_attns = None
        # get last 1 context
        ctx = batch.get('contexts')
        ctx_len = batch.get('context_lens')
        domains = batch.domains
        # logger.info the batch in String.
        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = get_sent(model, de_tknize, pred_labels, b_id,
                                      attn=pred_attns)
            true_str, _ = get_sent(model, de_tknize, true_labels, b_id)
            prev_ctx = ""
            if ctx is not None:
                # Only the most recent context turn (index ctx_len-1) is shown.
                ctx_str, _ = get_sent(model, de_tknize,
                                      ctx[:, ctx_len[b_id] - 1, :], b_id)
                prev_ctx = "Source: {}".format(ctx_str)
            domain = domains[b_id]
            evaluator.add_example(true_str, pred_str, domain)
            # Only print per-example records for short sample runs.
            if num_batch is None or num_batch <= 2:
                write(prev_ctx)
                write("{}:: True: {} ||| Pred: {}".format(
                    domain, true_str, pred_str))
                # NOTE(review): `attn` comes from get_sent; if it is ever a
                # numpy array this truthiness test would raise — confirm
                # get_sent returns None or a list/str here.
                if attn:
                    write("[[{}]]".format(attn))
    write(evaluator.get_report(include_error=dest_f is not None))
    logger.info("Generation Done")