def get_entity_dict(self, turn_corpus):
    """Collect the distinct entity values mentioned in each domain.

    Messages are first de-duplicated by utterance text (when several
    messages share the same text, the last one wins). The parameter values
    of every action of the surviving messages are then gathered per domain.

    :param turn_corpus: sized iterable of messages exposing ``utt`` (a token
        sequence whose first and last items are ignored), ``actions`` (dicts
        with ``'act'`` and ``'parameters'`` keys) and ``domain``.
    :return: dict mapping a domain to the list of its distinct entity
        values; the list order comes from a set and is therefore arbitrary.
    """
    # Key on the space-joined inner tokens so repeated utterances collapse.
    msg_by_tokens = {}
    for turn in turn_corpus:
        inner_tokens = turn.utt[1:-1]
        msg_by_tokens[" ".join(inner_tokens)] = turn

    # Re-key on the detokenized text; colliding keys collapse once more.
    detokenize = get_dekenize()
    unique_msgs = {}
    for token_str, turn in msg_by_tokens.items():
        unique_msgs[detokenize(token_str.split())] = turn
    self.logger.info("Compress utt2act from {}->{}".format(
        len(turn_corpus), len(unique_msgs)))

    # get entity value vocabulary
    values_by_domain = defaultdict(set)
    for turn in unique_msgs.values():
        for action in turn.actions:
            params = action['parameters']
            act_name = action['act']
            if act_name == 'inform':
                # 'inform' values are stored as strings.
                candidates = (str(val) for val in params[0].values())
            elif act_name == 'query':
                # 'query' values are stored as-is.
                candidates = params[0].values()
            else:
                # Other acts: parameters are unpacked as (key, value)
                # pairs and falsy values are skipped.
                candidates = (val for _, val in params if val)
            # The domain entry is created lazily, so a domain that yields
            # no value at all never shows up in the result.
            for val in candidates:
                values_by_domain[turn.domain].add(val)

    result = {}
    for domain, values in values_by_domain.items():
        result[domain] = list(values)
    return result
def dump_latent(model, data_feed, config, log_dir):
    """Run the model over ``data_feed`` and pickle its latent actions.

    The model is switched to evaluation mode and called on every batch with
    ``mode=TEACH_FORCE`` and ``return_latent=True``. The latent actions of
    each batch are collected together with one metadata record per example,
    and the result is written to ``<log_dir>/latent-<time>.p`` where
    ``<time>`` comes from ``utils.get_time()``.

    The pickled object is a dict with two keys:

    * ``'z'``: list with one latent-action array per collected row;
    * ``'metas'``: list of dicts with keys ``'utt'`` (decoded reference
      output), ``'domain'`` and ``'acts'`` (decoded output actions).

    :param model: callable model; must also provide ``eval()``.
    :param data_feed: batch provider exposing ``epoch_init``,
        ``next_batch`` (returning ``None`` when exhausted) and ``num_batch``.
    :param config: configuration object forwarded to ``epoch_init``.
    :param log_dir: directory in which the pickle file is created.
    :return: None
    """
    model.eval()
    de_tknize = utils.get_dekenize()
    # Keep the data order fixed so the dump is reproducible across runs.
    data_feed.epoch_init(config, verbose=False, shuffle=False)
    logger.info("Dumping: {} batches".format(data_feed.num_batch))

    all_zs = []
    all_metas = []
    while True:
        batch = data_feed.next_batch()
        if batch is None:
            break

        results = model(batch, mode=TEACH_FORCE, return_latent=True)
        labels = batch.outputs
        domains = batch.domains
        acts = batch.get('output_actions')

        latent_acts = results.latent_actions
        # When the model returns a tuple, only its first element is dumped.
        if isinstance(latent_acts, tuple):
            latent_acts = latent_acts[0]
        # list() splits the array along its first axis
        # (presumably the batch axis -- TODO confirm).
        latent_acts = list(latent_acts.cpu().data.numpy())

        for b_id in range(labels.shape[0]):
            true_str, _ = get_sent(model, de_tknize, labels, b_id)
            act_str, _ = get_sent(model, de_tknize, acts, b_id)
            all_metas.append({'utt': true_str,
                              'domain': domains[b_id],
                              'acts': act_str})

        all_zs.extend(latent_acts)

    dump_path = os.path.join(log_dir,
                             "latent-{}.p".format(utils.get_time()))
    # Use a context manager so the file is flushed and closed even if
    # pickling fails part-way through.
    with open(dump_path, 'wb') as dump_file:
        pickle.dump({'z': all_zs, "metas": all_metas}, dump_file)
    logger.info("Dumping Done")
def get_intent_tagger(self, corpus):
    """Train a multi-label dialog act tagger for system utterances.

    Every message contributes one training example: its detokenized
    utterance (first and last tokens dropped) and the set of ``'act'``
    values found in its actions. A unigram+bigram count representation is
    fitted on the first 70% of the corpus, a one-vs-rest linear classifier
    is evaluated on the remaining 30% (per-tag F1 is logged), and a final
    classifier is then trained on the whole corpus.

    :param corpus: sized sequence of messages exposing ``utt`` (token
        sequence) and ``actions`` (iterable of dicts with an ``'act'`` key).
    :return: dict with the trained classifier under ``self.CLF``, the
        fitted vectorizer under ``self.REPRESENTATION``, the list of tags
        (most frequent first) under ``self.ID2TAG`` and the tag -> index
        mapping under ``self.TAG2ID``. The model is returned, not persisted.
    """
    self.logger.info("Train a new intent tagger")
    all_tags, utts, tags = [], [], []
    de_tknize = get_dekenize()
    for msg in corpus:
        msg_tags = [a['act'] for a in msg.actions]
        utts.append(de_tknize(msg.utt[1:-1]))
        tags.append(msg_tags)
        all_tags.extend(msg_tags)

    most_common = Counter(all_tags).most_common()
    self.logger.info(most_common)
    # Tag ids follow descending tag frequency.
    tag_set = [t for t, _ in most_common]
    rev_tag_set = {t: i for i, t in enumerate(tag_set)}

    # create train and test set (ordered split, no shuffling):
    data_size = len(corpus)
    train_size = int(data_size * 0.7)
    train_utts = utts[0:train_size]
    test_utts = utts[train_size:]

    # create y: one indicator column per tag, one row per utterance
    sparse_y = np.zeros([data_size, len(tag_set)])
    for idx, utt_tags in enumerate(tags):
        for tag in utt_tags:
            sparse_y[idx, rev_tag_set[tag]] = 1
    train_y = sparse_y[0:train_size, :]
    test_y = sparse_y[train_size:, :]

    # train classifier
    # ngram_range is documented as a tuple; scikit-learn releases that
    # validate parameters reject a list here.
    representation = CountVectorizer(ngram_range=(1, 2)).fit(train_utts)
    train_x = representation.transform(train_utts)
    test_x = representation.transform(test_utts)
    clf = OneVsRestClassifier(
        SGDClassifier(loss='hinge', n_iter_no_change=10)).fit(train_x,
                                                              train_y)
    pred_test_y = clf.predict(test_x)

    def print_report(score_name, scores, names):
        for s, n in zip(scores, names):
            self.logger.info("%s: %s -> %f" % (score_name, n, s))

    print_report('F1',
                 metrics.f1_score(test_y, pred_test_y, average=None),
                 tag_set)

    # Final model: retrain on every utterance.
    # NOTE(review): the vectorizer is still the one fitted on the training
    # split only, so n-grams seen only in the last 30% are ignored --
    # confirm this is intended.
    x = representation.transform(utts)
    clf = OneVsRestClassifier(
        SGDClassifier(loss='hinge', n_iter_no_change=20)).fit(x, sparse_y)

    model_dump = {
        self.CLF: clf,
        self.REPRESENTATION: representation,
        self.ID2TAG: tag_set,
        self.TAG2ID: rev_tag_set,
    }
    return model_dump
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    """Decode batches with the model and score them with ``evaluator``.

    The model is put in evaluation mode and called with ``mode=GEN`` on
    successive batches. Each predicted / reference pair is decoded to text
    and handed to ``evaluator.add_example``. When at most two batches are
    requested (or all of them), the decoded examples are also written out.
    The evaluator report is written at the end.

    :param model: callable model returning ``(outputs, labels)``; must also
        provide ``eval()``.
    :param data_feed: batch provider exposing ``epoch_init``,
        ``next_batch`` (returning ``None`` when exhausted), ``ptr`` and
        ``num_batch``.
    :param config: configuration exposing ``gen_type``, ``use_attn`` and
        ``use_ptr``; also forwarded to ``epoch_init``.
    :param evaluator: object exposing ``initialize``, ``add_example`` and
        ``get_report``.
    :param num_batch: number of batches to decode; ``None`` decodes the
        whole feed without shuffling, any other value shuffles the feed.
    :param dest_f: optional writable text stream; when ``None`` the text is
        sent to the module logger instead.
    :return: None
    """
    model.eval()
    detokenize = get_dekenize()
    run_all = num_batch is None

    def write(text):
        # Nothing is emitted for missing or empty text.
        if text is None or text == '':
            return
        if dest_f is not None:
            dest_f.write(text + '\n')
        else:
            logger.info(text)

    data_feed.epoch_init(config, shuffle=not run_all, verbose=False)
    evaluator.initialize()
    planned = data_feed.num_batch if run_all else num_batch
    logger.info("Generation: {} batches".format(planned))

    while True:
        batch = data_feed.next_batch()
        if batch is None:
            break
        # Stop once the feed pointer has moved past the requested count.
        if not run_all and data_feed.ptr > num_batch:
            break

        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)

        # move from GPU to CPU
        true_labels = labels.cpu().data.numpy()

        # One entry per element of the decoder's sequence output; after
        # the swap, axis 0 is what the loop below iterates per example.
        step_tokens = []
        for step in outputs[DecoderRNN.KEY_SEQUENCE]:
            step_tokens.append(step.cpu().data.numpy())
        pred_labels = np.array(step_tokens, dtype=int)
        pred_labels = pred_labels.squeeze(-1).swapaxes(0, 1)

        # get attention if possible
        pred_attns = None
        if config.use_attn or config.use_ptr:
            step_scores = []
            for step in outputs[DecoderRNN.KEY_ATTN_SCORE]:
                step_scores.append(step.cpu().data.numpy())
            pred_attns = np.array(step_scores, dtype=float)
            pred_attns = pred_attns.squeeze(2).swapaxes(0, 1)

        # get last 1 context
        ctx = batch.get('contexts')
        ctx_len = batch.get('context_lens')
        domains = batch.domains

        attn_ctx = outputs.get(DecoderPointerGen.KEY_PTR_CTX)
        if attn_ctx is not None:
            attn_ctx = attn_ctx.cpu().data.numpy()
            # Flatten everything after the first axis.
            attn_ctx = attn_ctx.reshape(attn_ctx.shape[0], -1)

        # Decode every example of the batch to text.
        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = get_sent(model, de_tknize=detokenize,
                                      data=pred_labels, b_id=b_id,
                                      attn=pred_attns, attn_ctx=attn_ctx) \
                if False else get_sent(model, detokenize, pred_labels, b_id,
                                       attn=pred_attns, attn_ctx=attn_ctx)
            true_str, _ = get_sent(model, detokenize, true_labels, b_id)

            prev_ctx = ""
            if ctx is not None:
                # Only the last context entry of this example is shown.
                last_turn = ctx[:, ctx_len[b_id] - 1, :]
                ctx_str, _ = get_sent(model, detokenize, last_turn, b_id)
                prev_ctx = "Source: {}".format(ctx_str)

            domain = domains[b_id]
            evaluator.add_example(true_str, pred_str, domain)

            # Examples are only echoed for full runs or very short ones.
            if num_batch is None or num_batch <= 2:
                write(prev_ctx)
                write("{}:: True: {} ||| Pred: {}".format(
                    domain, true_str, pred_str))
                if attn:
                    write("[[{}]]".format(attn))

    write(evaluator.get_report(include_error=dest_f is not None))
    logger.info("Generation Done")