def get_report(self, include_error=False):
    reports = []
    tokenize = get_tokenize()
    for domain, labels in self.domain_labels.items():
        predictions = self.domain_hyps[domain]
        self.logger.info("Generate report for {} for {} samples".format(
            domain, len(predictions)))

        refs, hyps = [], []
        for label, hyp in zip(labels, predictions):
            # strip sentence-boundary markers, then skip the two leading
            # tokens (the tokenized speaker prefix)
            label = label.replace(EOS, '').replace(BOS, '')
            hyp = hyp.replace(EOS, '').replace(BOS, '')
            ref_tokens = tokenize(label)[2:]
            hyp_tokens = tokenize(hyp)[2:]
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)

        # compute corpus-level BLEU
        bleu = bleu_score.corpus_bleu(
            refs, hyps, smoothing_function=SmoothingFunction().method1)
        report = "\nDomain: %s BLEU %f\n" % (domain, bleu)
        reports.append(report)

    return "\n==== REPORT===={report}".format(
        report="========".join(reports))
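# A minimal, self-contained sketch of the corpus-level BLEU call used above,
# assuming NLTK is installed; the toy refs/hyps below are illustrative only.
# corpus_bleu expects a list of reference-lists (one list of references per
# hypothesis), and method1 smoothing avoids zero scores on short segments.
from nltk.translate import bleu_score
from nltk.translate.bleu_score import SmoothingFunction

if __name__ == '__main__':
    refs = [[['where', 'is', 'the', 'nearest', 'gas', 'station']]]
    hyps = [['the', 'nearest', 'gas', 'station', 'is', 'on', 'main', 'street']]
    toy_bleu = bleu_score.corpus_bleu(
        refs, hyps, smoothing_function=SmoothingFunction().method1)
    print('toy BLEU: %f' % toy_bleu)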
def __init__(self, config):
    self.config = config
    self._path = config.data_dir[0]
    self.max_utt_len = config.max_utt_len
    self.tokenize = get_tokenize()
    self.black_domains = config.black_domains
    self.black_ratio = config.black_ratio
    self.speaker_map = {'assistant': SYS, 'driver': USR}
    self.domain_descriptions = []

    self.train_corpus = self._read_file(
        os.path.join(self._path, 'kvret_train_public.json'))
    self.valid_corpus = self._read_file(
        os.path.join(self._path, 'kvret_dev_public.json'))
    self.test_corpus = self._read_file(
        os.path.join(self._path, 'kvret_test_public.json'))
    self.domains = set(dialog[0].domain for dialog in self.train_corpus)

    with open(os.path.join(self._path, 'kvret_entities.json'), 'rb') as f:
        self.ent_metas = json.load(f)
    # if self.config.lowercase:
    #     self.ent_metas = self._lowercase_json(self.ent_metas)

    if self.config.domain_description == 'annotated':
        self.domain_descriptions = self._read_domain_descriptions_annotated(
            self._path)
    elif self.config.domain_description == 'kb':
        # NOTE: the 'kb' branch currently reuses the annotated reader;
        # swap in a KB-specific reader here if one is available.
        self.domain_descriptions = self._read_domain_descriptions_annotated(
            self._path)

    self._build_vocab()
    print("Done loading corpus")
def __init__(self, config):
    self.config = config
    # config.data_dir may hold a plain path or a nested list of paths
    self._path = config.data_dir[0]
    if isinstance(self._path, list):
        self._path = self._path[0]
    self.max_utt_len = config.max_utt_len
    self.tokenize = get_tokenize()
    self.black_domains = config.black_domains
    self.black_ratio = config.black_ratio

    # 80/10/10 train/valid/test split with a fixed seed for reproducibility
    self.corpus = self._read_file(self._path)
    self.train_corpus, devtest = train_test_split(
        self.corpus, test_size=0.2, random_state=271)
    self.valid_corpus, self.test_corpus = train_test_split(
        devtest, test_size=0.5, random_state=271)

    # TODO: update slot/value map relevant for Maluuba
    with open(self.config.entities_file, 'rb') as f:
        self.ent_metas = json.load(f)
    self.domain_descriptions = self._read_domain_descriptions(
        os.path.dirname(self._path))

    if hasattr(self.config, 'vocab') and self.config.vocab:
        self.vocab, self.rev_vocab, self.unk_id = load_vocab(
            self.config.vocab)
    else:
        self._build_vocab(self.config.max_vocab_cnt)
    print("Done loading corpus")
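# A small sketch of the split logic above, assuming scikit-learn is
# available; the dummy corpus is illustrative only. Splitting 20% off first
# and then halving that remainder yields 10% valid and 10% test, and fixing
# random_state makes the partition reproducible across runs.
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    corpus = list(range(100))
    train, devtest = train_test_split(corpus, test_size=0.2, random_state=271)
    valid, test = train_test_split(devtest, test_size=0.5, random_state=271)
    print(len(train), len(valid), len(test))  # 80 10 10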
def get_report(self, include_error=False):
    reports = []
    tokenize = get_tokenize()
    domain = 'movie'
    self.logger.info("Generate report for {} samples".format(
        len(self.domain_hyps[domain])))

    refs, hyps = [], []
    tp, fp, fn = 0, 0, 0
    for label, hyp in zip(self.domain_labels[domain],
                          self.domain_hyps[domain]):
        # strip speaker markers, then re-wrap with sentence boundaries
        ref_tokens = [BOS] + tokenize(
            label.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        hyp_tokens = [BOS] + tokenize(
            hyp.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)

        # accumulate entity-level true/false positives and false negatives
        ref_entities = self._parse_entities(ref_tokens)
        hyp_entities = self._parse_entities(hyp_tokens)
        tpp, fpp, fnn = self._get_tp_fp_fn(ref_entities, hyp_entities)
        tp += tpp
        fp += fpp
        fn += fnn

    # compute corpus-level BLEU and entity precision/recall/F1
    bleu = BLEUScorer().score(hyps, refs)
    prec, rec, f1 = self._get_prec_recall(tp, fp, fn)
    report = ("\nDomain: {} BLEU score {:.4f}\n"
              "Entity precision {:.4f} recall {:.4f} and f1 {:.4f}\n").format(
                  domain, bleu, prec, rec, f1)
    reports.append(report)

    return "\n==== REPORT===={report}".format(
        report="========".join(reports))
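# A hedged sketch of what the _get_prec_recall helper used above presumably
# computes from the accumulated entity counts; this reimplementation is an
# assumption, not the repository's definitive code. It is the standard
# precision/recall/F1 with zero-division guards.
def _get_prec_recall_sketch(tp, fp, fn):
    precision = float(tp) / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) > 0 else 0.0)
    return precision, recall, f1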
def __init__(self, config):
    self.config = config
    self._path = config.data_dir[0]
    self.max_utt_len = config.max_utt_len
    self.tokenize = get_tokenize()

    # 80/10/10 train/valid/test split with a fixed seed
    self.corpus = self._read_file(self._path)
    self.train_corpus, devtest = train_test_split(
        self.corpus, test_size=0.2, random_state=271)
    self.valid_corpus, self.test_corpus = train_test_split(
        devtest, test_size=0.5, random_state=271)

    if hasattr(self.config, 'vocab') and self.config.vocab:
        self.vocab, self.rev_vocab, self.unk_id = load_vocab(
            self.config.vocab)
    else:
        self._build_vocab(self.config.max_vocab_cnt)
    print("Done loading corpus")
def get_report(self):
    tokenize = get_tokenize()
    print('Generate report for {} samples'.format(len(self.hyps)))

    # unlike the domain-level reports, labels and hypotheses here are
    # tokenized as-is, without stripping boundary or speaker markers
    refs, hyps = [], []
    for label, hyp in zip(self.labels, self.hyps):
        ref_tokens = tokenize(label)
        hyp_tokens = tokenize(hyp)
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)

    bleu = corpus_bleu(refs, hyps,
                       smoothing_function=SmoothingFunction().method1)
    report = '\n===== BLEU = %f =====\n' % (bleu,)
    return '\n===== REPORT FOR DATASET {} ====={}'.format(
        self.data_name, report)
def __init__(self, name, data, config, warmup_data=None):
    super(ZslDSTCDataLoader, self).__init__(name)
    self.max_utt_size = config.max_utt_len
    self.tokenize = get_tokenize()
    self.data = self.flatten_dialog(data, config.backward_size)
    self.data_size = len(self.data)

    # Sorting by context length (longest first) is currently disabled;
    # batches are drawn in the original order.
    data_lens = [len(line.context) for line in self.data]
    sort_by_len = False
    if sort_by_len:
        self.indexes = list(np.argsort(data_lens))[::-1]
    else:
        self.indexes = list(range(len(data_lens)))

    # prepare indexes for warm-up
    self.warmup_data = warmup_data
    if self.warmup_data is not None:
        self.warmup_size = len(self.warmup_data)
        self.warmup_indexes = list(range(self.warmup_size))
    self.warmup_flags = None
    self.warmup_num_batch = None
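# A toy illustration of the (currently disabled) length-sorted indexing above,
# assuming numpy; the dummy lengths are illustrative only. Ordering indexes by
# descending context length groups similarly-sized examples together, which
# reduces padding inside a batch.
import numpy as np

if __name__ == '__main__':
    data_lens = [3, 7, 2, 9, 5]
    order = np.argsort(data_lens)[::-1]        # longest context first
    indexes = [int(i) for i in order]
    print(indexes)  # [3, 1, 4, 0, 2]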
def get_report(self, include_error=False):
    reports = []
    tokenize = get_tokenize()

    refs, hyps = [], []
    for label, hyp in zip(self.labels, self.hyps):
        # strip sentence-boundary markers, then skip the two leading
        # tokens (the tokenized speaker prefix)
        label = label.replace(EOS, '').replace(BOS, '')
        hyp = hyp.replace(EOS, '').replace(BOS, '')
        ref_tokens = tokenize(label)[2:]
        hyp_tokens = tokenize(hyp)[2:]
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)

    # compute corpus-level BLEU
    bleu = bleu_score.corpus_bleu(
        refs, hyps, smoothing_function=SmoothingFunction().method1)
    report = "\nBLEU %f\n" % bleu
    reports.append(report)

    return "\n==== REPORT===={report}".format(
        report="========".join(reports))