def build(self, dts):
    counter = collections.defaultdict(int)       # marginal keyword counts
    counter_map = collections.defaultdict(int)   # joint (context, response) keyword counts
    for a, b in prolog(dts.pairs, name='count from pairs'):
        a_lst = []
        for i in a:
            a_lst += dts.examples[i]['keywords']
        b_lst = dts.examples[b]['keywords']
        for x in set(a_lst):
            for y in set(b_lst):
                counter_map[(x, y)] += 1
                # symmetric count, disabled in the original:
                # if x != y:
                #     counter_map[(y, x)] += 1
        for x in set(a_lst + b_lst):
            counter[x] += 1
    self.counter = counter
    self.counter_map = counter_map
    self.logM = np.log(len(dts.pairs))  # log of the total number of pairs
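# A minimal sketch (not part of the original class) of how the statistics
# built above would typically be combined into a PMI score; `pmi` is a
# hypothetical helper. With M context/response pairs, p(x, y) =
# counter_map[(x, y)] / M and p(x) = counter[x] / M, so
# log p(x, y) - log p(x) - log p(y) reduces to the expression below,
# given that self.logM = log(M).
def pmi(self, x, y):
    joint = self.counter_map[(x, y)]
    if joint == 0 or self.counter[x] == 0 or self.counter[y] == 0:
        return float('-inf')  # the keyword pair never co-occurred
    return (np.log(joint) - np.log(self.counter[x])
            - np.log(self.counter[y]) + self.logM)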
def __init__(self, predictor, data):
    self.predictor = predictor
    self.data = data
    self.res = [predictor.predict(item['context'], item['utterance'])
                for item in prolog(data)]
def build_graph(self, width):
    corpus_size = len(self.corpus[0])
    cache = cacher('infergraph.build_graph', width)
    if cache.cached:
        self.transition_mat = cache.data
        return
    row, col, data = [], [], []
    for idx, text_ids in enumerate(
            prolog(self.corpus[0], name='infer graph: build graph')):
        # retrieve the top-`width` neighbours of each utterance and
        # normalize their scores into transition probabilities
        probs, ids = self.predictor.retrieve(
            ids2data(text_ids, self.field), self.corpus, k=width)
        probs = torch.nn.functional.softmax(probs, dim=0)
        for prob, idy in zip(probs.tolist(), ids.tolist()):
            row.append(idx)
            col.append(idy)
            data.append(prob)
    self.transition_mat = cache.cache(
        csc_matrix((data, (row, col)), shape=(corpus_size, corpus_size)))
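# A hypothetical illustration, not in the original class, of how the
# matrix could be used: each row of `transition_mat` holds the softmaxed
# retrieval scores of one utterance's top-`width` neighbours, so it is
# row-stochastic and supports a random walk over the corpus graph.
def walk(self, start_idx, steps=2):
    corpus_size = self.transition_mat.shape[0]
    v = np.zeros(corpus_size)
    v[start_idx] = 1.0  # one-hot distribution on the start utterance
    for _ in range(steps):
        # v_{t+1} = v_t P, written as P^T v for sparse-dense matmul
        v = self.transition_mat.T @ v
    return v  # v[j]: probability of reaching utterance j in `steps` hops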
def test(self, dataset, config):
    loader = DataLoader(dataset, **config.test_loader.to_dict())
    cnt = 0
    with torch.no_grad():
        for batch_id, data in enumerate(prolog(loader)):
            data = to_device(data)
            logits = self.get_logits(data)
            # accumulate per-batch accuracy; averaged over batches below
            cnt += evaluation.torch_acc(logits, data['label'])
    print('acc={}'.format(cnt / len(loader)))
def build_corpus(self, dataset, config):
    loader = DataLoader(dataset, **config.loader.to_dict())
    corpus = []
    with torch.no_grad():
        for batch_id, data in enumerate(prolog(loader, 'build corpus')):
            data = to_device(data)
            codes = self.get_code(data['uttr'], cands=True)
            for code, text_ids in zip(codes, data['uttr']['text_ids']):
                corpus.append((text_ids, code))
    # return parallel tuples: token ids per utterance and stacked codes
    text_ids, codes = zip(*corpus)
    codes = torch.stack(codes, dim=0)
    return text_ids, codes
def get_vocab(self, vocab_size=10000):
    cache = cacher('dts_ConvAI2.get_vocab', vocab_size)
    if cache.cached:
        return cache.data
    counter = collections.Counter()
    dialogs = self.get_dialogs()
    for dialog in prolog(dialogs):
        for uttr in dialog:
            counter.update(tokenize(uttr))
    # keep the `vocab_size` most frequent tokens, ties broken alphabetically
    return cache.cache([token for token, _ in sorted(
        counter.items(), key=lambda x: (-x[1], x[0]))][:vocab_size])
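# A small standalone check, hypothetical and not part of the class, of
# the vocabulary ordering above: descending frequency first, then
# alphabetical order to break ties deterministically.
def _vocab_order_demo():
    counter = collections.Counter(['b', 'a', 'a', 'c', 'b'])
    ranked = [token for token, _ in sorted(
        counter.items(), key=lambda x: (-x[1], x[0]))]
    assert ranked == ['a', 'b', 'c']  # 'a'/'b' tie at 2, 'c' has 1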
def build_kws_for_corpus(self, *args):
    kwext = KeywordExtractor(self.predictor.field)
    cache = cacher('infergraph.build_kws_for_corpus', *args)
    if cache.cached:
        self.keywords = cache.data
        return
    keywords = []
    for text_ids in prolog(self.corpus[0], name='infer graph: build kws'):
        keywords.append(
            kwext.extract(tokens2str(text_ids, self.field))['keywords'])
    self.keywords = cache.cache(keywords)
def raw_utterance_with_keyword(self, vocab, train=True):
    cache = cacher('dts_ConvAI2.raw_utterance_with_keyword', vocab, train)
    if cache.cached:
        return cache.data
    examples, corpus, check_dict = self.raw_utterance(train)
    field = Field(vocab)
    kwext = KeywordExtractor(field)
    for example in prolog(examples, name=' -extract keywords'):
        kws = kwext.extract(example['uttr'].lst)
        example['kwpos'] = kws['kwpos']
        example['keywords'] = kws['keywords']
    examples = preprocess(examples, field, log=' -process to_pack cls')
    return cache.cache((examples, field, corpus, check_dict))
def raw_utterance(self, train=True):
    cache = cacher('dts_ConvAI2.raw_utterance', train)
    if cache.cached:
        return cache.data
    corpus = self.get_data(train=train, cands=False)
    examples = []
    check_dict = {}  # maps each unique utterance to its index in `examples`
    for sess in prolog(corpus, name=' -extract utterances from dialog'):
        for uttr in sess['dialog']:
            if uttr not in check_dict:
                examples.append({'uttr': to_pack(uttr)})
                check_dict[uttr] = len(examples) - 1
    return cache.cache((examples, corpus, check_dict))
def utterance_with_cands_from_negative_sampling(self, vocab, train=True,
                                                num_turns=1, num_negs=20):
    examples, field, corpus, check_dict = \
        self.raw_utterance_with_keyword(vocab, train)
    pairs = []
    for sess in prolog(corpus, name='collect pairs'):
        for idx, uttr in enumerate(sess['dialog']):
            if idx % 2 == 1:  # odd-indexed utterances are the responses
                for xs in range(min(num_turns, idx)):
                    # context windows of 1..num_turns previous utterances,
                    # most recent first
                    pairs.append((
                        [check_dict[sess['dialog'][idx - x - 1]]
                         for x in range(xs + 1)],
                        check_dict[uttr]))
    return NegativeSamplingDataset(examples, pairs, field).negs(num_negs)
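# A hypothetical standalone trace of the pair construction above: for a
# session ['u0', 'u1', 'u2', 'u3'] with num_turns=2, each odd-indexed
# response is paired with every context window of up to num_turns
# preceding utterances (shown as tokens here; the real pairs hold
# check_dict indices).
def _pairs_demo():
    dialog = ['u0', 'u1', 'u2', 'u3']
    num_turns = 2
    pairs = []
    for idx, uttr in enumerate(dialog):
        if idx % 2 == 1:
            for xs in range(min(num_turns, idx)):
                pairs.append(
                    ([dialog[idx - x - 1] for x in range(xs + 1)], uttr))
    assert pairs == [(['u0'], 'u1'),
                     (['u2'], 'u3'),
                     (['u2', 'u1'], 'u3')]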
def test(self, dataset, config):
    if dataset is None:
        return -1
    loader = DataLoader(dataset, **config.data.test_set.loader.to_dict())
    loss_cnt, acc_cnt = [], []
    with torch.no_grad():
        for batch_id, data in enumerate(prolog(loader)):
            data = to_device(data)
            logits = self.get_logits(data)
            loss = self.default_loss(logits, data['label'],
                                     data['num_candidates'])
            acc = evaluation.torch_acc(logits, data['label'])
            loss_cnt.append(loss.item())
            acc_cnt.append(acc)
    print('test loss={:.5f}, acc={:.5f}'.format(
        np.mean(loss_cnt), np.mean(acc_cnt)))
    return np.mean(acc_cnt)
def utterance_with_cands(self, vocab, train=True, num_turns=1):
    cache = cacher('dts_ConvAI2.utterance_with_cands',
                   vocab, train, num_turns)
    if cache.cached:
        return cache.data
    uttrs, field, _, check_dict = \
        self.raw_utterance_with_keyword(vocab, train)
    corpus = self.get_data(train=train, cands=True)
    examples = []
    for sess in prolog(corpus, name='mapping utterances...'):
        for idx, candidates in enumerate(sess['candidates']):
            if candidates is not None:
                xs = min(num_turns, idx + 1)  # context window length
                examples.append({
                    'context': to_pack(
                        [uttrs[check_dict[sess['dialog'][idx - x]]]
                         ['uttr'].lst for x in range(xs)]),
                    'kwpos': [
                        uttrs[check_dict[sess['dialog'][idx - x]]]['kwpos']
                        for x in range(xs)],
                    # flatten the per-turn keyword lists; plain
                    # itertools.chain(...) over the list itself would
                    # yield the sublists rather than the keywords
                    'keywords': list(itertools.chain.from_iterable(
                        uttrs[check_dict[sess['dialog'][idx - x]]]
                        ['keywords'] for x in range(xs))),
                    'keywords_target':
                        uttrs[check_dict[candidates[-1]]]['keywords'],
                    # distractors plus the gold response (last candidate)
                    'candidates': to_pack(
                        preprocess(to_pack(candidates[:-1]), field).lst
                        + [uttrs[check_dict[candidates[-1]]]['uttr'].lst]),
                    'label': len(candidates) - 1,
                    'num_candidates': len(candidates),
                })
    return cache.cache(Dataset(examples, field))
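# A hedged end-to-end sketch of how the pieces above appear to fit
# together; `ConvAI2Dataset` is an assumed name for the class owning
# get_vocab / utterance_with_cands, and the loader settings are
# illustrative only.
if __name__ == '__main__':
    dts = ConvAI2Dataset()
    vocab = dts.get_vocab(vocab_size=10000)
    dataset = dts.utterance_with_cands(vocab, train=True, num_turns=1)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    for data in loader:
        pass  # batches would feed the train / test loops defined above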