Example #1
    def build(self, dts):
        counter = collections.defaultdict(int)
        counter_map = collections.defaultdict(int)

        for a, b in prolog(dts.pairs, name='count from pairs'):
            # a is a list of context example ids, b a single response id;
            # collect the keywords on each side
            a_lst, b_lst = [], []
            for i in a:
                a_lst += dts.examples[i]['keywords']
            b_lst = dts.examples[b]['keywords']

            # co-occurrence counts between context and response keywords
            for x in set(a_lst):
                for y in set(b_lst):
                    counter_map[(x, y)] += 1
                    #if x != y:
                    #    counter_map[(y, x)] += 1

            # marginal counts over every keyword seen in the pair
            for x in set(a_lst + b_lst):
                counter[x] += 1

        self.counter = counter
        self.counter_map = counter_map
        self.logM = np.log(len(dts.pairs))  # natural log of the pair count

        # debugging hook left in place: drops into an interactive shell
        from IPython import embed
        embed()
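Every example on this page iterates through `prolog(...)`, which wraps an iterable and accepts an optional progress label (as `name=`, or positionally as in Example #5). Its definition is not shown here; a minimal sketch, assuming it is a tqdm-style progress wrapper:

    from tqdm import tqdm

    def prolog(iterable, name=None):
        # Assumed behavior: iterate `iterable` while displaying
        # `name` as the progress-bar label.
        return tqdm(iterable, desc=name)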
Example #2
    def __init__(self, predictor, data):
        self.predictor = predictor
        self.data = data
        # precompute a prediction for every (context, utterance) pair
        self.res = [
            predictor.predict(item['context'], item['utterance'])
            for item in prolog(data)
        ]
Example #3
    def build_graph(self, width):
        corpus_size = len(self.corpus[0])

        cache = cacher('infergraph.build_graph', width)
        if cache.cached:
            self.transition_mat = cache.data
            return

        row, col, data = [], [], []
        for idx, text_ids in enumerate(
                prolog(self.corpus[0], name='infer graph: build graph')):
            # retrieve the `width` nearest neighbours of this utterance,
            # then turn the retrieval scores into transition probabilities
            probs, ids = self.predictor.retrieve(
                ids2data(text_ids, self.field), self.corpus, k=width)
            probs = torch.nn.functional.softmax(probs, dim=0)

            for prob, idy in zip(probs.tolist(), ids.tolist()):
                row.append(idx)
                col.append(idy)
                data.append(prob)

        self.transition_mat = cache.cache(
            csc_matrix((data, (row, col)), shape=(corpus_size, corpus_size)))
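Examples #3, #6, #7, #8, #9 and #12 share the same `cacher` pattern: build it from a key plus the arguments that parameterize the result, short-circuit through `cache.data` when `cache.cached` is true, and otherwise route the freshly computed value through `cache.cache(...)`, which must both store the value and return it. The real helper is not shown on this page; a minimal sketch, assuming a pickle-on-disk cache keyed by the arguments:

    import hashlib
    import os
    import pickle

    class cacher:
        def __init__(self, key, *args):
            # Assumed layout: one pickle file per (key, *args) combination.
            name = hashlib.md5(repr((key, args)).encode()).hexdigest()
            self.path = os.path.join('.cache', '{}.pkl'.format(name))
            self.cached = os.path.exists(self.path)
            if self.cached:
                with open(self.path, 'rb') as f:
                    self.data = pickle.load(f)

        def cache(self, value):
            # Store and return, so callers can write `return cache.cache(x)`.
            os.makedirs('.cache', exist_ok=True)
            with open(self.path, 'wb') as f:
                pickle.dump(value, f)
            return value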
Example #4
    def test(self, dataset, config):
        loader = DataLoader(dataset, **config.test_loader.to_dict())

        cnt = 0
        for data in prolog(loader):
            data = to_device(data)
            logits = self.get_logits(data)
            cnt += evaluation.torch_acc(logits, data['label'])

        # mean of the per-batch accuracies
        print('acc={}'.format(cnt / len(loader)))
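`evaluation.torch_acc` is not defined in these snippets; from its use here and in Example #11 it appears to return the accuracy of a batch of logits against integer labels. A minimal sketch under that assumption:

    import torch

    def torch_acc(logits, labels):
        # Assumed semantics: fraction of rows whose argmax matches the label.
        preds = logits.argmax(dim=-1)
        return (preds == labels).float().mean().item()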
Example #5
    def build_corpus(self, dataset, config):
        loader = DataLoader(dataset, **config.loader.to_dict())
        corpus = []
        with torch.no_grad():
            for data in prolog(loader, 'build corpus'):
                data = to_device(data)
                codes = self.get_code(data['uttr'], cands=True)

                # keep one (token ids, encoding) pair per utterance
                for code, text_ids in zip(codes, data['uttr']['text_ids']):
                    corpus.append((text_ids, code))

        text_ids, codes = zip(*corpus)
        codes = torch.stack(codes, dim=0)
        return text_ids, codes
Example #6
    def get_vocab(self, vocab_size=10000):
        cache = cacher('dts_ConvAI2.get_vocab', vocab_size)
        if cache.cached:
            return cache.data

        counter = collections.Counter()
        dialogs = self.get_dialogs()

        for dialog in prolog(dialogs):
            for uttr in dialog:
                counter.update(tokenize(uttr))

        # most frequent tokens first; ties broken alphabetically
        tokens = [token for token, _ in sorted(
            counter.items(), key=lambda x: (-x[1], x[0]))]
        return cache.cache(tokens[:vocab_size])
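The sort key `(-x[1], x[0])` orders tokens by count, highest first, breaking ties alphabetically:

    >>> import collections
    >>> counter = collections.Counter('abbccc')
    >>> sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    [('c', 3), ('b', 2), ('a', 1)]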
Example #7
    def build_kws_for_corpus(self, *args):
        kwext = KeywordExtractor(self.predictor.field)

        cache = cacher('infergraph.build_kws_for_corpus', *args)
        if cache.cached:
            self.keywords = cache.data
            return

        # extract a keyword list for every utterance in the corpus
        keywords = []
        for text_ids in prolog(self.corpus[0], name='infer graph: build kws'):
            keywords.append(
                kwext.extract(tokens2str(text_ids, self.field))['keywords'])

        self.keywords = cache.cache(keywords)
Example #8
    def raw_utterance_with_keyword(self, vocab, train=True):
        cache = cacher('dts_ConvAI2.raw_utterance_with_keyword', vocab, train)
        if cache.cached:
            return cache.data

        examples, corpus, check_dict = self.raw_utterance(train)
        field = Field(vocab)

        kwext = KeywordExtractor(field)
        for example in prolog(examples, name=' -extract keywords'):
            kws = kwext.extract(example['uttr'].lst)
            example['kwpos'] = kws['kwpos']
            example['keywords'] = kws['keywords']

        examples = preprocess(examples, field, log=' -process to_pack cls')
        return cache.cache((examples, field, corpus, check_dict))
Example #9
    def raw_utterance(self, train=True):
        cache = cacher('dts_ConvAI2.raw_utterance', train)
        if cache.cached:
            return cache.data

        corpus = self.get_data(train=train, cands=False)

        # deduplicate utterances; check_dict maps each utterance to its index
        examples = []
        check_dict = {}
        for sess in prolog(corpus, name=' -extract utterances from dialog'):
            for uttr in sess['dialog']:
                if uttr not in check_dict:
                    examples.append({'uttr': to_pack(uttr)})
                    check_dict[uttr] = len(examples) - 1

        return cache.cache((examples, corpus, check_dict))
Example #10
    def utterance_with_cands_from_negative_sampling(self, 
                                                    vocab, 
                                                    train=True,
                                                    num_turns=1,
                                                    num_negs=20):
        examples, field, corpus, check_dict = \
            self.raw_utterance_with_keyword(vocab, train)

        pairs = []
        for sess in prolog(corpus, name='collect pairs'):
            for idx, uttr in enumerate(sess['dialog']):
                # odd turns are responses; pair each with up to num_turns
                # preceding utterances as context
                if idx % 2 == 1:
                    for xs in range(min(num_turns, idx)):
                        context = [check_dict[sess['dialog'][idx - x - 1]]
                                   for x in range(xs + 1)]
                        pairs.append((context, check_dict[uttr]))

        return NegativeSamplingDataset(examples, pairs, field).negs(num_negs)
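Concretely, for a dialog `[u0, u1, u2, u3]` with `num_turns=2`, the loop above yields the following pairs (writing utterances in place of their `check_dict` indices):

    # idx=1: xs=0 -> ([u0], u1)
    # idx=3: xs=0 -> ([u2], u3)
    #        xs=1 -> ([u2, u1], u3)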
Example #11
    def test(self, dataset, config):
        if dataset is None:
            return -1

        loader = DataLoader(dataset, **config.data.test_set.loader.to_dict())
        loss_cnt, acc_cnt = [], []
        with torch.no_grad():
            for data in prolog(loader):
                data = to_device(data)
                logits = self.get_logits(data)

                loss = self.default_loss(logits, data['label'],
                                         data['num_candidates'])
                acc = evaluation.torch_acc(logits, data['label'])

                loss_cnt.append(loss.item())
                acc_cnt.append(acc)

        print('test loss={:.5f}, acc={:.5f}'.format(
            np.mean(loss_cnt), np.mean(acc_cnt)))
        return np.mean(acc_cnt)
Example #12
    def utterance_with_cands(self, vocab, train=True, num_turns=1):
        cache = cacher('dts_ConvAI2.utterance_with_cands', 
            vocab, train, num_turns)
        if cache.cached:
            return cache.data
        
        uttrs, field, corpus, check_dict = \
            self.raw_utterance_with_keyword(vocab, train)

        corpus = self.get_data(train=train, cands=True)

        examples = []
        for sess in prolog(corpus, name='mapping utterances...'):
            for idx, candidates in enumerate(sess['candidates']):
                if candidates is None:
                    continue

                # context: the last `xs` utterances of the dialog;
                # the final candidate is the gold response
                xs = min(num_turns, idx + 1)
                context = [uttrs[check_dict[sess['dialog'][idx - x]]]
                           for x in range(xs)]
                target = uttrs[check_dict[candidates[-1]]]

                examples.append({
                    'context': to_pack([u['uttr'].lst for u in context]),
                    'kwpos': [u['kwpos'] for u in context],
                    # flatten the per-turn keyword lists into one list
                    'keywords': list(itertools.chain.from_iterable(
                        u['keywords'] for u in context)),
                    'keywords_target': target['keywords'],
                    'candidates': to_pack(
                        preprocess(to_pack(candidates[:-1]), field).lst
                        + [target['uttr'].lst]),
                    'label': len(candidates) - 1,
                    'num_candidates': len(candidates)
                })

        return cache.cache(Dataset(examples, field))
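Note `itertools.chain.from_iterable` for the context keywords: `chain` over a single list of lists yields the sublists unchanged, while `from_iterable` flattens them into one keyword list:

    >>> import itertools
    >>> list(itertools.chain([[1, 2], [3]]))
    [[1, 2], [3]]
    >>> list(itertools.chain.from_iterable([[1, 2], [3]]))
    [1, 2, 3]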