Example #1
 def lexicalize_global(self, data: ConvData) -> Dialog:
     # Lexicalize each turn, then strip begin/end-of-conversation markers
     # from the turn list when those markers are enabled.
     turns = list(map(self.lexicalize_turn_global, data))
     if self.boc:
         turns = utils.lstrip(turns, self.is_boc)
     if self.eoc:
         turns = utils.rstrip(turns, self.is_eoc)
     return Dialog(turns)
Example #2
 def lexicalize_sent(self, tokens: torch.Tensor) -> str:
     if not self.is_initialized:
         raise RuntimeError("vocabulary unset")
     # Map token ids back to words, then drop <bos>/<eos> markers.
     tokens = [self.vocabs.word.i2f.get(x.item(), "<unk>") for x in tokens]
     if self.sent_processor.bos:
         tokens = utils.lstrip(tokens, "<bos>")
     if self.sent_processor.eos:
         tokens = utils.rstrip(tokens, "<eos>")
     return " ".join(tokens)
Example #3
 def compute_unigram_prob(self):
     # Count word occurrences over the whole dataset and per speaker,
     # starting from ones for add-one (Laplace) smoothing.
     prob = torch.ones(len(self.vocabs.word)).long()
     spkr_prob = {spkr: torch.ones(len(self.vocabs.word)).long()
                  for spkr in self.speakers}
     for dialog in self.dataset.data:
         for turn in dialog.turns:
             if turn.speaker == "<unk>":
                 continue
             tokens = \
                 self.dataset.processor.sent_processor.process(turn.text)
             tokens = utils.lstrip(tokens, "<bos>")
             tokens = utils.rstrip(tokens, "<eos>")
             for word in tokens:
                 word_idx = self.vocabs.word[word]
                 spkr_prob[turn.speaker][word_idx] += 1
                 prob[word_idx] += 1
     # Normalize the counts into probability distributions.
     prob = prob.float() / prob.sum()
     spkr_prob = {spkr: p.float() / p.sum() for spkr, p in spkr_prob.items()}
     return prob, spkr_prob
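Because the counts start at one, the returned distribution is add-one smoothed: each entry works out to (count(word) + 1) / (num_tokens + vocab_size), so unseen words still receive nonzero probability. A quick sanity check, assuming a hypothetical instance named metric with its dataset and vocabularies already set up:

 # Hypothetical usage of compute_unigram_prob; `metric` is an assumed name,
 # not part of the original code.
 prob, spkr_prob = metric.compute_unigram_prob()
 assert torch.isclose(prob.sum(), torch.tensor(1.0))  # sums to one
 assert (prob > 0).all()  # add-one smoothing leaves no zero entries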
Example #4
 def prepare_ngrams(self):
     # Collect all bigrams, trigrams, and full sentences in the dataset,
     # both globally and per speaker.
     for dialog in self.dataset.data:
         for turn in dialog.turns:
             spkr = turn.speaker
             if spkr == "<unk>":
                 continue
             if spkr not in self._spkr_bigrams:
                 self._spkr_bigrams[spkr] = set()
                 self._spkr_trigrams[spkr] = set()
                 self._spkr_sents[spkr] = set()
             tokens = \
                 self.dataset.processor.sent_processor.process(turn.text)
             tokens = utils.lstrip(tokens, "<bos>")
             tokens = utils.rstrip(tokens, "<eos>")
             for bigram in nltk.bigrams(tokens):
                 self._bigrams.add(tuple(bigram))
                 self._spkr_bigrams[spkr].add(tuple(bigram))
             for trigram in nltk.ngrams(tokens, 3):
                 self._trigrams.add(tuple(trigram))
                 self._spkr_trigrams[spkr].add(tuple(trigram))
             sent = " ".join(tokens)
             self._sents.add(sent)
             self._spkr_sents[spkr].add(sent)
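The utils.lstrip / utils.rstrip helpers that Examples #1-#4 rely on are not shown on this page. A minimal sketch consistent with the calls above, which pass either a predicate (Example #1) or a sentinel value such as "<bos>" (Examples #2-#4), might look like the following; the names and bodies below are assumptions, not the library's actual implementation:

 def _as_predicate(target):
     # Treat a callable as a predicate; otherwise compare items for equality.
     return target if callable(target) else (lambda item: item == target)

 def lstrip(items, target):
     # Drop matching items from the front of the list.
     pred = _as_predicate(target)
     start = 0
     while start < len(items) and pred(items[start]):
         start += 1
     return items[start:]

 def rstrip(items, target):
     # Drop matching items from the end of the list.
     pred = _as_predicate(target)
     end = len(items)
     while end > 0 and pred(items[end - 1]):
         end -= 1
     return items[:end]

For instance, lstrip(["<bos>", "hi", "there"], "<bos>") would return ["hi", "there"], matching how Example #2 removes leading <bos> tokens.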
Example #5
 def test_basic_rstrip(self):
     # rstrip() should strip trailing whitespace from every line of the file
     # in place, leaving leading whitespace untouched.
     rstrip('somefile.txt')
     with open('somefile.txt') as f:
         assert_equal(f.read(), '   first\n\n  third')