def lexicalize_global(self, data: ConvData) -> Dialog:
    # Convert every turn back to surface text, then trim conversation
    # boundary markers if they were added during preprocessing.
    turns = list(map(self.lexicalize_turn_global, data))
    if self.boc:
        turns = utils.lstrip(turns, self.is_boc)
    if self.eoc:
        turns = utils.rstrip(turns, self.is_eoc)
    return Dialog(turns)
def lexicalize_sent(self, tokens: torch.Tensor) -> str:
    if not self.is_initialized:
        raise RuntimeError("vocabulary unset")
    # Map token indices back to surface forms, falling back to "<unk>"
    # for indices outside the vocabulary.
    tokens = [self.vocabs.word.i2f.get(x.item(), "<unk>") for x in tokens]
    if self.sent_processor.bos:
        tokens = utils.lstrip(tokens, "<bos>")
    if self.sent_processor.eos:
        tokens = utils.rstrip(tokens, "<eos>")
    return " ".join(tokens)
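# NOTE: `utils.lstrip` / `utils.rstrip` are not defined in this excerpt.
# Judging from the call sites above, they strip matching elements from the
# ends of a sequence and accept either a literal value ("<bos>") or a
# predicate (self.is_boc). A minimal sketch consistent with that usage
# (names and signatures are assumptions, not the actual utils module):
from typing import Callable, Sequence, TypeVar, Union

T = TypeVar("T")


def lstrip(seq: Sequence[T], target: Union[T, Callable[[T], bool]]) -> list:
    # Drop leading elements equal to `target`, or satisfying it if callable.
    match = target if callable(target) else (lambda x: x == target)
    i = 0
    while i < len(seq) and match(seq[i]):
        i += 1
    return list(seq[i:])


def rstrip(seq: Sequence[T], target: Union[T, Callable[[T], bool]]) -> list:
    # Drop trailing elements equal to `target`, or satisfying it if callable.
    match = target if callable(target) else (lambda x: x == target)
    j = len(seq)
    while j > 0 and match(seq[j - 1]):
        j -= 1
    return list(seq[:j])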
def compute_unigram_prob(self):
    # Accumulate token counts with add-one smoothing, both globally and
    # per speaker, then normalize into probability distributions.
    prob = torch.ones(len(self.vocabs.word)).long()  # +1 smoothing
    spkr_prob = {spkr: torch.ones(len(self.vocabs.word)).long()
                 for spkr in self.speakers}
    for dialog in self.dataset.data:
        for turn in dialog.turns:
            if turn.speaker == "<unk>":
                continue
            tokens = self.dataset.processor.sent_processor.process(turn.text)
            tokens = utils.lstrip(tokens, "<bos>")
            tokens = utils.rstrip(tokens, "<eos>")
            for word in tokens:
                word_idx = self.vocabs.word[word]
                spkr_prob[turn.speaker][word_idx] += 1
                prob[word_idx] += 1
    prob = prob.float() / prob.sum()
    spkr_prob = {spkr: p.float() / p.sum() for spkr, p in spkr_prob.items()}
    return prob, spkr_prob
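# Hypothetical usage of compute_unigram_prob: draw tokens from the smoothed
# unigram distribution as a trivial baseline generator. The `metric`
# instance name below is an assumption for illustration only.
prob, spkr_prob = metric.compute_unigram_prob()
sample = torch.multinomial(prob, num_samples=10, replacement=True)
words = [metric.vocabs.word.i2f[idx.item()] for idx in sample]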
def prepare_ngrams(self):
    # Collect the sets of bigrams, trigrams, and full sentences seen in
    # the training data, both globally and per speaker.
    for dialog in self.dataset.data:
        for turn in dialog.turns:
            spkr = turn.speaker
            if spkr == "<unk>":
                continue
            if spkr not in self._spkr_bigrams:
                self._spkr_bigrams[spkr] = set()
                self._spkr_trigrams[spkr] = set()
                self._spkr_sents[spkr] = set()
            tokens = self.dataset.processor.sent_processor.process(turn.text)
            tokens = utils.lstrip(tokens, "<bos>")
            tokens = utils.rstrip(tokens, "<eos>")
            for bigram in nltk.bigrams(tokens):
                self._bigrams.add(bigram)
                self._spkr_bigrams[spkr].add(bigram)
            for trigram in nltk.ngrams(tokens, 3):
                self._trigrams.add(trigram)
                self._spkr_trigrams[spkr].add(trigram)
            sent = " ".join(tokens)
            self._sents.add(sent)
            self._spkr_sents[spkr].add(sent)
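# A hypothetical consumer of the sets built by prepare_ngrams: the fraction
# of a candidate sentence's trigrams never seen in training, a common
# novelty-style diversity measure. The method name is an assumption, not
# part of the original code.
def novel_trigram_ratio(self, tokens: list) -> float:
    trigrams = set(nltk.ngrams(tokens, 3))
    if not trigrams:
        return 0.0
    return len(trigrams - self._trigrams) / len(trigrams)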
def test_basic_rstrip(self):
    rstrip('somefile.txt')
    with open('somefile.txt') as f:
        assert_equal(f.read(), ' first\n\n third')
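# The test above implies a file-level rstrip that rewrites a file in place,
# trimming trailing whitespace from every line and trailing blank lines from
# the end of the file, while preserving leading whitespace. The function
# itself is not part of this excerpt; a hypothetical implementation that
# would satisfy the assertion:
def rstrip(path: str) -> None:
    with open(path) as f:
        lines = f.read().splitlines()
    body = "\n".join(line.rstrip() for line in lines)
    with open(path, "w") as f:
        f.write(body.rstrip("\n"))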