def evaluate_input(searcher, word2idx, idx2word, device):
    tokenizer = DialogSpacyTokenizer(lower=True, specials=HRED_SPECIAL_TOKENS)
    to_token_ids = ToTokenIds(word2idx, specials=HRED_SPECIAL_TOKENS)
    to_tensor = ToTensor()
    transforms = [tokenizer, to_token_ids, to_tensor]
    previous = None
    while True:
        try:
            # Get input sentence
            input_sentence1 = input('> ')
            if input_sentence1 in ('q', 'quit'):
                break

            # Normalize sentence
            input_sentence1 = normalizeString(input_sentence1)

            # Evaluate sentence
            for t in transforms:
                input_sentence1 = t(input_sentence1)

            output_words = evaluate(searcher, idx2word, previous,
                                    input_sentence1, device)
            previous = input_sentence1
            # Drop EOS/PAD sentinels before printing the reply
            output_words[:] = [
                x for x in output_words if x not in ('EOS', 'PAD')
            ]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")
Example 2
File: dm.py, Project: georgepar/slp
    def setup(self, stage=None):
        if self.setup_has_run:
            return

        super(PLDataModuleFromCorpus, self).setup(stage=stage)

        train_corpus, train_labels = zip(*self.train)  # type: ignore
        val_corpus, val_labels = zip(*self.val)  # type: ignore

        if not self.no_test_set:
            test_corpus, test_labels = zip(*self.test)  # type: ignore
        else:
            # Keep the names bound when no test set is provided
            test_corpus, test_labels = None, None

        self.train_corpus, self.val_corpus, self.test_corpus = self._create_corpora(
            train_corpus, val_corpus, test_corpus, self.corpus_args)

        to_tensor = ToTensor(device="cpu")

        if self.language_model:
            self.train = CorpusLMDataset(self.train_corpus).map(to_tensor)
            self.val = CorpusLMDataset(self.val_corpus).map(to_tensor)

            if not self.no_test_set:
                self.test = CorpusLMDataset(self.test_corpus).map(to_tensor)
        else:
            self.train = CorpusDataset(self.train_corpus,
                                       train_labels).map(to_tensor)
            self.val = CorpusDataset(self.val_corpus,
                                     val_labels).map(to_tensor)

            if not self.no_test_set:
                self.test = CorpusDataset(self.test_corpus,
                                          test_labels).map(to_tensor)
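The zip(*...) calls in setup unzip (text, label) pairs into two parallel tuples; a standalone illustration:

pairs = [('good movie', 1), ('bad movie', 0)]
corpus, labels = zip(*pairs)
print(corpus)  # ('good movie', 'bad movie')
print(labels)  # (1, 0)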
Example 3
    def __init__(
        self,
        data: List[Dict[str, Any]],
        modalities: Union[List[str], Set[str]] = {"text", "audio", "visual"},
        text_is_tokens: bool = False,
        label_selector: Optional[Callable] = None,
    ):
        super(MOSEI, self).__init__(data, modalities)

        def default_label_selector(l):
            return l[0][0]

        if label_selector is None:
            label_selector = default_label_selector

        self.map(label_selector, "label", lazy=True)

        for m in self.modalities:
            if m == "text" and text_is_tokens:
                self.map(ToTensor(dtype=torch.long), m, lazy=True)
            else:
                self.map(ToTensor(dtype=torch.float), m, lazy=True)
Example 4
    def __init__(
        self,
        data: List[Dict[str, Any]],
        modalities: Union[List[str], Set[str]] = {"text", "audio", "visual"},
        text_is_tokens: bool = False,
        binary: bool = False,
    ):
        super(MOSI, self).__init__(data, modalities)

        def label_selector(l):
            return l.item()

        self.map(label_selector, "label", lazy=True)

        if binary:
            self.map(binarize, "label", lazy=True)

        for m in self.modalities:
            if m == "text" and text_is_tokens:
                self.map(ToTensor(dtype=torch.long), m, lazy=True)
            else:
                self.map(ToTensor(dtype=torch.float), m, lazy=True)
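Both MOSEI and MOSI cast tokenized text to torch.long and continuous audio/visual features to torch.float. The split matters because embedding lookups require integer indices; a minimal, self-contained check:

import torch

token_ids = torch.tensor([4, 17, 3], dtype=torch.long)   # text as token ids
audio_feats = torch.tensor([[0.1, 0.5], [0.2, 0.4]])     # float features

emb = torch.nn.Embedding(num_embeddings=100, embedding_dim=8)
print(emb(token_ids).shape)   # torch.Size([3, 8])
print(audio_feats.dtype)      # torch.float32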
Example 5
    else:
        word2idx, idx2word = word2idx_from_dataset(
            vocab_dict, most_freq=10000, extra_tokens=HRED_SPECIAL_TOKENS)
        embeddings = None
        emb_dim = options.emb_dim

    vocab_size = len(word2idx)
    print("Vocabulary size: {}".format(vocab_size))

    # --- set dataset transforms ---
    tokenizer = DialogSpacyTokenizer(lower=True,
                                     prepend_sos=True,
                                     append_eos=True,
                                     specials=HRED_SPECIAL_TOKENS)
    to_token_ids = ToTokenIds(word2idx, specials=HRED_SPECIAL_TOKENS)
    to_tensor = ToTensor()
    dataset = dataset.map(tokenizer).map(to_token_ids).map(to_tensor)
    print("Dataset size: {}".format(len(dataset)))
    # --- make train and val loaders ---

    collator_fn = HRED_Collator(device='cpu')
    train_loader, val_loader = train_test_split(dataset,
                                                batch_train=BATCH_TRAIN_SIZE,
                                                batch_val=BATCH_VAL_SIZE,
                                                collator_fn=collator_fn,
                                                test_size=0.2)

    pad_index = word2idx[HRED_SPECIAL_TOKENS.PAD.value]
    sos_index = word2idx[HRED_SPECIAL_TOKENS.SOS.value]
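pad_index is commonly passed as ignore_index to the loss so that padded positions do not contribute to training. A hedged sketch follows; the criterion is not part of the snippet above and the values are illustrative.

import torch
import torch.nn as nn

pad_index = 0
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

logits = torch.randn(5, 10)                              # (seq_len, vocab_size)
targets = torch.tensor([4, 2, 7, pad_index, pad_index])  # padded target sequence
print(criterion(logits, targets).item())                 # padded steps are masked out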
Example 6
    # The opening of this call was cut off in the snippet; the keyword arguments
    # match torchnlp's wikitext_2_dataset loader (assumed here).
    train, dev, test = wikitext_2_dataset(
        directory='data/',
        train=True,
        dev=True,
        test=True,
        extracted_name='wikitext-2',
        url='https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip',  # noqa: E501
        unknown_token=SPECIAL_TOKENS.UNK.value,
        eos_token=SPECIAL_TOKENS.EOS.value)

    vocab = create_vocab(train + dev,
                         vocab_size=vocab_size,
                         extra_tokens=SPECIAL_TOKENS.to_list())
    replace_unk = ReplaceUnknownToken()
    to_token_ids = ToTokenIds(vocab)
    to_tensor = ToTensor(device='cpu')

    def create_dataloader(base):
        wrapped = (LMDataset(base, max_len=max_len).map(replace_unk).map(
            to_token_ids).map(to_tensor).apply_transforms())
        return DataLoader(wrapped,
                          batch_size=128,
                          num_workers=1,
                          pin_memory=True,
                          collate_fn=collate_fn)

    train_loader = create_dataloader(train[:1000])
    dev_loader = create_dataloader(dev[:1000])
    test_loader = create_dataloader(test[:1000])

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    hidden_size = 300
    epochs = 40
    lexicons = False
    lex_size = 99

    loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300)
    word2idx, idx2word, embeddings = loader.load()
    embeddings = torch.tensor(embeddings)

    with open("avec.pkl", "rb") as handle:
        _file = pickle.load(handle)

    tokenizer = SpacyTokenizer()
    replace_unknowns = ReplaceUnknownToken()
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device=DEVICE)

    train = AVECDataset(_file,
                        max_word_length,
                        transforms=Compose([
                            tokenizer, replace_unknowns, to_token_ids,
                            to_tensor
                        ]),
                        split='train')
    dev = AVECDataset(_file,
                      max_word_length,
                      transforms=Compose([
                          tokenizer, replace_unknowns, to_token_ids, to_tensor
                      ]),
                      split='dev')
    test = AVECDataset(_file,
                       max_word_length,
                       transforms=Compose([
                           tokenizer, replace_unknowns, to_token_ids, to_tensor
                       ]),
                       split='test')
Example 8
import itertools
from warnings import simplefilter

import pytorch_lightning as pl
from torch.utils.data import Dataset

# The slp imports for Seq2SeqCollator, create_vocab, SPECIAL_TOKENS, ToTokenIds
# and ToTensor are not shown in this snippet.
from slp.modules.transformer import Transformer
from slp.util.pytorch import pad_mask, subsequent_mask

simplefilter(action="ignore")

pl.utilities.seed.seed_everything(42)

collate_fn = Seq2SeqCollator(device="cpu")

# All tokens are different. Should get 100% accuracy
sentence = "The big brown fox jumps over the lazy dog".split(" ")

vocab = create_vocab([sentence], vocab_size=-1, special_tokens=SPECIAL_TOKENS)
vocab = dict(zip(vocab.keys(), itertools.count()))
to_token_ids = ToTokenIds(vocab)
to_tensor = ToTensor(device="cpu")


class DummyDataset(Dataset):
    def __init__(self):
        self.data = [(sentence[0:-1], sentence[1:])]

    def __len__(self):
        return 1

    def __getitem__(self, i):
        s, t = self.data[i]

        return to_tensor(to_token_ids(s)), to_tensor(to_token_ids(t))
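A quick way to exercise the copy-task dataset above, assuming the imports not shown in the snippet are in place; the shape of the collated batch depends on Seq2SeqCollator, so only the generic DataLoader wiring is shown.

from torch.utils.data import DataLoader

loader = DataLoader(DummyDataset(), batch_size=1, collate_fn=collate_fn)
batch = next(iter(loader))  # one padded (source, target) batch, as produced by the collator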