def evaluate_input(searcher, word2idx, idx2word, device):
    tokenizer = DialogSpacyTokenizer(lower=True, specials=HRED_SPECIAL_TOKENS)
    to_token_ids = ToTokenIds(word2idx, specials=HRED_SPECIAL_TOKENS)
    to_tensor = ToTensor()
    transforms = [tokenizer, to_token_ids, to_tensor]
    previous = None
    while True:
        try:
            # Get input sentence
            input_sentence1 = input('> ')
            if input_sentence1 == 'q' or input_sentence1 == 'quit': break

            # Normalize sentence
            input_sentence1 = normalizeString(input_sentence1)

            # Evaluate sentence
            for t in transforms:
                input_sentence1 = t(input_sentence1)

            output_words = evaluate(searcher, idx2word, previous,
                                    input_sentence1, device)
            previous = input_sentence1
            # Format and print response sentence, dropping EOS / PAD tokens
            output_words = [
                x for x in output_words if x not in ('EOS', 'PAD')
            ]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")
Example #2
    def __init__(
        self,
        corpus: Union[List[str], List[List[str]]],
        word2idx: Dict[str, int] = None,
        special_tokens: Optional[SPECIAL_TOKENS] = SPECIAL_TOKENS,  # type: ignore
        max_length: int = -1,
        **kwargs,
    ):
        """Wrap a corpus that's already tokenized

        Args:
            corpus (Union[List[str], List[List[str]]]): List of tokens or list of lists of tokens
            word2idx (Dict[str, int], optional): Token to index mapping. Defaults to None.
            special_tokens (Optional[SPECIAL_TOKENS], optional): Special tokens. Defaults to SPECIAL_TOKENS.
            max_length (int, optional): Crop sequences above this length. Defaults to -1, where sequences are left unaltered.
        """
        self.corpus_ = corpus
        self.tokenized_corpus_ = corpus
        self.max_length = max_length

        self.vocab_ = create_vocab(
            self.tokenized_corpus_,
            vocab_size=-1,
            special_tokens=special_tokens,
        )

        if word2idx is not None:
            logger.info("Converting tokens to ids using word2idx.")
            self.word2idx_ = word2idx
        else:
            logger.info(
                "No word2idx provided. Will convert tokens to ids using an iterative counter."
            )
            self.word2idx_ = dict(zip(self.vocab_.keys(), itertools.count()))

        self.idx2word_ = {v: k for k, v in self.word2idx_.items()}

        self.to_token_ids = ToTokenIds(
            self.word2idx_,
            specials=SPECIAL_TOKENS,  # type: ignore
        )

        if isinstance(self.tokenized_corpus_[0], list):
            self.corpus_indices_ = [
                self.to_token_ids(s)
                for s in tqdm(
                    self.tokenized_corpus_,
                    desc="Converting tokens to token ids...",
                    leave=False,
                )
            ]
        else:
            self.corpus_indices_ = self.to_token_ids(self.tokenized_corpus_)  # type: ignore
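
# Standalone sketch (toy data, no slp imports) of the id-assignment path above:
# when no word2idx is passed, tokens get consecutive ids in vocabulary order via
# dict(zip(vocab.keys(), itertools.count())), and each tokenized sentence is
# then mapped to a list of ids. ToTokenIds additionally handles special tokens,
# which this sketch skips.
import itertools
from collections import Counter

tokenized_corpus = [["hello", "world"], ["hello", "there"]]
vocab = Counter(tok for sentence in tokenized_corpus for tok in sentence)

word2idx = dict(zip(vocab.keys(), itertools.count()))
idx2word = {v: k for k, v in word2idx.items()}
corpus_indices = [[word2idx[tok] for tok in s] for s in tokenized_corpus]
print(corpus_indices)  # [[0, 1], [0, 2]]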
Example #3
            new_emb_file, emb_dim, extra_tokens=HRED_SPECIAL_TOKENS).load()
    else:
        word2idx, idx2word = word2idx_from_dataset(
            vocab_dict, most_freq=10000, extra_tokens=HRED_SPECIAL_TOKENS)
        embeddings = None
        emb_dim = options.emb_dim

    vocab_size = len(word2idx)
    print("Vocabulary size: {}".format(vocab_size))

    # --- set dataset transforms ---
    tokenizer = DialogSpacyTokenizer(lower=True,
                                     prepend_sos=True,
                                     append_eos=True,
                                     specials=HRED_SPECIAL_TOKENS)
    to_token_ids = ToTokenIds(word2idx, specials=HRED_SPECIAL_TOKENS)
    to_tensor = ToTensor()
    dataset = dataset.map(tokenizer).map(to_token_ids).map(to_tensor)
    print("Dataset size: {}".format(len(dataset)))
    # --- make train and val loaders ---

    collator_fn = HRED_Collator(device='cpu')
    train_loader, val_loader = train_test_split(dataset,
                                                batch_train=BATCH_TRAIN_SIZE,
                                                batch_val=BATCH_VAL_SIZE,
                                                collator_fn=collator_fn,
                                                test_size=0.2)

    pad_index = word2idx[HRED_SPECIAL_TOKENS.PAD.value]
Example #4
    train, dev, test = wikitext_2_dataset(
        directory='data/',
        train=True,
        dev=True,
        test=True,
        extracted_name='wikitext-2',
        url=
        'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip',  # noqa: E501
        unknown_token=SPECIAL_TOKENS.UNK.value,
        eos_token=SPECIAL_TOKENS.EOS.value)

    vocab = create_vocab(train + dev,
                         vocab_size=vocab_size,
                         extra_tokens=SPECIAL_TOKENS.to_list())
    replace_unk = ReplaceUnknownToken()
    to_token_ids = ToTokenIds(vocab)
    to_tensor = ToTensor(device='cpu')

    def create_dataloader(base):
        wrapped = (
            LMDataset(base, max_len=max_len)
            .map(replace_unk)
            .map(to_token_ids)
            .map(to_tensor)
            .apply_transforms()
        )
        return DataLoader(wrapped,
                          batch_size=128,
                          num_workers=1,
                          pin_memory=True,
                          collate_fn=collate_fn)

    train_loader = create_dataloader(train[:1000])
    dev_loader = create_dataloader(dev[:1000])
    test_loader = create_dataloader(test[:1000])
        metrics=metrics,
        non_blocking=True,
        retain_graph=True,
        patience=5,
        device=device,
        loss_fn=criterion)
    return trainer


import os
if __name__ == '__main__':
    loader = EmbeddingsLoader('../cache/glove.6B.50d.txt', 50)
    word2idx, _, embeddings = loader.load()

    tokenizer = SpacyTokenizer()
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device='cpu')

    transforms = Compose([tokenizer, to_token_ids, to_tensor])
    dataset = MovieCorpusDataset('../data/', transforms=transforms, train=True)
    #dataset = dataset.map(tokenizer).map(to_token_ids).map(to_tensor)

    if KFOLD:
        cv_scores = []
        import gc
        for train_loader, val_loader in kfold_split(dataset, 32, 128):
            trainer = trainer_factory(embeddings, device=DEVICE)
            fold_score = trainer.fit(train_loader,
                                     val_loader,
                                     epochs=MAX_EPOCHS)
            cv_scores.append(fold_score)
Example #6
    def __init__(
        self,
        corpus: List[str],
        limit_vocab_size: int = 30000,
        word2idx: Optional[Dict[str, int]] = None,
        idx2word: Optional[Dict[int, str]] = None,
        embeddings: Optional[np.ndarray] = None,
        embeddings_file: Optional[str] = None,
        embeddings_dim: int = 300,
        lower: bool = True,
        special_tokens: Optional[SPECIAL_TOKENS] = SPECIAL_TOKENS,  # type: ignore
        prepend_bos: bool = False,
        append_eos: bool = False,
        lang: str = "en_core_web_md",
        max_length: int = -1,
        **kwargs,
    ):
        """Load corpus embeddings, tokenize in words using spacy and convert to ids

        This class wraps a raw (untokenized) corpus. It handles:

        * Tokenization into words (spacy)
        * Loading of pretrained word embedding
        * Calculation of word frequencies / corpus statistics
        * Conversion to token ids

        You can either:

        * Pass an embeddings file to load pretrained embeddings and create the word2idx mapping
        * Pass an already loaded embeddings array and word2idx. This is useful for the dev / test splits,
          where we want to reuse the train split embeddings / word2idx.

        Args:
            corpus (List[str]): Corpus as a list of sentences
            limit_vocab_size (int): Upper bound for number of most frequent tokens to keep. Defaults to 30000.
            word2idx (Optional[Dict[str, int]]): Mapping of word to indices. Defaults to None.
            idx2word (Optional[Dict[int, str]]): Mapping of indices to words. Defaults to None.
            embeddings (Optional[np.ndarray]): Embeddings array. Defaults to None.
            embeddings_file (Optional[str]): Embeddings file to read. Defaults to None.
            embeddings_dim (int): Dimension of embeddings. Defaults to 300.
            lower (bool): Convert strings to lower case. Defaults to True.
            special_tokens (Optional[SPECIAL_TOKENS]): Special tokens to include in the vocabulary.
                 Defaults to slp.config.nlp.SPECIAL_TOKENS.
            prepend_bos (bool): Prepend Beginning of Sequence token for seq2seq tasks. Defaults to False.
            append_eos (bool): Append End of Sequence token for seq2seq tasks. Defaults to False.
            lang (str): Spacy language, e.g. el_core_web_sm, en_core_web_sm etc. Defaults to "en_core_web_md".
            max_length (int): Crop sequences above this length. Defaults to -1 where sequences are left unaltered.
        """
        # FIXME: Extract super class to avoid repetition
        self.corpus_ = corpus
        self.max_length = max_length
        self.tokenizer = SpacyTokenizer(
            lower=lower,
            prepend_bos=prepend_bos,
            append_eos=append_eos,
            specials=special_tokens,
            lang=lang,
        )

        logger.info(f"Tokenizing corpus using spacy {lang}")

        self.tokenized_corpus_ = [
            self.tokenizer(s)
            for s in tqdm(self.corpus_, desc="Tokenizing corpus...", leave=False)
        ]

        self.vocab_ = create_vocab(
            self.tokenized_corpus_,
            vocab_size=limit_vocab_size if word2idx is None else -1,
            special_tokens=special_tokens,
        )

        self.word2idx_, self.idx2word_, self.embeddings_ = None, None, None
        # self.corpus_indices_ = self.tokenized_corpus_

        if word2idx is not None:
            logger.info("Word2idx was already provided. Going to used it.")

        if embeddings_file is not None and word2idx is None:
            logger.info(
                f"Going to load {len(self.vocab_)} embeddings from {embeddings_file}"
            )
            loader = EmbeddingsLoader(
                embeddings_file,
                embeddings_dim,
                vocab=self.vocab_,
                extra_tokens=special_tokens,
            )
            word2idx, idx2word, embeddings = loader.load()

        if embeddings is not None:
            self.embeddings_ = embeddings

        if idx2word is not None:
            self.idx2word_ = idx2word

        if word2idx is not None:
            self.word2idx_ = word2idx

            logger.info("Converting tokens to ids using word2idx.")
            self.to_token_ids = ToTokenIds(
                self.word2idx_,
                specials=SPECIAL_TOKENS,  # type: ignore
            )

            self.corpus_indices_ = [
                self.to_token_ids(s)
                for s in tqdm(
                    self.tokenized_corpus_,
                    desc="Converting tokens to token ids...",
                    leave=False,
                )
            ]

            logger.info("Filtering corpus vocabulary.")

            updated_vocab = {}

            for k, v in self.vocab_.items():
                if k in self.word2idx_:
                    updated_vocab[k] = v

            logger.info(
                f"Out of {len(self.vocab_)} tokens, "
                f"{len(self.vocab_) - len(updated_vocab)} were not found in the pretrained embeddings."
            )

            self.vocab_ = updated_vocab
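
# Standalone sketch (toy data, no slp imports) of the final filtering step above:
# when pretrained embeddings are loaded from a file, the corpus vocabulary is
# reduced to the tokens that actually received an embedding, i.e. the ones
# present in word2idx. Values below are illustrative only.
vocab = {"the": 120, "cat": 7, "zzyzx": 1}                # token -> corpus frequency
word2idx = {"[PAD]": 0, "[UNK]": 1, "the": 2, "cat": 3}   # from pretrained embeddings

updated_vocab = {k: v for k, v in vocab.items() if k in word2idx}
print(
    f"Out of {len(vocab)} tokens, "
    f"{len(vocab) - len(updated_vocab)} were not found in the pretrained embeddings."
)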