def __init__(self,
             model: SiameseModel,
             batch_size: int,
             num_context_turns: int = 1,
             ranking: bool = True,
             attention: bool = False,
             responses: SimpleVocabulary = None,
             preproc_func: Callable = None,
             interact_pred_num: int = 3,
             *args,
             **kwargs) -> None:
    super().__init__()
    self.batch_size = batch_size
    self.num_context_turns = num_context_turns
    self.ranking = ranking
    self.attention = attention
    self.preproc_responses = []
    self.response_embeddings = None
    self.preproc_func = preproc_func
    self.interact_pred_num = interact_pred_num
    self.model = model
    if self.ranking:
        # map response indices back to response strings
        self.responses = {el[1]: el[0] for el in responses.items()}
        self._build_preproc_responses()
        if not self.attention:
            # without attention, response embeddings can be precomputed once
            self._build_response_embeddings()
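# A minimal, self-contained sketch (an assumption for illustration, not DeepPavlov
# code) of the idea behind precomputing response embeddings for ranking: encode every
# candidate response once, then at inference time score a context against all
# candidates (here by cosine similarity) and return the `interact_pred_num` best.
import numpy as np

def rank_responses(context_emb: np.ndarray,
                   response_embs: np.ndarray,
                   top_k: int = 3) -> np.ndarray:
    # normalize so the dot product equals cosine similarity
    context = context_emb / np.linalg.norm(context_emb)
    responses = response_embs / np.linalg.norm(response_embs, axis=1, keepdims=True)
    scores = responses @ context
    # indices of the top_k highest-scoring responses, best first
    return np.argsort(scores)[::-1][:top_k]

# example with random vectors standing in for model outputs
rng = np.random.default_rng(0)
print(rank_responses(rng.normal(size=64), rng.normal(size=(10, 64)), top_k=3))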
tokenizer(['Kaggle is the best place to study machine learning.'])

train_x_lower_tokenized = str_lower(tokenizer(train_iterator.get_instances(data_type='train')[0]))

"""##Vocabulary"""

# initialize a simple vocabulary to collect all classes that appear in the dataset
classes_vocab = SimpleVocabulary(
    save_path='./tmp/classes.dict',
    load_path='./tmp/classes.dict')
classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()

# show the collected classes
list(classes_vocab.items())

# one can also collect a vocabulary of tokens that appear 2 or more times in the dataset
token_vocab = SimpleVocabulary(
    save_path='./tmp/tokens.dict',
    load_path='./tmp/tokens.dict',
    min_freq=2,
    special_tokens=('<PAD>', '<UNK>'),
    unk_token='<UNK>')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()

# number of tokens in the dictionary
len(token_vocab)
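# A rough sketch (illustrative only, not SimpleVocabulary's actual implementation)
# of what fitting such a vocabulary amounts to: count token frequencies, drop tokens
# below min_freq, and prepend special tokens so token -> index lookups can fall back
# to '<UNK>' for unseen words.
from collections import Counter

def build_vocab(tokenized_texts, min_freq=2, special_tokens=('<PAD>', '<UNK>')):
    counts = Counter(token for text in tokenized_texts for token in text)
    kept = [t for t, c in counts.most_common() if c >= min_freq]
    return {t: i for i, t in enumerate(list(special_tokens) + kept)}

t2i = build_vocab([['the', 'cat', 'sat'], ['the', 'dog', 'sat']])
print(t2i)                                                   # {'<PAD>': 0, '<UNK>': 1, 'the': 2, 'sat': 3}
print([t2i.get(t, t2i['<UNK>']) for t in ['the', 'bird']])   # [2, 1]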
for x, y in list(zip(x_train, y_train))[:3]:
    print('x:', x)
    print('y:', y)
    print('=================')

# tokenize all input data
tokenizer = NLTKMosesTokenizer()
train_x_lower_tokenized = str_lower(
    tokenizer(train_iterator.get_instances(data_type='train')[0]))

# get the intent categories
classes_vocab = SimpleVocabulary(save_path='./tmp/classes.dict',
                                 load_path='./tmp/classes.dict')
classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()
print(list(classes_vocab.items()))  # display classes

# collect the full token vocabulary
token_vocab = SimpleVocabulary(save_path='./tmp/tokens.dict',
                               load_path='./tmp/tokens.dict')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()

# we will use GloVe embeddings; download them on the first run
if not os.path.isfile("./glove.6B.100d.txt"):
    simple_download(
        url="http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
        destination="./glove.6B.100d.txt")
embedder = GloVeEmbedder(load_path='./glove.6B.100d.txt',
                         dim=100,
                         pad_zero=True)
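# An illustrative sketch (an assumption, not the GloVeEmbedder API) of what embedding
# with pad_zero=True produces: each tokenized sentence becomes a sequence of dim-sized
# vectors, and every sentence in a batch is zero-padded to the longest length.
import numpy as np

def embed_batch(batch, word_vectors, dim=100):
    max_len = max(len(tokens) for tokens in batch)
    out = np.zeros((len(batch), max_len, dim))  # zero vectors act as padding
    for i, tokens in enumerate(batch):
        for j, token in enumerate(tokens):
            out[i, j] = word_vectors.get(token, np.zeros(dim))  # OOV -> zero vector
    return out

# toy word vectors standing in for the downloaded GloVe file
vecs = {'kaggle': np.ones(100), 'rocks': np.full(100, 0.5)}
print(embed_batch([['kaggle', 'rocks'], ['kaggle']], vecs).shape)  # (2, 2, 100)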