def load_data(self, prepared_data_file=None):
    """Load the prepared train/valid/test splits from a torch pickle file.

    Falls back to ``self.prepared_data_file`` when no path is given, wraps
    each split in a ``Dataset``, stores them on ``self.data`` and prints a
    per-split example count.
    """
    if not prepared_data_file:
        prepared_data_file = self.prepared_data_file
    print(f"Loading prepared data from {prepared_data_file} ...")
    payload = torch.load(prepared_data_file)
    # Keep insertion order train -> valid -> test so the summary line is stable.
    self.data = {split: Dataset(payload[split])
                 for split in ("train", "valid", "test")}
    summary = " ".join(f"{name.upper()}-{len(split)}"
                       for name, split in self.data.items())
    print("Number of examples:", summary)
def transform(self, data_file, batch_size, data_type="test", shuffle=False, device=None):
    """
    Transform raw text from data_file to Dataset and create data loader.
    """
    # read -> build examples -> wrap -> batch, as one pipeline
    examples = self.build_examples(self.read_data(data_file, data_type=data_type))
    return Dataset(examples).create_batches(batch_size, shuffle, device)
def load_data(self, prepared_data_file=None):
    """Load prepared train/valid/test splits from disk into ``self.data``."""
    # `or` (not an explicit None check) keeps the original falsy-path fallback.
    source = prepared_data_file or self.prepared_data_file
    print("Loading prepared data from {} ...".format(source))
    payload = torch.load(source)
    self.data = {split: Dataset(payload[split])
                 for split in ("train", "valid", "test")}
    counts = " ".join("{}-{}".format(name.upper(), len(split))
                      for name, split in self.data.items())
    print("Number of examples:", counts)
def reload(self, data_type='test'):
    """Re-read ``<data_prefix>.<data_type>`` and replace that split in ``self.data``."""
    path = os.path.join(self.data_dir, self.data_prefix + "." + data_type)
    # NOTE: the raw file is always parsed with data_type="test", whatever
    # split name was requested (matches the original behavior).
    raw = self.read_data(path, data_type="test")
    self.data[data_type] = Dataset(self.build_examples(raw))
    counts = [f"{name.upper()}-{len(split)}" for name, split in self.data.items()]
    print("Number of examples:", " ".join(counts))
def reload(self, data_type='test'):
    """Rebuild the given split from its raw file on disk.

    Training may overwrite the ``demo.test`` file; there is no need to delete
    it before testing — calling ``reload`` regenerates the split from disk.
    """
    path = os.path.join(self.data_dir, self.data_prefix + "." + data_type)
    # Raw file is always parsed in "test" mode, regardless of `data_type`.
    raw = self.read_data(path, data_type="test")
    self.data[data_type] = Dataset(self.build_examples(raw))
    print("Number of examples:",
          " ".join("{}-{}".format(name.upper(), len(split))
                   for name, split in self.data.items()))
def reload(self, data_type='test', data_file=None):
    """Reload one split of ``self.data`` from an explicitly supplied raw file.

    Unlike the other ``reload`` variants, the split is only rebuilt when
    ``data_file`` is given; otherwise only the example counts are printed.
    """
    if data_file is not None:
        # Raw file is always parsed in "test" mode, regardless of `data_type`.
        data_raw = self.read_data(data_file, data_type="test")
        data_examples = self.build_examples(data_raw)
        self.data[data_type] = Dataset(data_examples)
    # NOTE(review): print placement reconstructed from a collapsed source line —
    # assumed unconditional to match the sibling reload() variants; confirm.
    print(
        "Number of examples:",
        " ".join("{}-{}".format(k.upper(), len(v)) for k, v in self.data.items()))
def gen_response(self, contexts):
    """
    Return a list of responses to each context.

    :param contexts: list, a list of context, each context is a dict that contains the dialogue history and personal
                     profile of each speaker
                     this dict contains following keys:

                     context['dialog']: a list of string, dialogue histories (tokens in each utterances
                                        are separated using spaces).
                     context['uid']: a list of int, indices to the profile of each speaker
                     context['profile']: a list of dict, personal profiles for each speaker
                     context['responder_profile']: dict, the personal profile of the responder

    :return: list, responses for each context, each response is a list of tokens.

    e.g.
    contexts:
    [{ "dialog": [ ["How are you ?"], ["I am fine , thank you . And you ?"] ],
      "uid": [0, 1],
      "profile": [ { "loc":"Beijing", "gender":"male", "tag":"" },
                   { "loc":"Shanghai", "gender":"female", "tag":"" } ],
      "responder_profile": { "loc":"Beijing", "gender":"male", "tag":"" }
    }]

    ==>  [['I', 'am', 'fine', 'too', '!']]
    """
    # NOTE(review): only contexts[0] is consumed here even though the contract
    # above speaks of a list of contexts — presumably read_data() expands a
    # single context into one example per response; confirm against callers.
    test_raw = self.read_data(contexts[0])
    test_data = self.corpus.build_examples(test_raw, data_type='test')
    dataset = Dataset(test_data)
    # Batch size 1: one example decoded at a time on the configured device.
    data_iter = dataset.create_batches(batch_size=1, shuffle=False, device=self.config.gpu)
    results = self.generator.generate(batch_iter=data_iter)
    # Keep only the top-1 prediction per example and split it into tokens.
    res = [result.preds[0].split(" ") for result in results]
    return res
def reload(self, data_type='test'):
    """Reload the multitask test split from the paired ``.test1``/``.test2`` files."""
    path_one = os.path.join(self.data_dir, self.data_prefix + "." + 'test1')
    path_two = os.path.join(self.data_dir, self.data_prefix + "." + 'test2')
    raw_one, raw_two = self.read_data_multitask(path_one, path_two, data_type="test")
    # The split holds a pair of example sets, one per task.
    self.data[data_type] = Dataset(
        (self.build_examples(raw_one), self.build_examples(raw_two)))
    summary = " ".join("{}-{}".format(name.upper(), len(split))
                       for name, split in self.data.items())
    print("Number of examples:", summary)
def next_word_probability(self, context, partial_out):
    """
    Return probability distribution over next words given a partial true output.
    This is used to calculate the per-word perplexity.

    :param context: dict, contexts containing the dialogue history and personal
                    profile of each speaker
                    this dict contains following keys:

                    context['dialog']: a list of string, dialogue histories (tokens in each utterances
                                       are separated using spaces).
                    context['uid']: a list of int, indices to the profile of each speaker
                    context['profile']: a list of dict, personal profiles for each speaker
                    context['responder_profile']: dict, the personal profile of the responder

    :param partial_out: list, previous "true" words

    :return: a list, the first element is a dict, where each key is a word and each value is a probability
             score for that word. Unset keys assume a probability of zero.
             the second element is the probability for the EOS token

    e.g.
    context:
    { "dialog": [ ["How are you ?"], ["I am fine , thank you . And you ?"] ],
      "uid": [0, 1],
      "profile": [ { "loc":"Beijing", "gender":"male", "tag":"" },
                   { "loc":"Shanghai", "gender":"female", "tag":"" } ],
      "responder_profile": { "loc":"Beijing", "gender":"male", "tag":"" }
    }

    partial_out: ['I', 'am']

    ==>  {'fine': 0.9}, 0.1
    """
    # Build a single-example batch from the context.
    test_raw = self.read_data(context)
    test_data = self.corpus.build_examples(test_raw, data_type='test')
    dataset = Dataset(test_data)
    data_iter = dataset.create_batches(batch_size=1, shuffle=False, device=self.config.gpu)
    # Grab the first (and only) batch from the iterator.
    inputs = None
    for batch in data_iter:
        inputs = batch
        break
    # Map the forced-decoding prefix to vocabulary indices; OOV -> <unk>.
    partial_out_idx = [
        self.stoi[s] if s in self.stoi.keys() else self.stoi['<unk>']
        for s in partial_out
    ]
    # switch the model to evaluate mode
    self.model.eval()
    with torch.no_grad():
        enc_outputs, dec_init_state = self.model.encode(inputs)
        long_tensor_type = torch.cuda.LongTensor if self.config.use_gpu else torch.LongTensor
        # Initialize the input vector
        input_var = long_tensor_type([self.BOS] * 1)
        # Inflate the initial hidden states to be of size: (1, H)
        dec_state = dec_init_state.inflate(1)
        # Teacher-force the decoder through the prefix: step t consumes the
        # previous input (BOS first), then the true token t becomes the next input.
        for t in range(len(partial_out_idx)):
            # Run the RNN one step forward
            output, dec_state, attn = self.model.decode(
                input_var, dec_state)
            input_var = long_tensor_type([partial_out_idx[t]])
        # One final step on the last true token yields the next-word distribution.
        output, dec_state, attn = self.model.decode(input_var, dec_state)
        log_softmax_output = output.squeeze(1)
        log_softmax_output = log_softmax_output.cpu().numpy()
        # Convert log-probabilities back to probabilities.
        prob_output = [math.exp(i) for i in log_softmax_output[0]]
    # The first 4 tokens are: '<pad>' '<unk>' '<bos>' '<eos>'
    freq_dict = {}
    for i in range(4, len(self.itos)):
        freq_dict[self.itos[i]] = prob_output[i]
    # Index 3 is '<eos>' per the vocabulary layout noted above.
    eos_prob = prob_output[3]
    return freq_dict, eos_prob