def lookup(self, input):
    """Assemble a flattened batch of utterance embeddings for a list of conversations.

    For each conversation id, appends that conversation's precomputed
    utterance embeddings, then pads with random vectors so every
    conversation occupies exactly ``max_num_utterances`` slots.

    Args:
        input: batch dict with keys ``conversation_ids`` (list of ids into
            ``self.embeddings``), ``input_mask`` (nested float list) and
            ``max_num_utterances`` (int).

    Returns:
        Tuple of (FloatTensor of stacked utterance embeddings,
        FloatTensor input mask).
    """
    input_mask = FloatTensor(input["input_mask"])
    max_num_utterances_batch = input['max_num_utterances']
    batch_embeddings = []
    # NOTE(review): the original also read input['utterance_ids_list'] but
    # never used it; the unused read has been dropped.
    for conversation_id in input["conversation_ids"]:
        embeddings = self.embeddings[conversation_id]
        batch_embeddings += embeddings
        # Pad short conversations with random vectors so the flattened
        # batch is rectangular (padded slots are masked out downstream).
        pad_count = max_num_utterances_batch - len(embeddings)
        batch_embeddings += [np.random.rand(self.args.embed_size).tolist()
                             for _ in range(pad_count)]
    batch_embedding_tensor = FloatTensor(batch_embeddings)
    return batch_embedding_tensor, input_mask
def lookup_by_name(self, input, name_embed, name_mask):
    """Variant of ``lookup`` with configurable batch-dict keys.

    Args:
        input: batch dict; must also contain ``max_num_utterances``.
        name_embed: key under which the conversation-id list is stored.
        name_mask: key under which the input mask is stored.

    Returns:
        Tuple of (FloatTensor of stacked utterance embeddings,
        FloatTensor input mask).
    """
    input_mask = FloatTensor(input[name_mask])  # Generally remains the same
    max_num_utterances_batch = input['max_num_utterances']
    batch_embeddings = []
    for conversation_id in input[name_embed]:
        embeddings = self.embeddings[conversation_id]
        batch_embeddings += embeddings
        # Pad short conversations with random vectors so the flattened
        # batch is rectangular (padded slots are masked out downstream).
        pad_count = max_num_utterances_batch - len(embeddings)
        batch_embeddings += [np.random.rand(self.args.embed_size).tolist()
                             for _ in range(pad_count)]
    batch_embedding_tensor = FloatTensor(batch_embeddings)
    return batch_embedding_tensor, input_mask
def vectorize(self, batch, mode = "train"): ## TODO: Get single example, abstract out batchification batch_size = int(len(batch['utterance_list']) / batch['max_num_utterances']) max_num_utterances_batch = batch['max_num_utterances'] max_utterance_length = batch['max_utterance_length'] ## Prepare Token Embeddings token_embeddings, token_mask = self.token_encoder.lookup(batch) if self.args.use_cuda: token_embeddings = token_embeddings.cuda() input_mask_variable = variable(token_mask) ## Prepare Utterance Encoder ## Prepare Conversation Encoder ## TODO: Abstraction similar to token embeddings conversation_lengths = batch['conversation_lengths'] conversation_mask = variable(FloatTensor(batch['conversation_mask'])) ## Prepare Ouput (If exists) gold_next_bow_vectors = LongTensor(batch['next_bow_list']) gold_prev_bow_vectors = LongTensor(batch['prev_bow_list']) gold_next_bow_mask = LongTensor(batch['next_bow_mask']) gold_prev_bow_mask = LongTensor(batch['prev_bow_mask']) utterance_labels = LongTensor(batch['label']) if mode == "train": return batch_size, token_embeddings, input_mask_variable, conversation_mask, max_num_utterances_batch, \ gold_next_bow_mask, gold_prev_bow_mask, gold_next_bow_vectors, gold_prev_bow_vectors, utterance_labels else: return batch_size, token_embeddings, input_mask_variable, conversation_mask, max_num_utterances_batch
def vectorize(self, batch, mode="train"): batch_size = int( len(batch['utterance_list']) / batch['max_num_utterances']) max_num_utterances_batch = batch['max_num_utterances'] max_utterance_length = batch['max_utterance_length'] ## Prepare Token Embeddings # TODO: Batch has dummy utternances that need to be specifically handled incase of average elmo token_embeddings, token_mask = self.token_encoder.lookup(batch) if self.args.use_cuda: token_embeddings = token_embeddings.cuda() input_mask_variable = variable(token_mask) conversation_lengths = batch['conversation_lengths'] conversation_mask = variable(FloatTensor(batch['conversation_mask'])) ## For decoder prepare initial state conversation_ids = batch['utterance_word_ids'] start_state = variable(LongTensor([self.vocabulary.sos] * batch_size)) input = {} input["start_token_ids"] = start_state start_encoding = self.token_encoder.lookup_by_name( input, "start_token_ids") # Max utterance length will be the same for next and previous utterance lists as well # Needs access to the token encoder itself if mode == "train": return batch_size, token_embeddings, input_mask_variable, conversation_mask, \ max_num_utterances_batch, max_utterance_length, \ start_encoding, conversation_ids else: return batch_size, token_embeddings, input_mask_variable, conversation_mask, \ max_num_utterances_batch, max_utterance_length, \ start_encoding
def vectorize(self, batch, mode="train"): ## TODO: Get single example, abstract out batchification batch_size = int( len(batch['utterance_list']) / batch['max_num_utterances']) max_num_utterances_batch = batch['max_num_utterances'] ## Prepare Token Embeddings token_embeddings, token_mask = self.token_encoder.lookup(batch) if self.args.use_cuda: token_embeddings = token_embeddings.cuda() input_mask_variable = variable(token_mask) ## Prepare Utterance Encoder ## Prepare Conversation Encoder ## TODO: Abstraction similar to token embeddings conversation_lengths = batch['conversation_lengths'] conversation_mask = variable(FloatTensor(batch['conversation_mask'])) ## Prepare Ouput (If exists) ## TODO: Eliminate options tensor to make faster options_tensor = LongTensor(batch['utterance_options_list']) goldids_next_variable = LongTensor(batch['next_utterance_gold']) goldids_prev_variable = LongTensor(batch['prev_utterance_gold']) if mode == "train": return batch_size, token_embeddings, input_mask_variable, conversation_mask, max_num_utterances_batch, \ options_tensor, goldids_next_variable, goldids_prev_variable else: return batch_size, token_embeddings, input_mask_variable, conversation_mask, max_num_utterances_batch, \ options_tensor
def lookup_by_name(self, input, name_embed, name_mask = None):
    """Embed the token ids stored under ``name_embed`` in the batch dict.

    Args:
        input: batch dict.
        name_embed: key whose value is a (nested) list of token ids.
        name_mask: optional key for the float input mask.

    Returns:
        Tuple of (embedded tokens, FloatTensor mask or ``None`` when no
        mask key was supplied or present).
    """
    input_token_ids = LongTensor(input[name_embed])
    utterance_embeddings = self.embed_layer(input_token_ids)
    # Guard on `is not None` first: with the default name_mask=None the
    # original `name_mask in input` would probe for a literal None key.
    if name_mask is not None and name_mask in input:
        input_mask = FloatTensor(input[name_mask])
    else:
        input_mask = None
    return utterance_embeddings, input_mask
def vectorize(self, batch, mode="train"): batch_size = int( len(batch['utterance_list']) / batch['max_num_utterances']) max_num_utterances_batch = batch['max_num_utterances'] # TODO: Batch has dummy utternances that need to be specifically handled incase of average elmo token_embeddings, token_mask = self.token_encoder.lookup(batch) if self.args.use_cuda: token_embeddings = token_embeddings.cuda() input_mask_variable = variable(token_mask) conversation_lengths = batch['conversation_lengths'] conversation_mask = variable(FloatTensor(batch['conversation_mask'])) if mode == "train": return batch_size, token_embeddings, input_mask_variable, conversation_mask, max_num_utterances_batch else: return batch_size, token_embeddings, input_mask_variable, conversation_mask, max_num_utterances_batch
def vectorize(self, batch, mode = "train"): batch_size = int(len(batch['utterance_list']) / batch['max_num_utterances']) max_num_utterances_batch = batch['max_num_utterances'] max_utterance_length = batch['max_utterance_length'] ## Prepare Token Embeddings token_embeddings, token_mask = self.token_encoder.lookup(batch) if self.args.use_cuda: token_embeddings = token_embeddings.cuda() input_mask_variable = variable(token_mask) conversation_lengths = batch['conversation_lengths'] conversation_mask = variable(FloatTensor(batch['conversation_mask'])) ## Prepare Ouput (If exists) bow_list = LongTensor(batch['utterance_bow_list']) bow_mask = LongTensor(batch['utterance_bow_mask']) if mode == "train": return batch_size, token_embeddings, input_mask_variable, conversation_mask, max_num_utterances_batch, \ bow_list, bow_mask else: return batch_size, token_embeddings, input_mask_variable, conversation_mask, max_num_utterances_batch
def lookup(self, input):
    """Embed the batch's utterance token ids and fetch its input mask.

    Args:
        input: batch dict with keys ``utterance_word_ids`` and ``input_mask``.

    Returns:
        Tuple of (embedded utterance tokens, FloatTensor input mask).
    """
    token_ids = LongTensor(input["utterance_word_ids"])
    embedded = self.embed_layer(token_ids)
    mask = FloatTensor(input["input_mask"])
    return embedded, mask