def next_batch(self, batch_size): """ """ video_ids, question_strs = [], [], questions, question_masks, question_lens = [], [], [], passages, passage_masks, passage_lens = [], [], [], scores = [] examples = self.data.next_items(batch_size) for video_id, question_str, passage_str, score in examples: video_ids.append(video_id) question_strs.append(question_str) # indexing passages passage = indexes_from_sentence(passage_str, self.dictionary, self.max_p_len) passage_mask = self._masking(passage) passage_len = min(len(word_tokenize(passage_str)), self.max_p_len) # indexing questions question = indexes_from_sentence(question_str, self.dictionary, self.max_q_len) question_mask = self._masking(question) question_len = min(len(word_tokenize(question_str)), self.max_q_len) # add to batch passages.append(passage) passage_masks.append(passage_mask) passage_lens.append(passage_len) questions.append(question) question_masks.append(question_mask) question_lens.append(question_len) scores.append(score) # build torch.tensor passages = torch.tensor(passages) passage_masks = torch.tensor(passage_masks) passage_lens = torch.tensor(passage_lens) questions = torch.tensor(questions) question_masks = torch.tensor(question_masks) question_lens = torch.tensor(question_lens) scores = torch.tensor(scores) if torch.cuda.is_available(): passages = passages.long().cuda(0) passage_masks = passage_masks.long().cuda(0) passage_lens = passage_lens.long().cuda(0) questions = questions.long().cuda(0) question_masks = question_masks.long().cuda(0) question_lens = question_lens.long().cuda(0) scores = scores.long().cuda(0) return video_ids, question_strs, \ questions, question_masks, question_lens, \ passages, passage_masks, passage_lens, \ scores
def next_batch(self, batch_size):
    """Build a batch of sentence-segmented passages, questions, and answer spans, as tensors."""
    pids, passages, passage_masks, passage_lens, sent_lens = [], [], [], [], []
    questions, question_masks, question_lens = [], [], []
    answers = []

    examples = self.data.next_items(batch_size)
    for transcript, question_str, answer_start, answer_end in examples:
        # indexing passages
        passage = [
            indexes_from_sentence(sent, self.dictionary, self.max_sent_len)
            for sent in transcript
        ]
        passage_len = len(transcript)
        passage_mask = [self._masking(sent) for sent in passage]
        sent_len = [
            min(len(word_tokenize(sent)), self.max_sent_len) for sent in transcript
        ]

        # padding passage up to max_p_len sentences
        for _ in range(self.max_p_len - len(passage)):
            dummy_sent = indexes_from_sentence("temp", self.dictionary, self.max_sent_len)
            # dummy_sent = [self.dictionary.PAD_token] * self.max_sent_len
            passage.append(dummy_sent)
            zero_mask = [0] * self.max_sent_len
            passage_mask.append(zero_mask)
            # sent_len.append(0)
            # zero-length causes a problem; for now, we give 1.
            sent_len.append(1)

        # indexing questions
        question = indexes_from_sentence(question_str, self.dictionary, self.max_sent_len)
        question_mask = self._masking(question)
        question_len = min(len(word_tokenize(question_str)), self.max_sent_len)

        # indexing answer
        answer = encode_answer_index(answer_start, answer_end, self.max_p_len)

        # add to batch
        pids.append(uuid.uuid4())  # not really used
        passages.append(passage)
        passage_masks.append(passage_mask)
        passage_lens.append(passage_len)
        sent_lens.append(sent_len)
        questions.append(question)
        question_masks.append(question_mask)
        question_lens.append(question_len)
        answers.append(answer)

    # build torch.tensor
    passages = torch.tensor(passages)
    passage_masks = torch.tensor(passage_masks)
    passage_lens = torch.tensor(passage_lens)
    sent_lens = torch.tensor(sent_lens)
    questions = torch.tensor(questions)
    question_masks = torch.tensor(question_masks)
    question_lens = torch.tensor(question_lens)
    answers = torch.tensor(answers)

    if torch.cuda.is_available():
        passages = passages.long().cuda(0)
        passage_masks = passage_masks.long().cuda(0)
        passage_lens = passage_lens.long().cuda(0)
        sent_lens = sent_lens.long().cuda(0)
        questions = questions.long().cuda(0)
        question_masks = question_masks.long().cuda(0)
        question_lens = question_lens.long().cuda(0)
        answers = answers.long().cuda(0)

    return pids, passages, passage_masks, passage_lens, sent_lens, \
        questions, question_masks, question_lens, \
        answers
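# A minimal sketch of the `_masking` helper used by both next_batch variants
# (assumption): it is taken to return a 0/1 mask over an already-padded index
# list, where padded positions hold self.dictionary.PAD_token (as the
# commented-out dummy_sent alternative above suggests).
def _masking(self, indexes):
    return [0 if index == self.dictionary.PAD_token else 1 for index in indexes]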
def _shorten_sent(self, sent):
    """Truncate a sentence to at most max_sent_len tokens."""
    tokens = word_tokenize(sent)
    new_sent = " ".join(tokens[:self.max_sent_len])
    return new_sent
def _add_to_vocab(sent, all_words):
    """Add the tokens of a sentence to the vocabulary set."""
    tokens = word_tokenize(sent)
    for token in tokens:
        all_words.add(token)
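# Usage sketch (assumption): collecting a word set over an iterable of raw
# sentences with the helper above; the resulting set could then seed the
# dictionary passed to indexes_from_sentence. `build_vocab` is an illustrative name.
def build_vocab(sentences):
    all_words = set()
    for sent in sentences:
        _add_to_vocab(sent, all_words)
    return all_words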