def _process_input(self, data_raw):
    """Convert raw samples into parallel per-sample lists.

    Every element of ``data_raw`` is read through its ``"C"`` (context),
    ``"Q"`` (question) and ``"A"`` (answer) entries.  Context and question
    are lower-cased, split on single spaces (empty tokens dropped) and each
    token is handed to ``utils.process_word`` with ``to_return="word2vec"``.
    The answer is handed over unchanged with ``to_return="index"``, i.e. it
    is treated as a single word.

    Returns:
        ``(inputs, questions, answers, fact_counts, input_masks)``; each is
        a list with one entry per sample.  A mask holds the context token
        positions selected by ``self.input_mask_mode`` (all positions for
        ``'word'``, positions of ``'.'`` tokens for ``'sentence'``), and the
        matching ``fact_counts`` entry is the length of that mask.

    Raises:
        Exception: when ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'``; raised while the first sample is handled.
    """

    def tokenize(text):
        # Lower-case, split on single spaces, drop empty fragments.
        return [tok for tok in text.lower().split(' ') if tok]

    def lookup(token, kind):
        # Single place for the shared process_word keyword arguments.
        return utils.process_word(word=token,
                                  word2vec=self.word2vec,
                                  vocab=self.vocab,
                                  ivocab=self.ivocab,
                                  word_vector_size=self.word_vector_size,
                                  to_return=kind)

    inputs, questions, answers = [], [], []
    fact_counts, input_masks = [], []

    for sample in data_raw:
        context_tokens = tokenize(sample["C"])
        question_tokens = tokenize(sample["Q"])

        # Context words are looked up before question words; the answer is
        # looked up last (after the mask has been built).
        context_vectors = [lookup(tok, "word2vec") for tok in context_tokens]
        question_vectors = [lookup(tok, "word2vec") for tok in question_tokens]

        mode = self.input_mask_mode
        if mode == 'word':
            mask = range(len(context_tokens))
        elif mode == 'sentence':
            mask = [pos for pos, tok in enumerate(context_tokens)
                    if tok == '.']
        else:
            raise Exception("unknown input_mask_mode")

        inputs.append(context_vectors)
        questions.append(question_vectors)
        # NOTE: the answer is assumed to be exactly one word.
        answers.append(lookup(sample["A"], "index"))
        fact_counts.append(len(mask))
        input_masks.append(mask)

    return inputs, questions, answers, fact_counts, input_masks
def _process_input(self, data_raw):
    """Convert raw samples into parallel lists of numpy arrays.

    Every element of ``data_raw`` is read through its ``"C"`` (context),
    ``"Q"`` (question) and ``"A"`` (answer) entries.  Context and question
    are lower-cased, split on single spaces (empty tokens dropped) and each
    token is handed to ``utils.process_word`` with ``to_return="word2vec"``.
    The answer is lower-cased as well, so that it is looked up with the same
    casing as the context/question words, and is handed over as one word
    with ``to_return="index"``.

    Returns:
        ``(inputs, questions, answers, input_masks)``; each is a list with
        one entry per sample.  ``inputs`` / ``questions`` entries are the
        per-token results stacked with ``np.vstack`` and cast to ``floatX``.
        ``input_masks`` entries are ``int32`` arrays of context token
        positions: every position in ``'word'`` mode, positions of ``'.'``
        tokens in ``'sentence'`` mode.

    Raises:
        Exception: when ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'``.
    """

    def tokenize(text):
        # Lower-case, split on single spaces, drop empty fragments.
        return [tok for tok in text.lower().split(' ') if tok]

    def lookup(token, kind):
        # Single place for the shared process_word keyword arguments.
        return utils.process_word(word=token,
                                  word2vec=self.word2vec,
                                  vocab=self.vocab,
                                  ivocab=self.ivocab,
                                  word_vector_size=self.word_vector_size,
                                  to_return=kind)

    inputs = []
    questions = []
    answers = []
    input_masks = []

    for x in data_raw:
        inp = tokenize(x["C"])
        q = tokenize(x["Q"])

        # Look-up order (context, question, answer) is kept as in the
        # original in case process_word assigns indices on first sight.
        inp_vector = [lookup(w, "word2vec") for w in inp]
        q_vector = [lookup(w, "word2vec") for w in q]

        inputs.append(np.vstack(inp_vector).astype(floatX))
        questions.append(np.vstack(q_vector).astype(floatX))

        # NOTE: the answer is assumed to be exactly one word.  It is
        # lower-cased to match the casing applied to context and question.
        answers.append(lookup(x["A"].lower(), "index"))

        if self.input_mask_mode == 'word':
            # Every token position is a fact boundary.
            input_masks.append(np.arange(len(inp), dtype=np.int32))
        elif self.input_mask_mode == 'sentence':
            # Only sentence-ending '.' tokens are fact boundaries.
            input_masks.append(
                np.array([index for index, w in enumerate(inp) if w == '.'],
                         dtype=np.int32))
        else:
            raise Exception("invalid input_mask_mode")

    return inputs, questions, answers, input_masks
def _process_input(self, data_raw):
    '''
    Convert raw samples into parallel lists of numpy arrays, keeping only
    the samples whose answer has exactly ``self.answer_step_nbr`` tokens.

    Context (``"C"``), question (``"Q"``) and answer (``"A"``) are
    lower-cased and split on single spaces (empty tokens dropped).  Context
    and question tokens go through ``utils.process_word`` with
    ``to_return="word2vec"``, answer tokens with ``to_return="index"``.
    All three look-ups run for every sample, including the ones that are
    later skipped because of their answer length.

    :param data_raw: iterable of raw samples (train or test set), each
        indexable by ``"C"``, ``"Q"`` and ``"A"``.
    :return inputs: per kept sample, the context look-ups stacked with
        ``np.vstack`` and cast to ``floatX``.
    :return questions: same as ``inputs`` for the question tokens.
    :return answers: per kept sample, the answer look-ups stacked with
        ``np.vstack`` and cast to ``floatX``.
    :return input_masks: per kept sample, an ``int32`` array of context
        token positions: every position in ``'word'`` mode, positions of
        ``'.'`` tokens in ``'sentence'`` mode.
    :raises Exception: if ``self.input_mask_mode`` is neither ``'word'``
        nor ``'sentence'``.
    '''
    # Validate the configuration once, before any sample is touched, instead
    # of discovering an invalid mode in the middle of the loop.
    mask_mode = self.input_mask_mode
    if mask_mode not in ('word', 'sentence'):
        raise Exception("invalid input_mask_mode")

    def tokenize(text):
        # Lower-case, split on single spaces, drop empty fragments.
        return [tok for tok in text.lower().split(' ') if tok]

    def lookup(token, kind):
        # Single place for the shared process_word keyword arguments.
        return utils.process_word(word=token,
                                  word2vec=self.word2vec,
                                  vocab=self.vocab,
                                  ivocab=self.ivocab,
                                  word_vector_size=self.word_vector_size,
                                  to_return=kind,
                                  silent=True)

    questions = []
    inputs = []
    answers = []
    input_masks = []

    for x in data_raw:
        inp = tokenize(x["C"])
        q = tokenize(x["Q"])
        ans = tokenize(x["A"])

        # Look-up order (context, question, answer) is kept as in the
        # original in case process_word assigns indices on first sight.
        inp_vector = [lookup(w, "word2vec") for w in inp]
        q_vector = [lookup(w, "word2vec") for w in q]
        ans_vector = [lookup(w, "index") for w in ans]

        # Only answers with the configured number of steps are kept.
        if len(ans_vector) != self.answer_step_nbr:
            continue

        inputs.append(np.vstack(inp_vector).astype(floatX))
        questions.append(np.vstack(q_vector).astype(floatX))
        answers.append(np.vstack(ans_vector).astype(floatX))

        # The mask is appended together with the input so that the four
        # returned lists stay index-aligned.
        if mask_mode == 'word':
            input_masks.append(
                np.array([index for index, w in enumerate(inp)],
                         dtype=np.int32))
        else:  # 'sentence'
            input_masks.append(
                np.array([index for index, w in enumerate(inp) if w == '.'],
                         dtype=np.int32))

    return inputs, questions, answers, input_masks
def _process_input(self, data_raw):
    '''
    Convert raw samples into parallel lists of numpy arrays plus the
    start/end pointers carried by each sample.

    The context (``"C"``) is lower-cased and split on single spaces; empty
    tokens are dropped and the list is then padded back to its pre-filter
    length with the literal token ``" <eoc>"``.  Question (``"Q"``) and
    answer (``"A"``) are lower-cased, split and filtered the same way but
    not padded.  Context and question tokens go through
    ``utils.process_word`` with ``to_return="word2vec"``, answer tokens
    with ``to_return="index"``.

    :param data_raw: iterable of raw samples (train or test set), each
        indexable by ``"C"``, ``"Q"``, ``"A"``, ``"Ps"`` and ``"Pe"``.
    :return inputs: per sample, the context look-ups stacked with
        ``np.vstack`` and cast to ``floatX``.
    :return questions: same as ``inputs`` for the question tokens.
    :return answers: per sample, the answer look-ups *without the last
        token*, stacked with ``np.vstack`` and cast to ``floatX``.
    :return input_masks: per sample, an ``int32`` array of (padded) context
        token positions: every position in ``'word'`` mode, positions of
        ``'.'`` tokens in ``'sentence'`` mode.
    :return pointers_s: the ``x["Ps"]`` value of every sample.
    :return pointers_e: the ``x["Pe"]`` value of every sample.
    :raises Exception: if ``self.input_mask_mode`` is neither ``'word'``
        nor ``'sentence'``.
    '''
    # Validate the configuration once, before any sample is touched, instead
    # of discovering an invalid mode in the middle of the loop.
    mask_mode = self.input_mask_mode
    if mask_mode not in ('word', 'sentence'):
        raise Exception("invalid input_mask_mode")

    def lookup(token, kind):
        # Single place for the shared process_word keyword arguments.
        return utils.process_word(word=token,
                                  word2vec=self.word2vec,
                                  vocab=self.vocab,
                                  ivocab=self.ivocab,
                                  word_vector_size=self.word_vector_size,
                                  to_return=kind)

    questions = []
    inputs = []
    answers = []
    input_masks = []
    pointers_s = []
    pointers_e = []

    for x in data_raw:
        raw_tokens = x["C"].lower().split(' ')
        inp = [w for w in raw_tokens if w]
        # Restore the pre-filter length so token positions keep lining up
        # with the raw split.  The pad token is the literal " <eoc>",
        # leading space included.
        # NOTE(review): presumably Ps/Pe index into the raw split - confirm.
        inp.extend([" <eoc>"] * (len(raw_tokens) - len(inp)))

        q = [w for w in x["Q"].lower().split(' ') if w]
        ans = [w for w in x["A"].lower().split(' ') if w]

        pointers_s.append(x["Ps"])
        pointers_e.append(x["Pe"])

        # Look-up order (context, question, answer) is kept as in the
        # original in case process_word assigns indices on first sight.
        inp_vector = [lookup(w, "word2vec") for w in inp]
        q_vector = [lookup(w, "word2vec") for w in q]
        inputs.append(np.vstack(inp_vector).astype(floatX))
        questions.append(np.vstack(q_vector).astype(floatX))

        ans_vector = [lookup(w, "index") for w in ans]
        # The last answer token is dropped, as in the original code.
        # NOTE(review): with a one-token answer this leaves an empty list
        # and np.vstack raises ValueError - confirm answers always carry a
        # trailing token that is meant to be discarded.
        ans_vector = ans_vector[:-1]
        answers.append(np.vstack(ans_vector).astype(floatX))

        if mask_mode == 'word':
            input_masks.append(
                np.array([index for index, w in enumerate(inp)],
                         dtype=np.int32))
        else:  # 'sentence': the mask holds the index of every '.' token
            input_masks.append(
                np.array([index for index, w in enumerate(inp) if w == '.'],
                         dtype=np.int32))

    return inputs, questions, answers, input_masks, pointers_s, pointers_e
def _process_input(self, data_raw):
    """Convert raw multiple-choice samples into parallel per-sample lists.

    The lower-cased, space-split context (``"C"``) is expected to contain
    the marker tokens ``'a>'``, ``'b>'``, ``'c>'`` and ``'d>'`` in that
    order.  The tokens before ``'a>'`` form the actual input; the tokens
    after each marker (up to the next marker, or to the end for ``'d>'``)
    form the four choices.  Each choice is cut after its first ``'.'``
    token; a choice that contains no ``'.'`` is kept whole instead of being
    truncated to nothing.

    Input, question (``"Q"``) and choice tokens go through
    ``utils.process_word`` with ``to_return="word2vec"``.

    Returns:
        ``(inputs, questions, answers, choices, input_masks)``; each is a
        list with one entry per sample.  ``inputs`` / ``questions`` entries
        are the per-token look-ups stacked with ``np.vstack`` and cast to
        ``floatX``.  An ``answers`` entry is the offset of the first
        character of ``x['A']`` from ``'A'`` (``'A'`` -> 0, ``'B'`` -> 1,
        ...).  A ``choices`` entry is a list of four ``floatX`` arrays.
        ``input_masks`` entries are ``int32`` arrays of input token
        positions: every position in ``'word'`` mode, positions of ``'.'``
        tokens in ``'sentence'`` mode.

    Raises:
        AssertionError: if a choice marker is missing or the markers are
            out of order.
        Exception: when ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'``.
    """

    def lookup(token):
        # Single place for the shared process_word keyword arguments.
        return utils.process_word(word=token,
                                  word2vec=self.word2vec,
                                  vocab=self.vocab,
                                  ivocab=self.ivocab,
                                  word_vector_size=self.word_vector_size,
                                  to_return="word2vec",
                                  silent=True)

    def first_sentence(tokens):
        # Keep everything up to and including the first '.'.  The marker
        # assertion below treats -1 as _find_first's "not found" result; in
        # that case the original slice would have been tokens[:0], i.e.
        # empty, so the whole choice is returned instead.
        end = self._find_first(tokens, '.')
        if end == -1:
            return tokens
        return tokens[:end + 1]

    inputs = []
    questions = []
    choices = []
    answers = []
    input_masks = []

    for x in data_raw:
        inp = [w for w in x["C"].lower().split(' ') if w]
        q = [w for w in x["Q"].lower().split(' ') if w]

        pa = self._find_first(inp, 'a>')
        pb = self._find_first(inp, 'b>')
        pc = self._find_first(inp, 'c>')
        pd = self._find_first(inp, 'd>')
        assert (pa != -1 and pb != -1 and pc != -1 and pd != -1
                and pa < pb < pc < pd), \
            "choice markers 'a>'..'d>' missing or out of order"

        ca = first_sentence(inp[pa + 1:pb])
        cb = first_sentence(inp[pb + 1:pc])
        cc = first_sentence(inp[pc + 1:pd])
        cd = first_sentence(inp[pd + 1:])
        # Everything from the first marker on belongs to the choices.
        inp = inp[:pa]

        # Look-up order (input, question, choices a-d) is kept as in the
        # original in case process_word assigns indices on first sight.
        inp_vector = [lookup(w) for w in inp]
        q_vector = [lookup(w) for w in q]
        choice_vectors = [
            np.array([lookup(w) for w in choice], dtype=floatX)
            for choice in (ca, cb, cc, cd)
        ]

        inputs.append(np.vstack(inp_vector).astype(floatX))
        questions.append(np.vstack(q_vector).astype(floatX))
        # The answer is encoded by its first character only.
        answers.append(ord(x['A'][0]) - ord('A'))
        choices.append(choice_vectors)

        if self.input_mask_mode == 'word':
            input_masks.append(
                np.array([index for index, w in enumerate(inp)],
                         dtype=np.int32))
        elif self.input_mask_mode == 'sentence':
            input_masks.append(
                np.array([index for index, w in enumerate(inp) if w == '.'],
                         dtype=np.int32))
        else:
            raise Exception("invalid input_mask_mode")

    return inputs, questions, answers, choices, input_masks