Exemple #1
0
    def _process_input(self, data_raw):
        """Convert raw samples into per-sample vector lists, answers and masks.

        :param data_raw: iterable of mappings read through their ``"C"``
            (context), ``"Q"`` (question) and ``"A"`` (answer) entries.
            Context and question are lower-cased and split on single
            spaces; empty tokens are discarded.
        :return: ``(inputs, questions, answers, fact_counts, input_masks)``,
            five parallel lists with one entry per sample:

            * inputs / questions -- list of ``utils.process_word`` results
              (``to_return="word2vec"``), one per token;
            * answers -- ``utils.process_word`` result for the whole answer
              string (``to_return="index"``);
            * fact_counts -- ``len`` of the sample's mask;
            * input_masks -- every token position (``'word'`` mode) or the
              positions of ``'.'`` tokens (``'sentence'`` mode).
        :raises Exception: if ``self.input_mask_mode`` is neither ``'word'``
            nor ``'sentence'`` (raised while handling the first sample).
        """

        def lookup(token, what):
            # Single place for the repeated process_word keyword arguments.
            return utils.process_word(word=token,
                                      word2vec=self.word2vec,
                                      vocab=self.vocab,
                                      ivocab=self.ivocab,
                                      word_vector_size=self.word_vector_size,
                                      to_return=what)

        contexts, queries, targets = [], [], []
        mask_sizes, masks = [], []

        for sample in data_raw:
            context_tokens = [t for t in sample["C"].lower().split(' ') if t]
            question_tokens = [t for t in sample["Q"].lower().split(' ') if t]

            # Context tokens are looked up before question tokens, and the
            # answer last; the order is kept in case process_word has
            # order-dependent side effects on the vocabulary.
            context_vectors = [lookup(t, "word2vec") for t in context_tokens]
            question_vectors = [lookup(t, "word2vec") for t in question_tokens]

            mode = self.input_mask_mode
            if mode == 'word':
                mask = range(len(context_tokens))
            elif mode == 'sentence':
                mask = [
                    pos for pos, t in enumerate(context_tokens) if t == '.'
                ]
            else:
                raise Exception("unknown input_mask_mode")

            contexts.append(context_vectors)
            queries.append(question_vectors)
            # NOTE: the answer is looked up as a single word, un-split.
            targets.append(lookup(sample["A"], "index"))
            mask_sizes.append(len(mask))
            masks.append(mask)

        return contexts, queries, targets, mask_sizes, masks
Exemple #2
0
    def _process_input(self, data_raw):
        """Convert raw samples into stacked vector arrays, answers and masks.

        :param data_raw: iterable of mappings with string entries ``"C"``
            (context), ``"Q"`` (question) and ``"A"`` (answer).  All three
            are lower-cased; context and question are additionally split on
            single spaces with empty tokens removed.
        :return: ``(inputs, questions, answers, input_masks)``, four parallel
            lists with one entry per sample:

            * inputs / questions -- ``np.vstack`` of the per-token
              ``utils.process_word(..., to_return="word2vec")`` results,
              cast to ``floatX``;
            * answers -- ``utils.process_word(..., to_return="index")`` for
              the whole lower-cased answer string;
            * input_masks -- ``int32`` array of every token position
              (``'word'`` mode) or of the ``'.'`` token positions
              (``'sentence'`` mode).
        :raises Exception: if ``self.input_mask_mode`` is neither ``'word'``
            nor ``'sentence'``.
        """

        def lookup(token, what):
            # Single place for the repeated process_word keyword arguments.
            return utils.process_word(word=token,
                                      word2vec=self.word2vec,
                                      vocab=self.vocab,
                                      ivocab=self.ivocab,
                                      word_vector_size=self.word_vector_size,
                                      to_return=what)

        questions = []
        inputs = []
        answers = []
        input_masks = []
        for x in data_raw:
            inp = [w for w in x["C"].lower().split(' ') if w]
            q = [w for w in x["Q"].lower().split(' ') if w]

            inp_vector = [lookup(w, "word2vec") for w in inp]
            q_vector = [lookup(w, "word2vec") for w in q]

            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))

            # The answer is normalised the same way as context and question
            # (lower-cased) so that it is looked up in the same form as the
            # tokens it is expected to match.
            # NOTE: here we assume the answer is one word!
            answers.append(lookup(x["A"].lower(), "index"))

            if self.input_mask_mode == 'word':
                mask = np.arange(len(inp), dtype=np.int32)
            elif self.input_mask_mode == 'sentence':
                mask = np.array(
                    [index for index, w in enumerate(inp) if w == '.'],
                    dtype=np.int32)
            else:
                raise Exception("invalid input_mask_mode")
            input_masks.append(mask)

        return inputs, questions, answers, input_masks
Exemple #3
0
    def _process_input(self, data_raw):
        '''
        Convert raw samples into stacked arrays, keeping only the samples
        whose answer has exactly ``self.answer_step_nbr`` tokens.

        :param data_raw: iterable of mappings with string entries "C"
            (context), "Q" (question) and "A" (answer); each is lower-cased
            and split on single spaces, empty tokens removed.
        :return inputs: per kept sample, np.vstack of the context tokens'
            process_word results (to_return="word2vec"), cast to floatX.
        :return questions: same as inputs, for the question tokens.
        :return answers: per kept sample, np.vstack of the answer tokens'
            process_word results (to_return="index"), cast to floatX.
        :return input_masks: per kept sample, int32 array of every context
            token position ('word' mode) or of the '.' token positions
            ('sentence' mode).
        :raises Exception: if self.input_mask_mode is neither 'word' nor
            'sentence'; only reached for a sample that passes the
            answer-length filter.
        '''

        def tokenize(text):
            return [tok for tok in text.lower().split(' ') if tok]

        def lookup(token, what):
            # Single place for the repeated process_word keyword arguments.
            return utils.process_word(word=token,
                                      word2vec=self.word2vec,
                                      vocab=self.vocab,
                                      ivocab=self.ivocab,
                                      word_vector_size=self.word_vector_size,
                                      to_return=what,
                                      silent=True)

        contexts, queries, targets, masks = [], [], [], []

        for sample in data_raw:
            context_tokens = tokenize(sample["C"])
            question_tokens = tokenize(sample["Q"])
            answer_tokens = tokenize(sample["A"])

            # All three lookups run for every sample, including the ones
            # skipped below, in context -> question -> answer order.
            context_vectors = [lookup(t, "word2vec") for t in context_tokens]
            question_vectors = [lookup(t, "word2vec") for t in question_tokens]
            answer_indices = [lookup(t, "index") for t in answer_tokens]

            if len(answer_indices) != self.answer_step_nbr:
                continue  # answer length does not match; drop the sample

            contexts.append(np.vstack(context_vectors).astype(floatX))
            queries.append(np.vstack(question_vectors).astype(floatX))
            targets.append(np.vstack(answer_indices).astype(floatX))

            mode = self.input_mask_mode
            if mode == 'word':
                positions = [pos for pos, _ in enumerate(context_tokens)]
            elif mode == 'sentence':
                positions = [
                    pos for pos, tok in enumerate(context_tokens)
                    if tok == '.'
                ]
            else:
                # TODO: this should probably not be raised here...
                raise Exception("invalid input_mask_mode")
            masks.append(np.array(positions, dtype=np.int32))

        return contexts, queries, targets, masks
Exemple #4
0
    def _process_input(self, data_raw):
        '''
        Convert raw samples into stacked arrays, masks and pointer lists.

        :param data_raw: iterable of mappings with string entries "C"
            (context), "Q" (question) and "A" (answer), plus "Ps" and "Pe"
            entries that are passed through unchanged.
        :return inputs: per sample, np.vstack of the context tokens'
            process_word results (to_return="word2vec"), cast to floatX.
            Empty context tokens are removed and the list is padded back to
            its original token count with " <eoc>" entries.
        :return questions: same as inputs, for the question tokens
            (no padding).
        :return answers: per sample, np.vstack of the answer tokens'
            process_word results (to_return="index") with the last token
            dropped, cast to floatX.
        :return input_masks: per sample, int32 array of every context token
            position ('word' mode) or of the '.' token positions
            ('sentence' mode).
        :return pointers_s: the samples' "Ps" entries, in order.
        :return pointers_e: the samples' "Pe" entries, in order.
        :raises Exception: if self.input_mask_mode is neither 'word' nor
            'sentence'.
        '''

        def lookup(token, what):
            # Single place for the repeated process_word keyword arguments.
            return utils.process_word(word=token,
                                      word2vec=self.word2vec,
                                      vocab=self.vocab,
                                      ivocab=self.ivocab,
                                      word_vector_size=self.word_vector_size,
                                      to_return=what)

        questions = []
        inputs = []
        answers = []
        input_masks = []
        pointers_s = []
        pointers_e = []
        for x in data_raw:
            inp = x["C"].lower().split(' ')
            normal_len = len(inp)
            inp = [w for w in inp if len(w) > 0]
            # Restore the pre-filter token count in one step. The previous
            # loop re-measured the list with np.shape on every append, which
            # converts the whole list to an array each time.
            inp.extend([" <eoc>"] * (normal_len - len(inp)))

            q = [w for w in x["Q"].lower().split(' ') if len(w) > 0]
            ans = [w for w in x["A"].lower().split(' ') if len(w) > 0]

            pointers_s.append(x["Ps"])
            pointers_e.append(x["Pe"])

            # Lookup order (context, question, answer) is kept as-is in case
            # process_word has order-dependent side effects.
            inp_vector = [lookup(w, "word2vec") for w in inp]
            q_vector = [lookup(w, "word2vec") for w in q]

            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))

            ans_vector = [lookup(w, "index") for w in ans]

            # The last answer token is discarded.
            # NOTE(review): presumably a trailing terminator token -- confirm.
            # An answer with fewer than two tokens leaves nothing to stack
            # and np.vstack will raise ValueError.
            ans_vector = ans_vector[:-1]
            answers.append(np.vstack(ans_vector).astype(floatX))

            if self.input_mask_mode == 'word':
                mask = np.arange(len(inp), dtype=np.int32)
            elif self.input_mask_mode == 'sentence':
                # Mask is here an array containing the index of each '.'
                mask = np.array(
                    [index for index, w in enumerate(inp) if w == '.'],
                    dtype=np.int32)
            else:
                # TODO: this should probably not be raised here...
                raise Exception("invalid input_mask_mode")
            input_masks.append(mask)

        return inputs, questions, answers, input_masks, pointers_s, pointers_e
Exemple #5
0
    def _process_input(self, data_raw):
        """Convert raw multiple-choice samples into arrays, choices and masks.

        The lower-cased context of each sample must contain the marker
        tokens ``'a>'``, ``'b>'``, ``'c>'`` and ``'d>'`` in that order.  The
        tokens following each marker form one choice, cut after its first
        ``'.'`` token; everything before ``'a>'`` is the actual context.

        :param data_raw: iterable of mappings with string entries ``"C"``
            (context including the four choices), ``"Q"`` (question) and
            ``"A"`` (answer, whose first character is compared to ``'A'``).
        :return: ``(inputs, questions, answers, choices, input_masks)``, five
            parallel lists with one entry per sample:

            * inputs / questions -- ``np.vstack`` of the per-token
              ``utils.process_word(..., to_return="word2vec")`` results,
              cast to ``floatX``;
            * answers -- ``ord(x['A'][0]) - ord('A')``;
            * choices -- list of four ``floatX`` arrays, one per choice;
            * input_masks -- ``int32`` array of every context token position
              (``'word'`` mode) or of the ``'.'`` token positions
              (``'sentence'`` mode), computed on the context without the
              choices.
        :raises AssertionError: if a choice marker is missing or the markers
            are out of order.
        :raises Exception: if ``self.input_mask_mode`` is neither ``'word'``
            nor ``'sentence'``.
        """

        def lookup(token):
            # Single place for the repeated process_word keyword arguments.
            return utils.process_word(word=token,
                                      word2vec=self.word2vec,
                                      vocab=self.vocab,
                                      ivocab=self.ivocab,
                                      word_vector_size=self.word_vector_size,
                                      to_return="word2vec",
                                      silent=True)

        inputs = []
        questions = []
        choices = []
        answers = []
        input_masks = []
        for x in data_raw:
            inp = [w for w in x["C"].lower().split(' ') if len(w) > 0]
            q = [w for w in x["Q"].lower().split(' ') if len(w) > 0]

            markers = [
                self._find_first(inp, tag) for tag in ('a>', 'b>', 'c>', 'd>')
            ]
            pa, pb, pc, pd = markers
            # Raised explicitly instead of using `assert` so the check is not
            # stripped under `python -O`; the exception type is unchanged.
            if -1 in markers or not (pa < pb < pc < pd):
                raise AssertionError(
                    "choice markers 'a>', 'b>', 'c>', 'd>' must all be "
                    "present in the context, in that order")

            raw_choices = [
                inp[pa + 1:pb], inp[pb + 1:pc], inp[pc + 1:pd], inp[pd + 1:]
            ]
            trimmed_choices = []
            for choice in raw_choices:
                stop = self._find_first(choice, '.')
                # Keep up to and including the first '.'.  Without a '.' the
                # -1 sentinel used to give choice[:0], silently emptying the
                # choice; keep the whole choice instead.
                trimmed_choices.append(
                    choice if stop == -1 else choice[:stop + 1])

            # Only the text before the first choice marker is the context.
            inp = inp[:pa]

            # Lookup order (context, question, choices a-d) is kept as-is in
            # case process_word has order-dependent side effects.
            inp_vector = [lookup(w) for w in inp]
            q_vector = [lookup(w) for w in q]
            choice_vectors = [
                np.array([lookup(w) for w in choice], dtype=floatX)
                for choice in trimmed_choices
            ]

            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))
            # Answer letter -> offset from 'A' (the answer is not lower-cased).
            answers.append(ord(x['A'][0]) - ord('A'))
            choices.append(choice_vectors)

            if self.input_mask_mode == 'word':
                mask = np.arange(len(inp), dtype=np.int32)
            elif self.input_mask_mode == 'sentence':
                mask = np.array(
                    [index for index, w in enumerate(inp) if w == '.'],
                    dtype=np.int32)
            else:
                raise Exception("invalid input_mask_mode")
            input_masks.append(mask)

        return inputs, questions, answers, choices, input_masks