def _process_input(data_raw, word2vec, vocab, ivocab, word_vector_size):
    sent_len = []
    gate_len = []
    for x in data_raw:
        inp = x["C"].lower().split(' ') 
        inp = [w for w in inp if len(w) > 0]
        q = x["Q"].lower().split(' ')
        q = [w for w in q if len(w) > 0]
        
        inp_vector = [utils.process_word(word = w.lower(), 
                                    word2vec = word2vec, 
                                    vocab = vocab, 
                                    ivocab = ivocab, 
                                    word_vector_size = word_vector_size, 
                                    to_return = "word2vec") for w in inp]
        sent_len.append(len(inp_vector))                            
        q_vector = [utils.process_word(word = w.lower(), 
                                    word2vec = word2vec, 
                                    vocab = vocab, 
                                    ivocab = ivocab, 
                                    word_vector_size = word_vector_size, 
                                    to_return = "word2vec") for w in q]        
        utils.process_word(word = x["A"].lower(), # TODO: add .lower() here 
                                    word2vec = word2vec, 
                                    vocab = vocab, 
                                    ivocab = ivocab, 
                                    word_vector_size = word_vector_size, 
                                    to_return = "index")
        gate_len.append(len(x["S"]))                       
    return sent_len, gate_len
Beispiel #2
0
def _process_input(data_raw, word2vec, vocab, ivocab, word_vector_size):
    sent_len = []
    gate_len = []
    for x in data_raw:
        inp = x["C"].lower().split(' ')
        inp = [w for w in inp if len(w) > 0]
        q = x["Q"].lower().split(' ')
        q = [w for w in q if len(w) > 0]

        inp_vector = [
            utils.process_word(word=w.lower(),
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec") for w in inp
        ]
        sent_len.append(len(inp_vector))
        q_vector = [
            utils.process_word(word=w.lower(),
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec") for w in q
        ]
        utils.process_word(
            word=x["A"].lower(),  # TODO: add .lower() here 
            word2vec=word2vec,
            vocab=vocab,
            ivocab=ivocab,
            word_vector_size=word_vector_size,
            to_return="index")
        gate_len.append(len(x["S"]))
    return sent_len, gate_len
Beispiel #3
0
def process_input(data1):
    """Convert rows of sentence pairs into stacked word-vector matrices.

    Relies on module-level ``repl``, ``utils``, ``word2vec``, ``vocab``,
    ``ivocab`` and ``word_vector_size``.

    Args:
        data1: indexable sequence of rows; ``row[0]`` and ``row[1]`` are
            strings.

    Returns:
        Tuple ``(q, a, sent)`` of ``np.asarray`` results: the stacked
        vectors of ``row[0]``, the stacked vectors of ``row[1]``, and the
        unmodified ``row[1]`` values.
    """
    def tokenize(text):
        # strip -> repl -> lowercase -> split on single spaces, no empties
        cleaned = repl(text.strip())
        return [tok for tok in cleaned.lower().split(' ') if tok]

    def vectorize(tokens):
        return [utils.process_word(word=tok,
                                   word2vec=word2vec,
                                   vocab=vocab,
                                   ivocab=ivocab,
                                   word_vector_size=word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for tok in tokens]

    q, a, sent = [], [], []
    for idx in range(len(data1)):
        row = data1[idx]
        first_text = row[0]
        second_text = row[1]

        first_rows = vectorize(tokenize(first_text))
        second_rows = vectorize(tokenize(second_text))

        q.append(np.vstack(first_rows))
        a.append(np.vstack(second_rows))
        # The second sentence is kept exactly as it appeared in the row.
        sent.append(second_text)

    return np.asarray(q), np.asarray(a), np.asarray(sent)
Beispiel #4
0
def encode_document(doc, vocab_2_idx, sos='<sos>', eos='<eos>'):
    '''
  Encode a document (string) using the given mapping (vocab_2_idx).

  The document is split on single spaces. Each word is segmented by
  u.process_word (trigram pass, per the original author's note), segments
  whose flag is falsy are re-segmented with n=2, and segments that are
  still flagged falsy are broken into single characters. A space index is
  emitted between words.

  Params:
    * doc : string, document to encode; tokens are assumed to be separated
      by a single space
    * vocab_2_idx : dictionary, string to index (must contain ' ', sos, eos)
    * sos (optional) : string, Start Of Sentence token
    * eos (optional) : string, End Of Sentence token

  Returns:
    * doc_encoded : list of int, framed by the sos and eos indices
  '''
    indices = []
    for word in doc.split(' '):
        # first pass: helper defaults (trigrams)
        pieces = u.process_word(word, vocab_2_idx)

        # second pass: retry unresolved pieces as bigrams, replacing them
        # in place with the helper's result
        for pos, piece in enumerate(pieces):
            if piece[1]:
                continue
            pieces[pos] = u.process_word(piece[0], vocab_2_idx, n=2)
        pieces = u.flat_list(pieces)

        # third pass: anything still unresolved becomes single characters
        expanded = []
        for piece in pieces:
            if piece[1]:
                expanded.append(piece)
            else:
                expanded.append([(char, True) for char in piece[0]])
        pieces = u.flat_list(expanded)

        indices.extend([vocab_2_idx[piece] for piece, _ in pieces])
        indices.append(vocab_2_idx[' '])

    # Drop the space that follows the final word before framing.
    return [vocab_2_idx[sos]] + indices[:-1] + [vocab_2_idx[eos]]
Beispiel #5
0
    def _process_input(self, data_raw):
        questions = []
        inputs = []
        answers = []
        gates = []
        input_masks = []
        for x in data_raw:
            inp = x["C"].lower().split(' ')
            inp = [w for w in inp if len(w) > 0]
            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]

            inp_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec") for w in inp
            ]

            q_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec") for w in q
            ]

            inputs.append(
                np.vstack(inp_vector).astype(floatX))  #(seq_len, vocab)
            questions.append(np.vstack(q_vector).astype(floatX))
            answers.append(
                utils.process_word(
                    word=x["A"],  # TODO: add .lower() here 
                    word2vec=self.word2vec,
                    vocab=self.vocab,
                    ivocab=self.ivocab,
                    word_vector_size=self.word_vector_size,
                    to_return="index"))
            # NOTE: here we assume the answer is one word!
            if self.input_mask_mode == 'word':
                mask = [index for index, w in enumerate(inp)]
            elif self.input_mask_mode == 'sentence':
                mask = [index for index, w in enumerate(inp) if w == '.']
            else:
                raise Exception("invalid input_mask_mode")
            mask.append(mask[-1] + 1)
            input_masks.append(np.array(mask, dtype=np.int32))

            gate = [w for w in x["S"]]
            for i in xrange(len(gate), self.memory_hops):
                gate.append(len(mask) - 1)
            gates.append(np.array(gate, dtype=np.int32))

        return inputs, questions, answers, input_masks, gates
Beispiel #6
0
def _process_input(data_raw):
        questions = []
        inputs = []
        answers = []
        input_masks = []
	supp_fact = []

        for x in data_raw:
            inp = x["C"].lower().split(' ') 
            inp = [w for w in inp if len(w) > 0]
            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]
            sf = x["SF"]
	    SF = []
	    for i in range(0,hops):
		if i<len(sf):
	    		SF.append(sf[i])
		else:
			SF.append(sf[len(sf)-1])
            inp_vector = [utils.process_word(word = w, 
                                        word2vec = word2vec, 
                                        vocab = vocab, 
                                        ivocab = ivocab, 
                                        word_vector_size = word_vector_size, 
                                        to_return = "word2vec") for w in inp]
                                        
            q_vector = [utils.process_word(word = w, 
                                        word2vec = word2vec, 
                                        vocab = vocab, 
                                        ivocab = ivocab, 
                                        word_vector_size = word_vector_size, 
                                        to_return = "word2vec") for w in q]
            
            inputs.append(np.vstack(inp_vector))
            questions.append(np.vstack(q_vector))
            supp_fact.append(SF)
            answers.append(utils.process_word(word = x["A"], 
                                            word2vec = word2vec, 
                                            vocab = vocab, 
                                            ivocab = ivocab, 
                                            word_vector_size = word_vector_size, 
                                            to_return = "index"))
            # NOTE: here we assume the answer is one word! 
            if input_mask_mode == 'word':
                input_masks.append(np.array([index for index, w in enumerate(inp)], dtype=np.int32)) 
            elif input_mask_mode == 'sentence': 
                input_masks.append(np.array([index for index, w in enumerate(inp) if w == '.'], dtype=np.int32)) 
            else:
                raise Exception("invalid input_mask_mode")
        inputs = np.asarray(inputs)
	questions = np.asarray(questions)
	answers = np.asarray(answers)
	input_masks = np.asarray(input_masks)
	supp_fact = np.asarray(supp_fact)
        return inputs, questions, answers, input_masks, supp_fact
    def _process_input(self, data_raw):
        questions = []
        inputs = []
        answers = []
        fact_counts = []
        input_masks = []
        img_features=[]
        for x in data_raw:
            #inp = x["C"].lower().split(' ')
            x["C"]=x["C"].lower()
            inp=re.split("[, \-!?:'\/]+",x["C"])
            inp = [w for w in inp if len(w) > 0]
            x["Q"]=x["Q"].lower()
            q = re.split("[, \-!?:'\/]+",x["Q"])
            q = [w for w in q if len(w) > 0]

            inp_vector = [utils.process_word(word = w,
                                        word2vec = self.word2vec,
                                        vocab = self.vocab,
                                        ivocab = self.ivocab,
                                        word_vector_size = self.word_vector_size,
                                        to_return = "word2vec") for w in inp]

            q_vector = [utils.process_word(word = w,
                                        word2vec = self.word2vec,
                                        vocab = self.vocab,
                                        ivocab = self.ivocab,
                                        word_vector_size = self.word_vector_size,
                                        to_return = "word2vec") for w in q]

            if (self.input_mask_mode == 'word'):
                input_mask = range(len(inp))
            elif (self.input_mask_mode == 'sentence'):
                input_mask = [index for index, w in enumerate(inp) if w == '.']
            else:
                raise Exception("unknown input_mask_mode")
            fact_count = len(input_mask)

            inputs.append(inp_vector)
            questions.append(q_vector)
            # NOTE: here we assume the answer is one word!
            #answers.append(utils.process_word(word = x["A"],
            #                                word2vec = self.word2vec,
            #                                vocab = self.vocab,
            #                                ivocab = self.ivocab,
            #                                word_vector_size = self.word_vector_size,
            #                                to_return = "index"))
            answers.append(x["A"])
            fact_counts.append(fact_count)
            input_masks.append(input_mask)
            img_features.append(x["I"])

        return inputs, questions, answers, fact_counts, input_masks , img_features
Beispiel #8
0
    def _process_input(self, data_raw):
        questions = []
        inputs = []
        answers = []
        fact_counts = []
        input_masks = []

        for x in data_raw:
            inp = x["C"].lower().split(' ')
            inp = [w for w in inp if len(w) > 0]
            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]

            inp_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec") for w in inp
            ]

            q_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec") for w in q
            ]

            if (self.input_mask_mode == 'word'):
                input_mask = range(len(inp))
            elif (self.input_mask_mode == 'sentence'):
                input_mask = [index for index, w in enumerate(inp) if w == '.']
            else:
                raise Exception("unknown input_mask_mode")
            fact_count = len(input_mask)

            inputs.append(inp_vector)
            questions.append(q_vector)
            # NOTE: here we assume the answer is one word!
            answers.append(
                utils.process_word(word=x["A"],
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="index"))
            fact_counts.append(fact_count)
            input_masks.append(input_mask)

        return inputs, questions, answers, fact_counts, input_masks
Beispiel #9
0
    def _process_input(self, data_raw):
        questions = []
        inputs = []
        answers = []
        input_masks = []
        for x in data_raw:
            inp = x["C"].lower().split(' ')
            inp = [w for w in inp if len(w) > 0]
            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]

            inp_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec") for w in inp
            ]

            q_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec") for w in q
            ]

            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))
            answers.append(
                utils.process_word(
                    word=x["A"],  # TODO: add .lower() here 
                    word2vec=self.word2vec,
                    vocab=self.vocab,
                    ivocab=self.ivocab,
                    word_vector_size=self.word_vector_size,
                    to_return="index"))
            # NOTE: here we assume the answer is one word!
            if self.input_mask_mode == 'word':
                input_masks.append(
                    np.array([index for index, w in enumerate(inp)],
                             dtype=np.int32))
            elif self.input_mask_mode == 'sentence':
                input_masks.append(
                    np.array(
                        [index for index, w in enumerate(inp) if w == '.'],
                        dtype=np.int32))
            else:
                raise Exception("invalid input_mask_mode")

        return inputs, questions, answers, input_masks
Beispiel #10
0
    def _process_input(self, data_raw):
        '''
            This module processes the raw data input and grabs all the relevant sections and calculates the input_mask.

        Args:
            data_raw: raw data coming in from main class.
        Returns:
            inputs section, answers section, questions section, and input_masks as numpy arrays.
        '''
        inputs = []
        answers = []
        input_masks = []
        questions = []
        for x in data_raw:
            inp = x["C"].lower().split(' ')
            inp = [w for w in inp if len(w) > 0]
            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]

            # Process the words from the input, answers, and questions to see what needs a new vector in word2vec.
            inp_vector = [utils.process_word(word = w,
                                        word2vec = self.word2vec, 
                                        vocab = self.vocab, 
                                        ivocab = self.ivocab, 
                                        word_vector_size = self.word_vector_size, 
                                        to_return = "word2vec", silent=True) for w in inp]
            
            q_vector = [utils.process_word(word = w,
		    			word2vec = self.word2vec,
					vocab = self.vocab,
					ivocab = self.ivocab,
					word_vector_size = self.word_vector_size,
					to_return = "word2vec", silent=True) for w in q]
            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))
            answers.append(utils.process_word(word = x["A"],
                                            word2vec = self.word2vec, 
                                            vocab = self.vocab, 
                                            ivocab = self.ivocab, 
                                            word_vector_size = self.word_vector_size, 
                                            to_return = "index"))

            # NOTE: here we assume the answer is one word!
            if self.input_mask_mode == 'word':
                input_masks.append(np.array([index for index, w in enumerate(inp)], dtype=np.int32)) # Get the input_masks for the data
            elif self.input_mask_mode == 'sentence':
                input_masks.append(np.array([index for index, w in enumerate(inp) if w == '.'], dtype=np.int32))
            else:
                raise Exception("invalid input_mask_mode")
        
        return inputs, questions, answers, input_masks
Beispiel #11
0
    def _process_input(self, data_raw):
    '''
    	This module processes the raw data input and grabs all the relevant sections and calculates the input_mask.

	Args:
		data_raw: raw data coming in from main class.
	Returns:
		inputs section, answers section, questions section, and input_masks as numpy arrays.
    '''
	inputs = []
        answers = []
        input_masks = []
	questions = []
        for x in data_raw:
            inp = x["C"].lower().split(' ') 
            inp = [w for w in inp if len(w) > 0]
            q = x["Q"].lower().split(' ')
	    q = [w for w in q if len(w) > 0]

            # Process the words from the input, answers, and questions to see what needs a new vector in word2vec.
	    inp_vector = [utils.process_word(word = w, 
                                        word2vec = self.word2vec, 
                                        vocab = self.vocab, 
                                        ivocab = self.ivocab, 
                                        word_vector_size = self.word_vector_size, 
                                        to_return = "word2vec") for w in inp]
            
	    q_vector = [utils.process_word(word = w,
		    			word2vec = self.word2vec,
					vocab = self.vocab,
					ivocab = self.ivocab,
					word_vector_size = self.word_vector_size,
					to_return = "word2vec") for w in q]
            inputs.append(np.vstack(inp_vector).astype(floatX))
	    questions.append(np.vstack(q_vector).astype(floatX))
            answers.append(utils.process_word(word = x["A"], 
                                            word2vec = self.word2vec, 
                                            vocab = self.vocab, 
                                            ivocab = self.ivocab, 
                                            word_vector_size = self.word_vector_size, 
                                            to_return = "index"))
            
	    # NOTE: here we assume the answer is one word! 
            if self.input_mask_mode == 'word':
                input_masks.append(np.array([index for index, w in enumerate(inp)], dtype=np.int32)) # Get the input_masks for the data
            elif self.input_mask_mode == 'sentence': 
                input_masks.append(np.array([index for index, w in enumerate(inp) if w == '.'], dtype=np.int32)) 
            else:
                raise Exception("invalid input_mask_mode")
        
        return inputs, questions, answers, input_masks
Beispiel #12
0
 def _process_input(self, data_raw):
     questions = []
     inputs = []
     answers = []
     gates = []
     input_masks = []
     for x in data_raw:
         inp = x["C"].lower().split(' ') 
         inp = [w for w in inp if len(w) > 0]
         q = x["Q"].lower().split(' ')
         q = [w for w in q if len(w) > 0]
         
         inp_vector = [utils.process_word(word = w, 
                                     word2vec = self.word2vec, 
                                     vocab = self.vocab, 
                                     ivocab = self.ivocab, 
                                     word_vector_size = self.word_vector_size, 
                                     to_return = "word2vec") for w in inp]
                                     
         q_vector = [utils.process_word(word = w, 
                                     word2vec = self.word2vec, 
                                     vocab = self.vocab, 
                                     ivocab = self.ivocab, 
                                     word_vector_size = self.word_vector_size, 
                                     to_return = "word2vec") for w in q]
         
         inputs.append(np.vstack(inp_vector).astype(floatX)) #(seq_len, vocab)
         questions.append(np.vstack(q_vector).astype(floatX))
         answers.append(utils.process_word(word = x["A"], # TODO: add .lower() here 
                                         word2vec = self.word2vec, 
                                         vocab = self.vocab, 
                                         ivocab = self.ivocab, 
                                         word_vector_size = self.word_vector_size, 
                                         to_return = "index"))
         # NOTE: here we assume the answer is one word! 
         if self.input_mask_mode == 'word':
             mask = [index for index, w in enumerate(inp)]
         elif self.input_mask_mode == 'sentence': 
             mask = [index for index, w in enumerate(inp) if w == '.']
         else:
             raise Exception("invalid input_mask_mode")            
         mask.append(mask[-1]+1)
         input_masks.append(np.array(mask, dtype=np.int32))
         
         gate =[w for w in x["S"]]
         for i in xrange(len(gate),self.memory_hops):
             gate.append(len(mask)-1) 
         gates.append(np.array(gate,dtype=np.int32))
         
     return inputs, questions, answers, input_masks, gates
    def _process_input(self, data_raw):
        questions = []
        inputs = []
        answers = []
        input_masks = []
        for x in data_raw:
            inp = x["C"].lower().split(' ')
            inp = [w for w in inp if len(w) > 0]
            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]

            inp_vector = [utils.process_word(word = w,
                                        word2vec = self.word2vec,
                                        vocab = self.vocab,
                                        ivocab = self.ivocab,
                                        word_vector_size = self.word_vector_size,
                                        to_return = "word2vec") for w in inp]

            q_vector = [utils.process_word(word = w,
                                        word2vec = self.word2vec,
                                        vocab = self.vocab,
                                        ivocab = self.ivocab,
                                        word_vector_size = self.word_vector_size,
                                        to_return = "word2vec") for w in q]

            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))
            answers.append(utils.process_word(word = x["A"],
                                            word2vec = self.word2vec,
                                            vocab = self.vocab,
                                            ivocab = self.ivocab,
                                            word_vector_size = self.word_vector_size,
                                            to_return = "index"))
#            print("Context\n ---------------------------------")
#            print( x["C"] )
#            print("Question \n -------------------------------")
#            print( x["Q"] )
#            print("Answer \n----------------------------------")
#            print( x["A"] )
#            wait=raw_input("Press Enter To Continue")
            # NOTE: here we assume the answer is one word!
            if self.input_mask_mode == 'word':
                input_masks.append(np.array([index for index, w in enumerate(inp)], dtype=np.int32))
            elif self.input_mask_mode == 'sentence':
                input_masks.append(np.array([index for index, w in enumerate(inp) if w == '.'], dtype=np.int32))
            else:
                raise Exception("invalid input_mask_mode")

        return inputs, questions, answers, input_masks
Beispiel #14
0
def process_input(data1):
    """Convert scored sentence pairs into vectors and 5-bin distributions.

    Relies on module-level ``repl``, ``utils``, ``word2vec``, ``vocab``,
    ``ivocab`` and ``word_vector_size``.

    Args:
        data1: indexable sequence of rows ``(sentence1, sentence2, score)``.
            Row 0 is skipped (presumably a header line -- confirm against
            the loader).

    Returns:
        Tuple ``(sents1, sents2, sim)`` of ``np.asarray`` results: stacked
        word vectors for each sentence and, per row, a length-5 array that
        spreads the score over its two neighbouring bins.
    """
    tiny = 0.000001
    sents1, sents2, sim = [], [], []

    for idx in range(1, len(data1)):
        row = data1[idx]
        first_text = row[0]
        second_text = row[1]
        score = float(row[2])

        # strip -> repl -> lowercase -> split on single spaces, no empties
        first_tokens = [tok for tok in
                        repl(first_text.strip()).lower().split(' ') if tok]
        first_rows = [
            utils.process_word(word=tok,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True) for tok in first_tokens
        ]
        second_tokens = [tok for tok in
                         repl(second_text.strip()).lower().split(' ') if tok]
        second_rows = [
            utils.process_word(word=tok,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True) for tok in second_tokens
        ]
        sents1.append(np.vstack(first_rows))
        sents2.append(np.vstack(second_rows))

        # Every bin starts at a tiny value rather than zero.
        dist = np.array([tiny] * 5)
        if score == 5.0:
            dist[4] = 1.0 - 4 * tiny
        else:
            base = np.floor(score)
            upper_bin = int(base)
            # The fractional part goes to the bin at index floor(score),
            # the remainder to the bin just below it.
            dist[upper_bin] = score - base + tiny
            dist[upper_bin - 1] = 1 + base - score + tiny
        sim.append(dist)

    return np.asarray(sents1), np.asarray(sents2), np.asarray(sim)
Beispiel #15
0
def process_input_3(data1, data2):
    """Build every (question, answer) pairing as stacked word vectors.

    Relies on module-level ``repl``, ``utils``, ``word2vec``, ``vocab``,
    ``ivocab`` and ``word_vector_size``. Prints a completion message to
    stdout.

    Args:
        data1: indexable sequence of rows; ``row[0]`` is the question text.
        data2: indexable sequence of rows; ``row[0]`` is the answer text.

    Returns:
        Tuple ``(question, answer)`` of ``np.asarray`` results holding
        ``len(data1) * len(data2)`` entries each, in question-major order.
    """
    question = []
    answer = []

    for i in range(0, len(data1)):
        lines = data1[i]
        q = lines[0]
        q = q.strip()
        q = repl(q)
        q = q.lower().split(' ')
        q = [w for w in q if len(w) > 0]
        q_vector = [
            utils.process_word(word=w,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True) for w in q
        ]
        # Answers are re-processed for every question, keeping the order
        # of process_word calls unchanged.
        for j in range(0, len(data2)):
            a = data2[j][0]
            a = a.strip()
            a = repl(a)
            a = a.lower().split(' ')
            a = [w for w in a if len(w) > 0]
            a_vector = [
                utils.process_word(word=w,
                                   word2vec=word2vec,
                                   vocab=vocab,
                                   ivocab=ivocab,
                                   word_vector_size=word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for w in a
            ]
            question.append(np.vstack(q_vector))
            answer.append(np.vstack(a_vector))

    question = np.asarray(question)
    answer = np.asarray(answer)
    # Parenthesised single-argument form prints the same text on Python 2
    # and is valid syntax on Python 3.
    print("processing data done ! ********************************")
    return question, answer
Beispiel #16
0
def process_input(data1):
    """Convert scored sentence pairs into vectors and 5-bin distributions.

    Relies on module-level ``repl``, ``utils``, ``word2vec``, ``vocab``,
    ``ivocab`` and ``word_vector_size``.

    Args:
        data1: indexable sequence of rows ``(sentence1, sentence2, score)``.
            Row 0 is skipped (presumably a header line -- confirm against
            the loader).

    Returns:
        Tuple ``(sents1, sents2, sim)`` of ``np.asarray`` results: stacked
        word vectors for each sentence and a length-5 array per row.
    """
    def vectorise(text):
        # strip -> repl -> lowercase -> split on single spaces, no empties
        tokens = [tok for tok in repl(text.strip()).lower().split(' ') if tok]
        return [utils.process_word(word=tok,
                                   word2vec=word2vec,
                                   vocab=vocab,
                                   ivocab=ivocab,
                                   word_vector_size=word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for tok in tokens]

    floor_value = 0.000001
    sents1 = []
    sents2 = []
    sim = []
    for row_no in range(1, len(data1)):
        record = data1[row_no]
        text_a = record[0]
        text_b = record[1]
        score = float(record[2])

        rows_a = vectorise(text_a)
        rows_b = vectorise(text_b)
        sents1.append(np.vstack(rows_a))
        sents2.append(np.vstack(rows_b))

        # Every bin starts at a tiny value rather than zero.
        bins = np.array([floor_value for _ in range(5)])
        if score == 5.0:
            bins[4] = 1.0 - 4 * floor_value
        else:
            whole = np.floor(score)
            position = int(whole)
            # The fractional part goes to index floor(score), the rest to
            # the index just below it.
            bins[position] = score - whole + floor_value
            bins[position - 1] = 1 + whole - score + floor_value
        sim.append(bins)

    sents1 = np.asarray(sents1)
    sents2 = np.asarray(sents2)
    sim = np.asarray(sim)
    return sents1, sents2, sim
Beispiel #17
0
def process_input_3(data1, data2):
    """Pair every question in ``data1`` with every answer in ``data2``.

    The question text is ``row[0]`` of each ``data1`` row and the answer text
    is ``row[0]`` of each ``data2`` row.  Text is stripped, passed through
    ``repl``, lower-cased and split on single spaces (empty tokens dropped);
    each token is looked up via ``utils.process_word`` using the module-level
    ``word2vec``, ``vocab``, ``ivocab`` and ``word_vector_size``.

    The output is ordered question-major: all answers for the first question,
    then all answers for the second, and so on.  Answer vectors are looked up
    again for every question, exactly as before, so the sequence of
    ``utils.process_word`` calls is unchanged.

    Returns:
        ``(question, answer)`` -- ``np.asarray`` of the stacked question
        matrices and of the stacked answer matrices, aligned index by index.
    """
    def vectorize(text):
        # Normalise the raw string, then fetch one vector per token.
        tokens = [w for w in repl(text.strip()).lower().split(' ')
                  if len(w) > 0]
        return [utils.process_word(word=w,
                                   word2vec=word2vec,
                                   vocab=vocab,
                                   ivocab=ivocab,
                                   word_vector_size=word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for w in tokens]

    question = []
    answer = []

    for q_row in data1:
        q_vector = vectorize(q_row[0])
        for a_row in data2:
            a_vector = vectorize(a_row[0])
            # A fresh stacked copy of the question per pair, as before.
            question.append(np.vstack(q_vector))
            answer.append(np.vstack(a_vector))

    question = np.asarray(question)
    answer = np.asarray(answer)
    # Parenthesised single-argument print works on both Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print("processing data done ! ********************************")
    return question, answer
Beispiel #18
0
def process_input(data1):
    """Build word-vector matrices and rescaled scores for sentence pairs.

    Row 0 of ``data1`` is skipped (presumably a header -- confirm against the
    caller).  Every later row is read as ``(sentence_1, sentence_2, score)``.
    Each sentence is stripped, passed through ``repl``, lower-cased and split
    on single spaces with empty tokens dropped; each token is looked up via
    ``utils.process_word`` using the module-level ``word2vec``, ``vocab``,
    ``ivocab`` and ``word_vector_size``.

    The score is mapped to ``(score - 1) / 4.0``, so 1 becomes 0.0 and 5
    becomes 1.0.

    Returns:
        ``(sents1, sents2, sim)`` -- ``np.asarray`` of the per-row sentence
        matrices for the first and second sentence, and of the rescaled scores.
    """
    def vectorize(sentence):
        # Normalise the raw string, then fetch one vector per token.
        cleaned = repl(sentence.strip())
        tokens = [tok for tok in cleaned.lower().split(' ') if tok]
        return [utils.process_word(word=tok,
                                   word2vec=word2vec,
                                   vocab=vocab,
                                   ivocab=ivocab,
                                   word_vector_size=word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for tok in tokens]

    left, right, scaled = [], [], []

    for idx in range(1, len(data1)):
        record = data1[idx]
        raw_left = record[0]
        raw_right = record[1]
        score = float(record[2])

        # Look up both sentences before stacking either of them.
        left_vecs = vectorize(raw_left)
        right_vecs = vectorize(raw_right)
        left.append(np.vstack(left_vecs))
        right.append(np.vstack(right_vecs))

        scaled.append((score - 1) / 4.0)

    return np.asarray(left), np.asarray(right), np.asarray(scaled)
Beispiel #19
0
def process_input(data1, data2, num_questions=167, pool_size=260,
                  num_negatives=5):
    """Build positive and randomly sampled negative question/answer pairs.

    ``data1`` and ``data2`` are zipped and shuffled together, so after the
    shuffle row ``i`` of one still lines up with row ``i`` of the other.  For
    each of the first ``num_questions`` rows, the question ``data1[i][0]`` is
    emitted once with its own answer ``data1[i][1]`` (label 1), followed by
    ``num_negatives`` pairs with answers ``data2[k][0]`` (label 0).

    The indices ``k`` are drawn without replacement from ``range(pool_size)``
    with ``i`` itself excluded.  The earlier code resampled once when it drew
    ``i`` and could still end up using row ``i`` as a "negative"; excluding it
    up front removes that case.

    Text is stripped, passed through ``repl``, lower-cased and split on single
    spaces (empty tokens dropped); each token is looked up via
    ``utils.process_word`` using the module-level ``word2vec``, ``vocab``,
    ``ivocab`` and ``word_vector_size``.

    Args:
        data1: rows whose first two fields are question text and answer text.
        data2: rows whose first field is a candidate answer text.
        num_questions: how many shuffled rows to turn into examples.
        pool_size: negatives are drawn from the first ``pool_size`` rows of
            the shuffled ``data2``.
        num_negatives: negative pairs generated per question.

    Returns:
        ``(question, answer, truth)`` -- aligned ``np.asarray`` results of the
        question matrices, answer matrices and 0/1 labels.

    Raises:
        IndexError: if the shuffled data is shorter than ``num_questions`` or
            ``pool_size`` requires.
        ValueError: from ``random.sample`` if fewer than ``num_negatives``
            candidate rows are available.
    """
    def vectorize(text):
        # Normalise the raw string, then fetch one vector per token.
        tokens = [w for w in repl(text.strip()).lower().split(' ')
                  if len(w) > 0]
        return [utils.process_word(word=w,
                                   word2vec=word2vec,
                                   vocab=vocab,
                                   ivocab=ivocab,
                                   word_vector_size=word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for w in tokens]

    question = []
    answer = []
    truth = []

    # list() is required on Python 3, where zip() is lazy and cannot be
    # shuffled in place; on Python 2 it is a harmless copy.
    combined = list(zip(data1, data2))
    random.shuffle(combined)
    if combined:
        data1, data2 = zip(*combined)
    else:
        # zip(*[]) yields nothing to unpack, so handle empty input directly.
        data1, data2 = (), ()

    for i in range(num_questions):
        lines = data1[i]
        candidates = [k for k in range(pool_size) if k != i]
        negatives = random.sample(candidates, num_negatives)

        truth.append(1)
        q_vector = vectorize(lines[0])
        question.append(np.vstack(q_vector))
        a_vector = vectorize(lines[1])
        answer.append(np.vstack(a_vector))

        for k in negatives:
            neg_vector = vectorize(data2[k][0])
            # A fresh stacked copy of the question per pair, as before.
            question.append(np.vstack(q_vector))
            answer.append(np.vstack(neg_vector))
            truth.append(0)

    question = np.asarray(question)
    answer = np.asarray(answer)
    truth = np.asarray(truth)
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("processing data done ! ********************************")
    return question, answer, truth
Beispiel #20
0
def process_input_2(data, num_pos=700, num_neg=2100):
    """Build a shuffled sample of labelled question/answer pairs.

    Row 0 of ``data`` is skipped (presumably a header -- confirm against the
    caller).  For every later row, field 1 is the question text, field 5 the
    answer text and field 6 the label string.  Rows labelled ``"1"`` go to the
    positive set and rows labelled ``"0"`` to the negative set; rows with any
    other label are ignored.

    Text is stripped, passed through ``repl``, lower-cased and split on single
    spaces (empty tokens dropped); each token is looked up via
    ``utils.process_word`` using the module-level ``word2vec``, ``vocab``,
    ``ivocab`` and ``word_vector_size``.

    Each set is shuffled with the module-level ``shuffle`` helper, the first
    ``num_pos`` positives and ``num_neg`` negatives are concatenated, and the
    combined sample is shuffled again.

    Args:
        data: rows indexable at positions 1, 5 and 6.
        num_pos: number of positive pairs to keep (default 700).
        num_neg: number of negative pairs to keep (default 2100).

    Returns:
        ``(questions, answers, truth)`` -- aligned ``np.asarray`` results of
        the question matrices, answer matrices and 0/1 labels.

    Raises:
        IndexError: if fewer than ``num_pos`` positive or ``num_neg`` negative
            rows are available.
    """
    def to_matrix(text):
        # Normalise the raw string, fetch one vector per token, stack them.
        tokens = [w for w in repl(text.strip()).lower().split(' ')
                  if len(w) > 0]
        return np.vstack([utils.process_word(word=w,
                                             word2vec=word2vec,
                                             vocab=vocab,
                                             ivocab=ivocab,
                                             word_vector_size=word_vector_size,
                                             to_return="word2vec",
                                             silent=True) for w in tokens])

    questions1, answers1, truth1 = [], [], []
    questions0, answers0, truth0 = [], [], []

    for i in range(1, len(data)):
        lines = data[i]
        q = lines[1]
        a = lines[5]
        t = lines[6]
        if t == "0":
            bucket_q, bucket_a, bucket_t, label = (questions0, answers0,
                                                   truth0, 0)
        elif t == "1":
            bucket_q, bucket_a, bucket_t, label = (questions1, answers1,
                                                   truth1, 1)
        else:
            # Unlabelled or unexpected rows contribute nothing.
            continue
        bucket_t.append(label)
        bucket_q.append(to_matrix(q))
        bucket_a.append(to_matrix(a))

    questions1 = np.asarray(questions1)
    answers1 = np.asarray(answers1)
    truth1 = np.asarray(truth1)
    questions1, answers1, truth1 = shuffle(questions1, answers1, truth1)
    questions0 = np.asarray(questions0)
    answers0 = np.asarray(answers0)
    truth0 = np.asarray(truth0)
    questions0, answers0, truth0 = shuffle(questions0, answers0, truth0)

    # Explicit indexing keeps the IndexError when a class is under-populated.
    questions = ([questions1[i] for i in range(num_pos)] +
                 [questions0[i] for i in range(num_neg)])
    answers = ([answers1[i] for i in range(num_pos)] +
               [answers0[i] for i in range(num_neg)])
    truth = ([truth1[i] for i in range(num_pos)] +
             [truth0[i] for i in range(num_neg)])

    # NOTE(review): the second shuffle looks redundant; it is kept so that the
    # number of shuffle calls (and any seeded random stream) is unchanged.
    questions, answers, truth = shuffle(questions, answers, truth)
    questions, answers, truth = shuffle(questions, answers, truth)
    questions = np.asarray(questions)
    answers = np.asarray(answers)
    truth = np.asarray(truth)
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("processing data done ! ********************************")
    return questions, answers, truth
    def _process_input(self, data_raw):
        """Turn raw examples into padded word-vector arrays plus fact masks.

        Each item of ``data_raw`` is indexed with the keys ``"C"`` (input
        text), ``"Q"`` (question text) and ``"A"`` (answer, assumed to be a
        single word).  Text is lower-cased and split on single spaces with
        empty tokens dropped.  ``data_raw`` is traversed twice, so it must be
        re-iterable.

        A first pass finds the longest input, the longest question and the
        largest fact count, and stores the latter in ``self.max_fact_count``.
        A second pass looks up every token with ``utils.process_word``, pads
        inputs and questions with ``self._empty_word_vector()`` up to those
        maxima, and pads each mask with ``-1`` up to ``self.max_fact_count``.

        With ``self.input_mask_mode == 'word'`` every token position is a
        fact; with ``'sentence'`` only the positions of ``'.'`` tokens are.

        Returns:
            ``(inputs, questions, answers, fact_counts, input_masks)`` as
            numpy arrays; the first two are cast to ``floatX``, the rest to
            ``np.int32``.

        Raises:
            Exception: if ``self.input_mask_mode`` is neither ``'word'`` nor
                ``'sentence'`` (only detected when ``data_raw`` is non-empty).
        """
        def tokenize(text):
            # Lower-case, split on single spaces and drop empty tokens.
            return [w for w in text.lower().split(' ') if len(w) > 0]

        def embed(word):
            return utils.process_word(word=word,
                                      word2vec=self.word2vec,
                                      vocab=self.vocab,
                                      ivocab=self.ivocab,
                                      word_vector_size=self.word_vector_size,
                                      to_return="word2vec")

        def build_mask(tokens):
            # Positions of the tokens that count as facts in the current mode.
            if self.input_mask_mode == 'word':
                # list() keeps the mask appendable on Python 3, where range()
                # is not a list and has no append().
                return list(range(len(tokens)))
            if self.input_mask_mode == 'sentence':
                return [index for index, w in enumerate(tokens) if w == '.']
            raise Exception("unknown input_mask_mode")

        max_inp_len = 0
        max_q_len = 0
        self.max_fact_count = 0
        for x in data_raw:
            inp = tokenize(x["C"])
            q = tokenize(x["Q"])
            fact_count = len(build_mask(inp))

            max_inp_len = max(max_inp_len, len(inp))
            max_q_len = max(max_q_len, len(q))
            self.max_fact_count = max(self.max_fact_count, fact_count)

        questions = []
        inputs = []
        answers = []
        fact_counts = []
        input_masks = []

        for x in data_raw:
            inp = tokenize(x["C"])
            q = tokenize(x["Q"])

            inp_vector = [embed(w) for w in inp]
            while len(inp_vector) < max_inp_len:
                inp_vector.append(self._empty_word_vector())

            q_vector = [embed(w) for w in q]
            while len(q_vector) < max_q_len:
                q_vector.append(self._empty_word_vector())

            input_mask = build_mask(inp)
            # The fact count is taken before the -1 padding is added.
            fact_count = len(input_mask)
            while len(input_mask) < self.max_fact_count:
                input_mask.append(-1)

            inputs.append(inp_vector)
            questions.append(q_vector)
            # NOTE: here we assume the answer is one word!
            answers.append(
                utils.process_word(word=x["A"],
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="index"))
            fact_counts.append(fact_count)
            input_masks.append(input_mask)

        inputs = np.array(inputs).astype(floatX)
        questions = np.array(questions).astype(floatX)
        answers = np.array(answers).astype(np.int32)
        fact_counts = np.array(fact_counts).astype(np.int32)
        input_masks = np.array(input_masks).astype(np.int32)

        return inputs, questions, answers, fact_counts, input_masks
    def _process_input(self, data_raw):
        """Turn raw examples into padded word-index arrays, one row per sentence.

        Each item of ``data_raw`` is indexed with the keys ``"C"`` (input
        text), ``"Q"`` (question text) and ``"A"`` (answer, assumed to be a
        single word).  ``data_raw`` is traversed twice, so it must be
        re-iterable.

        Side effects:
            * ``x["C"]`` is replaced in place by the list of sentences the
              nltk punkt tokenizer produces for it, so the same ``data_raw``
              cannot be passed through this method a second time.
            * ``self.max_fact_count``, ``self.max_inp_sent_len`` and
              ``self.max_q_len`` are set.

        Sentences and the question are lower-cased and split on single spaces
        (empty tokens dropped); every token is mapped to its index with
        ``utils.process_word``.  Each sentence is padded with ``0`` to the
        longest sentence, each input is padded with all-zero sentences to the
        largest sentence count, and each question is padded with ``0`` to the
        longest question.

        Returns:
            ``(inputs, questions, answers, input_masks)`` -- ``inputs`` and
            ``questions`` are numpy arrays cast to ``floatX``; ``answers`` is
            a list of word indices; ``input_masks`` is a list of
            ``np.float32`` arrays.

        Raises:
            Exception: if ``self.input_mask_mode`` is neither ``'word'`` nor
                ``'sentence'`` (only detected when ``data_raw`` is non-empty).
        """
        def tokenize(text):
            # Lower-case, split on single spaces and drop empty tokens.
            return [w for w in text.lower().split(' ') if len(w) > 0]

        def to_index(word):
            return utils.process_word(word=word,
                                      word2vec=self.word2vec,
                                      vocab=self.vocab,
                                      ivocab=self.ivocab,
                                      word_vector_size=self.word_vector_size,
                                      to_return="index")

        # Integer counters: a float zero here would break the list
        # multiplication used for padding and leak float lengths onto self.
        max_inp_sent_len = 0
        max_inp_num_sents = 0
        max_q_len = 0
        self.max_fact_count = 0

        # Loaded once, on first use, instead of once per example; empty input
        # still never touches the tokenizer resource.
        sent_detector = None
        for x in data_raw:
            if sent_detector is None:
                sent_detector = nltk.data.load(
                    'tokenizers/punkt/english.pickle')
            # This splits the input text into sentences (in place).
            x["C"] = sent_detector.tokenize(x["C"])

            inp = [tokenize(sentence) for sentence in x["C"]]
            for sentence in inp:
                max_inp_sent_len = max(max_inp_sent_len, len(sentence))

            q = tokenize(x["Q"])

            if self.input_mask_mode == 'word':
                fact_count = len(inp)
            elif self.input_mask_mode == 'sentence':
                # NOTE(review): every element of inp is a token list here, so
                # the comparison with '.' is never true and the count is 0 in
                # this mode. Behaviour kept as-is; confirm the intent.
                fact_count = len([0 for w in inp if w == '.'])
            else:
                raise Exception("unknown input_mask_mode")

            max_inp_num_sents = max(max_inp_num_sents, len(inp))
            max_q_len = max(max_q_len, len(q))
            self.max_fact_count = max(self.max_fact_count, fact_count)

        questions = []
        inputs = []
        answers = []
        input_masks = []
        for x in data_raw:
            # x["C"] already holds the sentence list from the first pass.
            inp = [tokenize(sentence) for sentence in x["C"]]
            q = tokenize(x["Q"])

            inp_vector = []
            for sentence in inp:
                inp_i = [to_index(w) for w in sentence]
                # Pad each sentence to the longest sentence seen.
                while len(inp_i) < max_inp_sent_len:
                    inp_i.append(0)
                inp_vector.append(inp_i)

            # Pad each input to the largest sentence count seen.
            while len(inp_vector) < max_inp_num_sents:
                inp_vector.append([0] * max_inp_sent_len)

            q_vector = [to_index(w) for w in q]
            while len(q_vector) < max_q_len:
                q_vector.append(0)

            inputs.append(inp_vector)
            questions.append(q_vector)

            # NOTE: here we assume the answer is one word!
            answers.append(to_index(x["A"]))

            if self.input_mask_mode == 'word':
                input_masks.append(
                    np.array([index for index, w in enumerate(inp)],
                             dtype=np.float32))
            elif self.input_mask_mode == 'sentence':
                # Same never-matching comparison as in the first pass; the
                # resulting mask is empty in this mode.
                input_masks.append(
                    np.array(
                        [index for index, w in enumerate(inp) if w == '.'],
                        dtype=np.float32))
            else:
                raise Exception("invalid input_mask_mode")

        inputs = np.array(inputs).astype(floatX)
        questions = np.array(questions).astype(floatX)

        self.max_inp_sent_len = max_inp_sent_len
        self.max_q_len = max_q_len

        return inputs, questions, answers, input_masks
    def _process_input(self, data_raw):
        """Turn raw examples into padded word-vector arrays plus fact masks.

        Each item of ``data_raw`` is indexed with the keys ``"C"`` (input
        text), ``"Q"`` (question text) and ``"A"`` (answer, assumed to be a
        single word).  Text is lower-cased and split on single spaces with
        empty tokens dropped.  ``data_raw`` is traversed twice, so it must be
        re-iterable.

        A first pass finds the longest input, the longest question and the
        largest fact count, and stores the latter in ``self.max_fact_count``.
        A second pass looks up every token with ``utils.process_word``, pads
        inputs and questions with ``self._empty_word_vector()`` up to those
        maxima, and pads each mask with ``-1`` up to ``self.max_fact_count``.

        With ``self.input_mask_mode == 'word'`` every token position is a
        fact; with ``'sentence'`` only the positions of ``'.'`` tokens are.

        Returns:
            ``(inputs, questions, answers, fact_counts, input_masks)`` as
            numpy arrays; the first two are cast to ``floatX``, the rest to
            ``np.int32``.

        Raises:
            Exception: if ``self.input_mask_mode`` is neither ``'word'`` nor
                ``'sentence'`` (only detected when ``data_raw`` is non-empty).
        """
        def tokenize(text):
            # Lower-case, split on single spaces and drop empty tokens.
            return [w for w in text.lower().split(' ') if len(w) > 0]

        def embed(word):
            return utils.process_word(word=word,
                                      word2vec=self.word2vec,
                                      vocab=self.vocab,
                                      ivocab=self.ivocab,
                                      word_vector_size=self.word_vector_size,
                                      to_return="word2vec")

        def build_mask(tokens):
            # Positions of the tokens that count as facts in the current mode.
            if self.input_mask_mode == 'word':
                # list() keeps the mask appendable on Python 3, where range()
                # is not a list and has no append().
                return list(range(len(tokens)))
            if self.input_mask_mode == 'sentence':
                return [index for index, w in enumerate(tokens) if w == '.']
            raise Exception("unknown input_mask_mode")

        max_inp_len = 0
        max_q_len = 0
        self.max_fact_count = 0
        for x in data_raw:
            inp = tokenize(x["C"])
            q = tokenize(x["Q"])
            fact_count = len(build_mask(inp))

            max_inp_len = max(max_inp_len, len(inp))
            max_q_len = max(max_q_len, len(q))
            self.max_fact_count = max(self.max_fact_count, fact_count)

        questions = []
        inputs = []
        answers = []
        fact_counts = []
        input_masks = []

        for x in data_raw:
            inp = tokenize(x["C"])
            q = tokenize(x["Q"])

            inp_vector = [embed(w) for w in inp]
            while len(inp_vector) < max_inp_len:
                inp_vector.append(self._empty_word_vector())

            q_vector = [embed(w) for w in q]
            while len(q_vector) < max_q_len:
                q_vector.append(self._empty_word_vector())

            input_mask = build_mask(inp)
            # The fact count is taken before the -1 padding is added.
            fact_count = len(input_mask)
            while len(input_mask) < self.max_fact_count:
                input_mask.append(-1)

            inputs.append(inp_vector)
            questions.append(q_vector)
            # NOTE: here we assume the answer is one word!
            answers.append(
                utils.process_word(word=x["A"],
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="index"))
            fact_counts.append(fact_count)
            input_masks.append(input_mask)

        inputs = np.array(inputs).astype(floatX)
        questions = np.array(questions).astype(floatX)
        answers = np.array(answers).astype(np.int32)
        fact_counts = np.array(fact_counts).astype(np.int32)
        input_masks = np.array(input_masks).astype(np.int32)

        return inputs, questions, answers, fact_counts, input_masks
Beispiel #24
0
    def _process_input(self, data_raw):
        inputs = []
        questions = []
        choices = []
        answers = []
        input_masks = []
        for x in data_raw:
            x["C"] += sequence_classifier.get_sequence(x["Q"].lower() + ' ' +
                                                       x["A1"] + ' ' + x["A2"])
            inp = x["C"].lower().split(' ')
            inp = [w for w in inp if len(w) > 0]

            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]

            #pa = self._find_first(inp, 'a>')
            #pb = self._find_first(inp, 'b>')
            #pc = self._find_first(inp, 'c>')
            #pd = self._find_first(inp, 'd>')

            #assert (pa != -1 and pb != -1 #and pc != -1 and pd != -1
            #        and pa < pb )#and pb < pc and pc < pd)

            ca = x["A1"].replace('.', '').lower().split(' ')
            cb = x["A2"].replace('.', '').lower().split(' ')

            ca = [w for w in ca if len(w) > 0]
            cb = [w for w in cb if len(w) > 0]

            inp_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for w in inp
            ]

            q_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for w in q
            ]

            choice_vectors = [
                np.array([
                    utils.process_word(word=w,
                                       word2vec=self.word2vec,
                                       vocab=self.vocab,
                                       ivocab=self.ivocab,
                                       word_vector_size=self.word_vector_size,
                                       to_return="word2vec",
                                       silent=True) for w in choice
                ],
                         dtype=floatX) for choice in [ca, cb]
            ]

            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))
            answers.append(x['A'])
            choices.append(choice_vectors)

            # TODO: here we assume the answer is one word!
            if self.input_mask_mode == 'word':
                input_masks.append(
                    np.array([index for index, w in enumerate(inp)],
                             dtype=np.int32))
            elif self.input_mask_mode == 'sentence':
                input_masks.append(
                    np.array(
                        [index for index, w in enumerate(inp) if w == '.'],
                        dtype=np.int32))
            else:
                raise Exception("invalid input_mask_mode")

        return inputs, questions, answers, choices, input_masks
Beispiel #25
0
def process_input(data1, data2, num_questions=167, pool_size=260,
                  num_negatives=5):
    """Build positive and randomly sampled negative question/answer pairs.

    ``data1`` and ``data2`` are zipped and shuffled together, so after the
    shuffle row ``i`` of one still lines up with row ``i`` of the other.  For
    each of the first ``num_questions`` rows, the question ``data1[i][0]`` is
    emitted once with its own answer ``data1[i][1]`` (label 1), followed by
    ``num_negatives`` pairs with answers ``data2[k][0]`` (label 0).

    The indices ``k`` are drawn without replacement from ``range(pool_size)``
    with ``i`` itself excluded.  The earlier code resampled once when it drew
    ``i`` and could still end up using row ``i`` as a "negative"; excluding it
    up front removes that case.

    Text is stripped, passed through ``repl``, lower-cased and split on single
    spaces (empty tokens dropped); each token is looked up via
    ``utils.process_word`` using the module-level ``word2vec``, ``vocab``,
    ``ivocab`` and ``word_vector_size``.

    Args:
        data1: rows whose first two fields are question text and answer text.
        data2: rows whose first field is a candidate answer text.
        num_questions: how many shuffled rows to turn into examples.
        pool_size: negatives are drawn from the first ``pool_size`` rows of
            the shuffled ``data2``.
        num_negatives: negative pairs generated per question.

    Returns:
        ``(question, answer, truth)`` -- aligned ``np.asarray`` results of the
        question matrices, answer matrices and 0/1 labels.

    Raises:
        IndexError: if the shuffled data is shorter than ``num_questions`` or
            ``pool_size`` requires.
        ValueError: from ``random.sample`` if fewer than ``num_negatives``
            candidate rows are available.
    """
    def vectorize(text):
        # Normalise the raw string, then fetch one vector per token.
        tokens = [w for w in repl(text.strip()).lower().split(' ')
                  if len(w) > 0]
        return [utils.process_word(word=w,
                                   word2vec=word2vec,
                                   vocab=vocab,
                                   ivocab=ivocab,
                                   word_vector_size=word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for w in tokens]

    question = []
    answer = []
    truth = []

    # list() is required on Python 3, where zip() is lazy and cannot be
    # shuffled in place; on Python 2 it is a harmless copy.
    combined = list(zip(data1, data2))
    random.shuffle(combined)
    if combined:
        data1, data2 = zip(*combined)
    else:
        # zip(*[]) yields nothing to unpack, so handle empty input directly.
        data1, data2 = (), ()

    for i in range(num_questions):
        lines = data1[i]
        candidates = [k for k in range(pool_size) if k != i]
        negatives = random.sample(candidates, num_negatives)

        truth.append(1)
        q_vector = vectorize(lines[0])
        question.append(np.vstack(q_vector))
        a_vector = vectorize(lines[1])
        answer.append(np.vstack(a_vector))

        for k in negatives:
            neg_vector = vectorize(data2[k][0])
            # A fresh stacked copy of the question per pair, as before.
            question.append(np.vstack(q_vector))
            answer.append(np.vstack(neg_vector))
            truth.append(0)

    question = np.asarray(question)
    answer = np.asarray(answer)
    truth = np.asarray(truth)
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("processing data done ! ********************************")
    return question, answer, truth
    def _process_input(self, data_raw):
        max_inp_sent_len = 0.
        max_inp_num_sents = 0.
        max_q_len = 0.
        self.max_fact_count = 0.
        for x in data_raw:

            #this splits it into sentences
            sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
            x["C"] = sent_detector.tokenize(x["C"])

            inp = []
            for i in range(len(x["C"])):
                inp.append(x["C"][i].lower().split(' '))
                inp[i] = [w for w in inp[i] if len(w) > 0]
                max_inp_sent_len = max(max_inp_sent_len, len(inp[i]))

            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]

            if (self.input_mask_mode == 'word'):
                fact_count = len(inp)
            elif (self.input_mask_mode == 'sentence'):
                fact_count = len([0 for w in inp if w == '.'])
            else:
                raise Exception("unknown input_mask_mode")

            max_inp_num_sents = max(max_inp_num_sents, len(inp))
            max_q_len = max(max_q_len, len(q))
            self.max_fact_count = max(self.max_fact_count, fact_count)

        questions = []
        inputs = []
        answers = []
        input_masks = []
        for x in data_raw:

            #sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
            #x["C"] = sent_detector.tokenize(x["C"])

            inp = []
            for i in range(len(x["C"])):
                inp.append(x["C"][i].lower().split(' '))
                inp[i] = [w for w in inp[i] if len(w) > 0]
            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]

            inp_vector = []
            for i in range(len(inp)):

                inp_i = [
                    utils.process_word(word=inp[i][w],
                                       word2vec=self.word2vec,
                                       vocab=self.vocab,
                                       ivocab=self.ivocab,
                                       word_vector_size=self.word_vector_size,
                                       to_return="index")
                    for w in range(len(inp[i]))
                ]

                inp_vector.append(inp_i)

                #is this still needed?
                while (len(inp_vector[i]) < max_inp_sent_len):
                    inp_vector[i].append(0)
                #'''
                '''
                #VERSION FOR IF YOU SCRAP SENTENCE ENCODER 
                while(len(inp_vector[i]) < 80):
                    inp_vector[i].append(0)
                #'''

            #'''
            #is this still needed?
            while (len(inp_vector) < max_inp_num_sents):
                inp_vector.append([0] * (max_inp_sent_len))
            #'''
            '''
            #VERSION FOR IF YOU SCRAP SENTENCE ENCODER 
            while (len(inp_vector) < max_inp_num_sents):
                inp_vector.append([0] * (80))
            #'''

            q_vector = [
                utils.process_word(word=q[w],
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="index") for w in range(len(q))
            ]
            '''
            q_vector = [utils.process_word(word = w, 
                                        word2vec = self.word2vec, 
                                        vocab = self.vocab, 
                                        ivocab = self.ivocab, 
                                        word_vector_size = self.word_vector_size, 
                                        to_return = "word2vec") for w in q]
                                        '''

            while (len(q_vector) < max_q_len):
                q_vector.append(0)

            inputs.append(inp_vector)
            questions.append(q_vector)

            answers.append(
                utils.process_word(word=x["A"],
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="index"))

            # NOTE: here we assume the answer is one word!
            if self.input_mask_mode == 'word':
                input_masks.append(
                    np.array([index for index, w in enumerate(inp)],
                             dtype=np.float32))
            elif self.input_mask_mode == 'sentence':
                input_masks.append(
                    np.array(
                        [index for index, w in enumerate(inp) if w == '.'],
                        dtype=np.float32))
            else:
                raise Exception("invalid input_mask_mode")
        '''#THIS TURN ON ONE HOT ENCODER
        #inputs = utils.one_hot_encoding(s, self.sent_vector_size, embedding_size)
        inputs = utils.one_hot_encoding_trip(inputs, vocab_size, max_inp_sent_len)
        questions = utils.one_hot_encoding_doub(questions, vocab_size, max_q_len)   
        #'''

        inputs = np.array(inputs).astype(floatX)
        questions = np.array(questions).astype(floatX)

        self.max_inp_sent_len = max_inp_sent_len
        self.max_q_len = max_q_len

        return inputs, questions, answers, input_masks
    def _process_input(self, data_raw):
        questions = []
        inputs = []
        answers = []
        fact_counts = []
        input_masks = []

        for x in data_raw:
            inp = x["C"].lower().split(" ")
            inp = [w for w in inp if len(w) > 0]
            q = x["Q"].lower().split(" ")
            q = [w for w in q if len(w) > 0]

            inp_vector = [
                utils.process_word(
                    word=w,
                    word2vec=self.word2vec,
                    vocab=self.vocab,
                    ivocab=self.ivocab,
                    word_vector_size=self.word_vector_size,
                    to_return="word2vec",
                )
                for w in inp
            ]

            q_vector = [
                utils.process_word(
                    word=w,
                    word2vec=self.word2vec,
                    vocab=self.vocab,
                    ivocab=self.ivocab,
                    word_vector_size=self.word_vector_size,
                    to_return="word2vec",
                )
                for w in q
            ]

            if self.input_mask_mode == "word":
                input_mask = range(len(inp))
            elif self.input_mask_mode == "sentence":
                input_mask = [index for index, w in enumerate(inp) if w == "."]
            else:
                raise Exception("unknown input_mask_mode")
            fact_count = len(input_mask)

            inputs.append(inp_vector)
            questions.append(q_vector)
            # NOTE: here we assume the answer is one word!
            answers.append(
                utils.process_word(
                    word=x["A"],
                    word2vec=self.word2vec,
                    vocab=self.vocab,
                    ivocab=self.ivocab,
                    word_vector_size=self.word_vector_size,
                    to_return="index",
                )
            )
            fact_counts.append(fact_count)
            input_masks.append(input_mask)

        return inputs, questions, answers, fact_counts, input_masks
    def _process_input(self, data_raw):
        inputs = []
        questions = []
        choices = []
        answers = []
        input_masks = []
        for x in data_raw:
            inp = x["C"].lower().split(' ') 
            inp = [w for w in inp if len(w) > 0]

            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]
            
            pa = self._find_first(inp, 'a>')
            pb = self._find_first(inp, 'b>')
            pc = self._find_first(inp, 'c>')
            pd = self._find_first(inp, 'd>')
            
            assert (pa != -1 and pb != -1 and pc != -1 and pd != -1
                    and pa < pb and pb < pc and pc < pd)
            
            ca = inp[pa+1:pb]
            cb = inp[pb+1:pc]
            cc = inp[pc+1:pd]
            cd = inp[pd+1:]
            ca = ca[:self._find_first(ca, '.')+1]
            cb = cb[:self._find_first(cb, '.')+1]
            cc = cc[:self._find_first(cc, '.')+1]
            cd = cd[:self._find_first(cd, '.')+1]

            inp = inp[:pa]

            inp_vector = [utils.process_word(word = w, 
                                        word2vec = self.word2vec, 
                                        vocab = self.vocab, 
                                        ivocab = self.ivocab, 
                                        word_vector_size = self.word_vector_size, 
                                        to_return = "word2vec",
                                        silent = True) for w in inp]
                                        
            q_vector = [utils.process_word(word = w, 
                                        word2vec = self.word2vec, 
                                        vocab = self.vocab, 
                                        ivocab = self.ivocab, 
                                        word_vector_size = self.word_vector_size, 
                                        to_return = "word2vec",
                                        silent = True) for w in q]

            choice_vectors = [np.array([utils.process_word(word = w, 
                                        word2vec = self.word2vec, 
                                        vocab = self.vocab, 
                                        ivocab = self.ivocab, 
                                        word_vector_size = self.word_vector_size, 
                                        to_return = "word2vec",
                                        silent = True) for w in choice], dtype=floatX)
                                                            for choice in [ca, cb, cc, cd]]
                                        
            
            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))
            answers.append(ord(x['A'][0]) - ord('A'))
            choices.append(choice_vectors)
            
            # TODO: here we assume the answer is one word! 
            if self.input_mask_mode == 'word':
                input_masks.append(np.array([index for index, w in enumerate(inp)], dtype=np.int32)) 
            elif self.input_mask_mode == 'sentence': 
                input_masks.append(np.array([index for index, w in enumerate(inp) if w == '.'], dtype=np.int32)) 
            else:
                raise Exception("invalid input_mask_mode")
        
        return inputs, questions, answers, choices, input_masks
Beispiel #29
0
import os
from annotation import AnnotationDataset
from utils import create_folders, load_vocab, \
get_word_vectors, get_feat_vectors, process_word, process_tag
from wimp import WImpModel
from config import config

# Prepare the model output location named in the config.
create_folders(config.model_out)

# Vocabulary and feature vectors are loaded from the configured paths.
vocab_words = load_vocab(config.words_vocab_path)
feats = get_feat_vectors(config.feats_file)

# Processors shared by all dataset splits and by the interactive shell.
word_processor = process_word(vocab_words)
tag_processor = process_tag()

# The splits are built in the order dev, test, train; each config attribute
# is read only when its dataset is about to be constructed.
dev, test, train = (
    AnnotationDataset(getattr(config, split_attr), vocab_words,
                      word_processor, tag_processor)
    for split_attr in ("dev_data", "test_data", "train_data")
)

model = WImpModel(config, vocab_words, feats)
model.setup()

# Train with dev as the second argument, then evaluate on the test split.
model.train(train, dev)
model.evaluate(test)
model.interactive_shell(word_processor)
    def _process_input(self, data_raw):
        inputs = []
        questions = []
        choices = []
        answers = []
        input_masks = []
        for x in data_raw:
            inp = x["C"].lower().split(' ')
            inp = [w for w in inp if len(w) > 0]

            q = x["Q"].lower().split(' ')
            q = [w for w in q if len(w) > 0]

            pa = self._find_first(inp, 'a>')
            pb = self._find_first(inp, 'b>')
            pc = self._find_first(inp, 'c>')
            pd = self._find_first(inp, 'd>')

            assert (pa != -1 and pb != -1 and pc != -1 and pd != -1 and pa < pb
                    and pb < pc and pc < pd)

            ca = inp[pa + 1:pb]
            cb = inp[pb + 1:pc]
            cc = inp[pc + 1:pd]
            cd = inp[pd + 1:]
            ca = ca[:self._find_first(ca, '.') + 1]
            cb = cb[:self._find_first(cb, '.') + 1]
            cc = cc[:self._find_first(cc, '.') + 1]
            cd = cd[:self._find_first(cd, '.') + 1]

            inp = inp[:pa]

            inp_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for w in inp
            ]

            q_vector = [
                utils.process_word(word=w,
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="word2vec",
                                   silent=True) for w in q
            ]

            choice_vectors = [
                np.array([
                    utils.process_word(word=w,
                                       word2vec=self.word2vec,
                                       vocab=self.vocab,
                                       ivocab=self.ivocab,
                                       word_vector_size=self.word_vector_size,
                                       to_return="word2vec",
                                       silent=True) for w in choice
                ],
                         dtype=floatX) for choice in [ca, cb, cc, cd]
            ]

            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))
            answers.append(ord(x['A'][0]) - ord('A'))
            choices.append(choice_vectors)

            # TODO: here we assume the answer is one word!
            if self.input_mask_mode == 'word':
                input_masks.append(
                    np.array([index for index, w in enumerate(inp)],
                             dtype=np.int32))
            elif self.input_mask_mode == 'sentence':
                input_masks.append(
                    np.array(
                        [index for index, w in enumerate(inp) if w == '.'],
                        dtype=np.int32))
            else:
                raise Exception("invalid input_mask_mode")

        return inputs, questions, answers, choices, input_masks
Beispiel #31
0
def _sentence_to_matrix(text):
    """Clean, tokenize and embed one sentence, stacking its word vectors.

    The text is stripped, passed through ``repl``, lower-cased and split on
    single spaces (empty tokens dropped).  Every token is looked up with
    ``utils.process_word(..., to_return="word2vec", silent=True)`` using the
    module-level ``word2vec`` / ``vocab`` / ``ivocab`` / ``word_vector_size``
    and the results are combined with ``np.vstack``.
    """
    tokens = [w for w in repl(text.strip()).lower().split(' ') if w]
    vectors = [utils.process_word(word=w,
                                  word2vec=word2vec,
                                  vocab=vocab,
                                  ivocab=ivocab,
                                  word_vector_size=word_vector_size,
                                  to_return="word2vec",
                                  silent=True) for w in tokens]
    return np.vstack(vectors)


def process_input_2(data, num_pos=700, num_neg=2100):
    """Build a shuffled, fixed-size question/answer/label sample from rows.

    Args:
        data: indexable sequence of rows.  Row 0 is skipped (presumably a
            header -- confirm against the caller).  Every other row supplies
            the question text at index 1, the answer text at index 5 and the
            label string at index 6.  Rows whose label is neither ``"0"``
            nor ``"1"`` are ignored.
        num_pos: how many label-``"1"`` rows to keep (default 700, the
            previously hard-coded value).
        num_neg: how many label-``"0"`` rows to keep (default 2100, the
            previously hard-coded value).

    Returns:
        ``(questions, answers, truth)`` as ``np.asarray`` results over the
        selected, shuffled samples; ``truth`` holds 1 / 0 labels.

    Raises:
        IndexError: if fewer than ``num_pos`` label-``"1"`` rows or fewer
            than ``num_neg`` label-``"0"`` rows are available (the selection
            indexes past the end), or if a row has fewer than 7 fields.
    """
    # Each bucket holds (question matrices, answer matrices, labels).
    positives = ([], [], [])
    negatives = ([], [], [])

    for i in range(1, len(data)):
        row = data[i]
        question_text, answer_text, label = row[1], row[5], row[6]
        if label == "0":
            bucket, value = negatives, 0
        elif label == "1":
            bucket, value = positives, 1
        else:
            continue
        bucket[2].append(value)
        # Question is embedded before the answer; the lookup helper is given
        # the shared vocab objects, so the call order is kept fixed.
        bucket[0].append(_sentence_to_matrix(question_text))
        bucket[1].append(_sentence_to_matrix(answer_text))

    # NOTE(review): np.asarray on matrices of differing shapes relies on
    # NumPy's ragged object-array fallback, which newer NumPy releases
    # reject -- confirm the NumPy version this runs against.
    # The positive class is shuffled first, then the negative class, so the
    # sequence of shuffle calls is unchanged.
    questions1, answers1, truth1 = shuffle(np.asarray(positives[0]),
                                           np.asarray(positives[1]),
                                           np.asarray(positives[2]))
    questions0, answers0, truth0 = shuffle(np.asarray(negatives[0]),
                                           np.asarray(negatives[1]),
                                           np.asarray(negatives[2]))

    # Explicit indexing (not slicing) so that too few rows still fails
    # loudly instead of silently returning a smaller sample.
    questions = ([questions1[i] for i in range(num_pos)] +
                 [questions0[i] for i in range(num_neg)])
    answers = ([answers1[i] for i in range(num_pos)] +
               [answers0[i] for i in range(num_neg)])
    truth = ([truth1[i] for i in range(num_pos)] +
             [truth0[i] for i in range(num_neg)])

    # Two consecutive shuffles, kept as in the original so the number of
    # shuffle calls made stays the same.
    questions, answers, truth = shuffle(questions, answers, truth)
    questions, answers, truth = shuffle(questions, answers, truth)

    questions = np.asarray(questions)
    answers = np.asarray(answers)
    truth = np.asarray(truth)
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("processing data done ! ********************************")
    return questions, answers, truth