def _process_input(data_raw, word2vec, vocab, ivocab, word_vector_size): sent_len = [] gate_len = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] inp_vector = [utils.process_word(word = w.lower(), word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "word2vec") for w in inp] sent_len.append(len(inp_vector)) q_vector = [utils.process_word(word = w.lower(), word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "word2vec") for w in q] utils.process_word(word = x["A"].lower(), # TODO: add .lower() here word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "index") gate_len.append(len(x["S"])) return sent_len, gate_len
def _process_input(data_raw, word2vec, vocab, ivocab, word_vector_size): sent_len = [] gate_len = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] inp_vector = [ utils.process_word(word=w.lower(), word2vec=word2vec, vocab=vocab, ivocab=ivocab, word_vector_size=word_vector_size, to_return="word2vec") for w in inp ] sent_len.append(len(inp_vector)) q_vector = [ utils.process_word(word=w.lower(), word2vec=word2vec, vocab=vocab, ivocab=ivocab, word_vector_size=word_vector_size, to_return="word2vec") for w in q ] utils.process_word( word=x["A"].lower(), # TODO: add .lower() here word2vec=word2vec, vocab=vocab, ivocab=ivocab, word_vector_size=word_vector_size, to_return="index") gate_len.append(len(x["S"])) return sent_len, gate_len
def process_input(data1):
    """Vectorise the first two columns of every row in ``data1``.

    Column 0 and column 1 are stripped, passed through ``repl``,
    lower-cased, split on single spaces and looked up word by word with
    ``utils.process_word`` (module-level ``word2vec``, ``vocab``,
    ``ivocab`` and ``word_vector_size`` are used).

    Args:
        data1: indexable sequence of rows; ``row[0]`` and ``row[1]`` are
            strings.

    Returns:
        ``(q, a, sent)`` as ``np.asarray`` results: stacked word vectors
        of column 0, stacked word vectors of column 1, and the untouched
        column-1 strings.
    """
    def embed(text):
        cleaned = repl(text.strip())
        tokens = [tok for tok in cleaned.lower().split(' ') if tok]
        return [
            utils.process_word(word=tok,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True)
            for tok in tokens
        ]

    q, a, sent = [], [], []
    for idx in range(0, len(data1)):
        row = data1[idx]
        first_vectors = embed(row[0])
        second_vectors = embed(row[1])
        q.append(np.vstack(first_vectors))
        a.append(np.vstack(second_vectors))
        sent.append(data1[idx][1])

    return np.asarray(q), np.asarray(a), np.asarray(sent)
def encode_document(doc, vocab_2_idx, sos='<sos>', eos='<eos>'):
    '''
    Encode a space-separated document as a list of vocabulary indices.

    Each word goes through three passes: ``u.process_word`` with its
    default settings (trigrams, per the original comments), then
    ``u.process_word(..., n=2)`` for every piece whose second field is
    falsy (bigrams), and finally a split into single characters for the
    pieces whose second field is still falsy (unigrams).

    Params:
        * doc : string, document to encode; tokens are separated by a
          single space
        * vocab_2_idx : dictionary, string to index; it is indexed with
          ' ', `sos` and `eos` as well as with the produced pieces
        * sos (optional) : string, Start Of Sentence token
        * eos (optional) : string, End Of Sentence token
    Returns:
        * doc_encoded : list of values from vocab_2_idx: the `sos` entry,
          the encoded words separated by the ' ' entry, then the `eos`
          entry
    '''
    doc_encoded = []
    for word in doc.split(' '):
        # Trigram pass.
        pieces = u.process_word(word, vocab_2_idx)

        # Bigram pass: replace, in place, every piece flagged falsy.
        for pos, piece in enumerate(pieces):
            if piece[1]:
                continue
            pieces[pos] = u.process_word(piece[0], vocab_2_idx, n=2)
        pieces = u.flat_list(pieces)

        # Unigram pass: anything still flagged falsy becomes characters.
        expanded = []
        for piece in pieces:
            if piece[1]:
                expanded.append(piece)
            else:
                expanded.append([(char, True) for char in piece[0]])
        pieces = u.flat_list(expanded)

        doc_encoded += [vocab_2_idx[token] for token, _ in pieces]
        doc_encoded.append(vocab_2_idx[' '])

    # The slice drops the separator appended after the last word.
    doc_encoded = [vocab_2_idx[sos]] + doc_encoded[:-1] + [vocab_2_idx[eos]]
    return doc_encoded
def _process_input(self, data_raw): questions = [] inputs = [] answers = [] gates = [] input_masks = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] inp_vector = [ utils.process_word(word=w, word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab, word_vector_size=self.word_vector_size, to_return="word2vec") for w in inp ] q_vector = [ utils.process_word(word=w, word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab, word_vector_size=self.word_vector_size, to_return="word2vec") for w in q ] inputs.append( np.vstack(inp_vector).astype(floatX)) #(seq_len, vocab) questions.append(np.vstack(q_vector).astype(floatX)) answers.append( utils.process_word( word=x["A"], # TODO: add .lower() here word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab, word_vector_size=self.word_vector_size, to_return="index")) # NOTE: here we assume the answer is one word! if self.input_mask_mode == 'word': mask = [index for index, w in enumerate(inp)] elif self.input_mask_mode == 'sentence': mask = [index for index, w in enumerate(inp) if w == '.'] else: raise Exception("invalid input_mask_mode") mask.append(mask[-1] + 1) input_masks.append(np.array(mask, dtype=np.int32)) gate = [w for w in x["S"]] for i in xrange(len(gate), self.memory_hops): gate.append(len(mask) - 1) gates.append(np.array(gate, dtype=np.int32)) return inputs, questions, answers, input_masks, gates
def _process_input(data_raw): questions = [] inputs = [] answers = [] input_masks = [] supp_fact = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] sf = x["SF"] SF = [] for i in range(0,hops): if i<len(sf): SF.append(sf[i]) else: SF.append(sf[len(sf)-1]) inp_vector = [utils.process_word(word = w, word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "word2vec") for w in inp] q_vector = [utils.process_word(word = w, word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "word2vec") for w in q] inputs.append(np.vstack(inp_vector)) questions.append(np.vstack(q_vector)) supp_fact.append(SF) answers.append(utils.process_word(word = x["A"], word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "index")) # NOTE: here we assume the answer is one word! if input_mask_mode == 'word': input_masks.append(np.array([index for index, w in enumerate(inp)], dtype=np.int32)) elif input_mask_mode == 'sentence': input_masks.append(np.array([index for index, w in enumerate(inp) if w == '.'], dtype=np.int32)) else: raise Exception("invalid input_mask_mode") inputs = np.asarray(inputs) questions = np.asarray(questions) answers = np.asarray(answers) input_masks = np.asarray(input_masks) supp_fact = np.asarray(supp_fact) return inputs, questions, answers, input_masks, supp_fact
def _process_input(self, data_raw): questions = [] inputs = [] answers = [] fact_counts = [] input_masks = [] img_features=[] for x in data_raw: #inp = x["C"].lower().split(' ') x["C"]=x["C"].lower() inp=re.split("[, \-!?:'\/]+",x["C"]) inp = [w for w in inp if len(w) > 0] x["Q"]=x["Q"].lower() q = re.split("[, \-!?:'\/]+",x["Q"]) q = [w for w in q if len(w) > 0] inp_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec") for w in inp] q_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec") for w in q] if (self.input_mask_mode == 'word'): input_mask = range(len(inp)) elif (self.input_mask_mode == 'sentence'): input_mask = [index for index, w in enumerate(inp) if w == '.'] else: raise Exception("unknown input_mask_mode") fact_count = len(input_mask) inputs.append(inp_vector) questions.append(q_vector) # NOTE: here we assume the answer is one word! #answers.append(utils.process_word(word = x["A"], # word2vec = self.word2vec, # vocab = self.vocab, # ivocab = self.ivocab, # word_vector_size = self.word_vector_size, # to_return = "index")) answers.append(x["A"]) fact_counts.append(fact_count) input_masks.append(input_mask) img_features.append(x["I"]) return inputs, questions, answers, fact_counts, input_masks , img_features
def _process_input(self, data_raw): questions = [] inputs = [] answers = [] fact_counts = [] input_masks = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] inp_vector = [ utils.process_word(word=w, word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab, word_vector_size=self.word_vector_size, to_return="word2vec") for w in inp ] q_vector = [ utils.process_word(word=w, word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab, word_vector_size=self.word_vector_size, to_return="word2vec") for w in q ] if (self.input_mask_mode == 'word'): input_mask = range(len(inp)) elif (self.input_mask_mode == 'sentence'): input_mask = [index for index, w in enumerate(inp) if w == '.'] else: raise Exception("unknown input_mask_mode") fact_count = len(input_mask) inputs.append(inp_vector) questions.append(q_vector) # NOTE: here we assume the answer is one word! answers.append( utils.process_word(word=x["A"], word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab, word_vector_size=self.word_vector_size, to_return="index")) fact_counts.append(fact_count) input_masks.append(input_mask) return inputs, questions, answers, fact_counts, input_masks
def _process_input(self, data_raw): questions = [] inputs = [] answers = [] input_masks = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] inp_vector = [ utils.process_word(word=w, word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab, word_vector_size=self.word_vector_size, to_return="word2vec") for w in inp ] q_vector = [ utils.process_word(word=w, word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab, word_vector_size=self.word_vector_size, to_return="word2vec") for w in q ] inputs.append(np.vstack(inp_vector).astype(floatX)) questions.append(np.vstack(q_vector).astype(floatX)) answers.append( utils.process_word( word=x["A"], # TODO: add .lower() here word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab, word_vector_size=self.word_vector_size, to_return="index")) # NOTE: here we assume the answer is one word! if self.input_mask_mode == 'word': input_masks.append( np.array([index for index, w in enumerate(inp)], dtype=np.int32)) elif self.input_mask_mode == 'sentence': input_masks.append( np.array( [index for index, w in enumerate(inp) if w == '.'], dtype=np.int32)) else: raise Exception("invalid input_mask_mode") return inputs, questions, answers, input_masks
def _process_input(self, data_raw): ''' This module processes the raw data input and grabs all the relevant sections and calculates the input_mask. Args: data_raw: raw data coming in from main class. Returns: inputs section, answers section, questions section, and input_masks as numpy arrays. ''' inputs = [] answers = [] input_masks = [] questions = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] # Process the words from the input, answers, and questions to see what needs a new vector in word2vec. inp_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec", silent=True) for w in inp] q_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec", silent=True) for w in q] inputs.append(np.vstack(inp_vector).astype(floatX)) questions.append(np.vstack(q_vector).astype(floatX)) answers.append(utils.process_word(word = x["A"], word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "index")) # NOTE: here we assume the answer is one word! if self.input_mask_mode == 'word': input_masks.append(np.array([index for index, w in enumerate(inp)], dtype=np.int32)) # Get the input_masks for the data elif self.input_mask_mode == 'sentence': input_masks.append(np.array([index for index, w in enumerate(inp) if w == '.'], dtype=np.int32)) else: raise Exception("invalid input_mask_mode") return inputs, questions, answers, input_masks
def _process_input(self, data_raw): ''' This module processes the raw data input and grabs all the relevant sections and calculates the input_mask. Args: data_raw: raw data coming in from main class. Returns: inputs section, answers section, questions section, and input_masks as numpy arrays. ''' inputs = [] answers = [] input_masks = [] questions = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] # Process the words from the input, answers, and questions to see what needs a new vector in word2vec. inp_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec") for w in inp] q_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec") for w in q] inputs.append(np.vstack(inp_vector).astype(floatX)) questions.append(np.vstack(q_vector).astype(floatX)) answers.append(utils.process_word(word = x["A"], word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "index")) # NOTE: here we assume the answer is one word! if self.input_mask_mode == 'word': input_masks.append(np.array([index for index, w in enumerate(inp)], dtype=np.int32)) # Get the input_masks for the data elif self.input_mask_mode == 'sentence': input_masks.append(np.array([index for index, w in enumerate(inp) if w == '.'], dtype=np.int32)) else: raise Exception("invalid input_mask_mode") return inputs, questions, answers, input_masks
def _process_input(self, data_raw): questions = [] inputs = [] answers = [] gates = [] input_masks = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] inp_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec") for w in inp] q_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec") for w in q] inputs.append(np.vstack(inp_vector).astype(floatX)) #(seq_len, vocab) questions.append(np.vstack(q_vector).astype(floatX)) answers.append(utils.process_word(word = x["A"], # TODO: add .lower() here word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "index")) # NOTE: here we assume the answer is one word! if self.input_mask_mode == 'word': mask = [index for index, w in enumerate(inp)] elif self.input_mask_mode == 'sentence': mask = [index for index, w in enumerate(inp) if w == '.'] else: raise Exception("invalid input_mask_mode") mask.append(mask[-1]+1) input_masks.append(np.array(mask, dtype=np.int32)) gate =[w for w in x["S"]] for i in xrange(len(gate),self.memory_hops): gate.append(len(mask)-1) gates.append(np.array(gate,dtype=np.int32)) return inputs, questions, answers, input_masks, gates
def _process_input(self, data_raw): questions = [] inputs = [] answers = [] input_masks = [] for x in data_raw: inp = x["C"].lower().split(' ') inp = [w for w in inp if len(w) > 0] q = x["Q"].lower().split(' ') q = [w for w in q if len(w) > 0] inp_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec") for w in inp] q_vector = [utils.process_word(word = w, word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "word2vec") for w in q] inputs.append(np.vstack(inp_vector).astype(floatX)) questions.append(np.vstack(q_vector).astype(floatX)) answers.append(utils.process_word(word = x["A"], word2vec = self.word2vec, vocab = self.vocab, ivocab = self.ivocab, word_vector_size = self.word_vector_size, to_return = "index")) # print("Context\n ---------------------------------") # print( x["C"] ) # print("Question \n -------------------------------") # print( x["Q"] ) # print("Answer \n----------------------------------") # print( x["A"] ) # wait=raw_input("Press Enter To Continue") # NOTE: here we assume the answer is one word! if self.input_mask_mode == 'word': input_masks.append(np.array([index for index, w in enumerate(inp)], dtype=np.int32)) elif self.input_mask_mode == 'sentence': input_masks.append(np.array([index for index, w in enumerate(inp) if w == '.'], dtype=np.int32)) else: raise Exception("invalid input_mask_mode") return inputs, questions, answers, input_masks
def process_input(data1):
    """Vectorise sentence pairs and build a 5-bin target per score.

    Row 0 of ``data1`` is skipped. For every other row, columns 0 and 1
    are stripped, passed through ``repl``, lower-cased, split on single
    spaces and looked up with ``utils.process_word`` (module-level
    ``word2vec``, ``vocab``, ``ivocab``, ``word_vector_size``).

    Target for a score ``s`` with ``f = floor(s)``: every bin starts at
    1e-6; ``s == 5.0`` sets bin 4 to ``1 - 4e-6``; otherwise bin ``f``
    gets ``s - f + 1e-6`` and bin ``f - 1`` gets ``1 + f - s + 1e-6``.
    NOTE(review): for a score below 1 the second index is -1, i.e. the
    last bin.

    Args:
        data1: indexable sequence of rows ``(sentence1, sentence2,
            score, ...)``; ``score`` must be accepted by ``float``.

    Returns:
        ``(sents1, sents2, sim)`` as ``np.asarray`` results.
    """
    def embed(text):
        cleaned = repl(text.strip())
        tokens = [tok for tok in cleaned.lower().split(' ') if tok]
        return [
            utils.process_word(word=tok,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True)
            for tok in tokens
        ]

    sents1, sents2, sim = [], [], []
    for idx in range(1, len(data1)):
        row = data1[idx]
        first_text = row[0]
        second_text = row[1]
        score = float(row[2])

        first_vectors = embed(first_text)
        second_vectors = embed(second_text)
        sents1.append(np.vstack(first_vectors))
        sents2.append(np.vstack(second_vectors))

        minn = 0.000001
        target = np.array([minn] * 5)
        if score == 5.0:
            target[4] = 1.0 - 4 * minn
        else:
            lower = np.floor(score)
            target[int(lower)] = score - lower + minn
            target[int(lower) - 1] = 1 + lower - score + minn
        sim.append(target)

    return np.asarray(sents1), np.asarray(sents2), np.asarray(sim)
def process_input_3(data1, data2):
    """Pair every question in ``data1`` with every answer in ``data2``.

    Column 0 of each row is stripped, passed through ``repl``,
    lower-cased, split on single spaces and looked up with
    ``utils.process_word`` (module-level ``word2vec``, ``vocab``,
    ``ivocab``, ``word_vector_size``). Answers are re-vectorised for
    every question.

    Args:
        data1: indexable sequence of rows whose column 0 is a question.
        data2: indexable sequence of rows whose column 0 is an answer.

    Returns:
        ``(question, answer)`` as ``np.asarray`` results with
        ``len(data1) * len(data2)`` aligned entries.
    """
    def embed(text):
        cleaned = repl(text.strip())
        tokens = [tok for tok in cleaned.lower().split(' ') if tok]
        return [
            utils.process_word(word=tok,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True)
            for tok in tokens
        ]

    question, answer = [], []
    for q_idx in range(0, len(data1)):
        q_vector = embed(data1[q_idx][0])
        for a_idx in range(0, len(data2)):
            a_vector = embed(data2[a_idx][0])
            # A fresh stacked copy of the question per pair.
            question.append(np.vstack(q_vector))
            answer.append(np.vstack(a_vector))

    question = np.asarray(question)
    answer = np.asarray(answer)
    print("processing data done ! ********************************")
    return question, answer
def process_input(data1): sents1 = [] sents2 = [] sim = [] for i in range(1,len(data1)): s1 = data1[i][0] s2 = data1[i][1] score = float(data1[i][2]) s1 = s1.strip() s1 = repl(s1) s1 = s1.lower().split(' ') s1 = [w for w in s1 if len(w) > 0] s1_vector = [utils.process_word(word = w, word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "word2vec",silent=True) for w in s1] s2 = s2.strip() s2 = repl(s2) s2 = s2.lower().split(' ') s2 = [w for w in s2 if len(w) > 0] s2_vector = [utils.process_word(word = w, word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "word2vec",silent=True) for w in s2] sents1.append(np.vstack(s1_vector)) sents2.append(np.vstack(s2_vector)) minn = 0.000001 p = np.array([minn,minn,minn,minn,minn]) if score == 5.0: p[4] = 1.0 - 4*minn else: p[int(np.floor(score))] = score - np.floor(score) + minn p[int(np.floor(score)) - 1] = 1 + np.floor(score) - score + minn sim.append(p) sents1 = np.asarray(sents1) sents2 = np.asarray(sents2) sim = np.asarray(sim) return sents1,sents2,sim
def process_input_3(data1,data2): question = [] answer = [] for i in range(0,len(data1)): lines = data1[i] q = lines[0] q = q.strip() q = repl(q) q = q.lower().split(' ') q = [w for w in q if len(w) > 0] q_vector = [utils.process_word(word = w, word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "word2vec",silent=True) for w in q] for j in range(0,len(data2)): a = data2[j][0] a = a.strip() a = repl(a) a = a.lower().split(' ') a = [w for w in a if len(w) > 0] a_vector = [utils.process_word(word = w, word2vec = word2vec, vocab = vocab, ivocab = ivocab, word_vector_size = word_vector_size, to_return = "word2vec",silent=True) for w in a] question.append(np.vstack(q_vector)) answer.append(np.vstack(a_vector)) question = np.asarray(question) answer = np.asarray(answer) print "processing data done ! ********************************" return question,answer
def process_input(data1):
    """Vectorise sentence pairs and rescale their scores.

    The first row of ``data1`` is skipped. Sentences (columns 0 and 1)
    are stripped, passed through ``repl``, lower-cased, split on single
    spaces and looked up with ``utils.process_word`` using the
    module-level ``word2vec``, ``vocab``, ``ivocab`` and
    ``word_vector_size``. Each score ``s`` becomes ``(s - 1) / 4.0``.

    Args:
        data1: indexable sequence of rows ``(sentence1, sentence2,
            score, ...)``; ``score`` must be accepted by ``float``.

    Returns:
        ``(sents1, sents2, sim)`` as ``np.asarray`` results.
    """
    def embed(text):
        cleaned = repl(text.strip())
        tokens = [tok for tok in cleaned.lower().split(' ') if tok]
        return [
            utils.process_word(word=tok,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True)
            for tok in tokens
        ]

    sents1, sents2, sim = [], [], []
    for idx in range(1, len(data1)):
        row = data1[idx]
        first_text = row[0]
        second_text = row[1]
        score = float(row[2])

        first_vectors = embed(first_text)
        second_vectors = embed(second_text)
        sents1.append(np.vstack(first_vectors))
        sents2.append(np.vstack(second_vectors))

        sim.append((score - 1) / 4.0)

    return np.asarray(sents1), np.asarray(sents2), np.asarray(sim)
def process_input(data1, data2):
    """Build one positive and five sampled negative pairs per question.

    ``data1`` and ``data2`` are zipped and shuffled together. For each of
    the first 167 shuffled rows, the question (``data1`` column 0) is
    paired with its own answer (``data1`` column 1, label 1) and with
    five answers drawn from ``data2`` column 0 at indices sampled from
    ``range(0, 260)`` (label 0). A draw equal to the current row index is
    redrawn once; the redraw is used even if it collides again.

    Text is stripped, passed through ``repl``, lower-cased, split on
    single spaces and looked up with ``utils.process_word`` using the
    module-level ``word2vec``, ``vocab``, ``ivocab`` and
    ``word_vector_size``.

    Args:
        data1: sequence of rows ``(question, answer, ...)``.
        data2: sequence of rows whose column 0 is a candidate answer.

    Returns:
        ``(question, answer, truth)`` as ``np.asarray`` results.
    """
    def clean(text):
        return repl(text.strip())

    def embed(cleaned):
        tokens = [tok for tok in cleaned.lower().split(' ') if tok]
        return [
            utils.process_word(word=tok,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True)
            for tok in tokens
        ]

    question, answer, truth = [], [], []
    cnt = 0  # counts redraws that collided again; never returned

    combined = list(zip(data1, data2))
    random.shuffle(combined)
    data1, data2 = zip(*combined)

    for i in range(0, 167):
        lines = data1[i]
        q_text = lines[0]
        a_text = lines[1]
        rand = random.sample(range(0, 260), 5)

        q_text = clean(q_text)
        a_text = clean(a_text)
        truth.append(1)

        q_vector = embed(q_text)
        question.append(np.vstack(q_vector))
        answer.append(np.vstack(embed(a_text)))

        for pick in rand:
            # The first candidate is always vectorised, even when it is
            # replaced right after.
            a_vector = embed(clean(data2[pick][0]))
            if pick == i:
                redraw = random.sample(range(0, 260), 1)[0]
                a_vector = embed(clean(data2[redraw][0]))
                if redraw == i:
                    cnt = cnt + 1
            question.append(np.vstack(q_vector))
            answer.append(np.vstack(a_vector))
            truth.append(0)

    question = np.asarray(question)
    answer = np.asarray(answer)
    truth = np.asarray(truth)
    print("processing data done ! ********************************")
    return question, answer, truth
def process_input_2(data):
    """Assemble a shuffled set of 700 positive and 2100 negative pairs.

    Row 0 of ``data`` is skipped. For every other row, column 1 is the
    question, column 5 the answer and column 6 the label; rows whose
    label is neither "0" nor "1" are ignored. Text is stripped, passed
    through ``repl``, lower-cased, split on single spaces and looked up
    with ``utils.process_word`` using the module-level ``word2vec``,
    ``vocab``, ``ivocab`` and ``word_vector_size``.

    Each label group is shuffled with ``shuffle``; the first 700 "1"
    entries and the first 2100 "0" entries are concatenated and shuffled
    twice more.

    Args:
        data: indexable sequence of rows with at least seven columns.

    Returns:
        ``(questions, answers, truth)`` as ``np.asarray`` results.
    """
    def embed(text):
        cleaned = repl(text.strip())
        tokens = [tok for tok in cleaned.lower().split(' ') if tok]
        return [
            utils.process_word(word=tok,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True)
            for tok in tokens
        ]

    questions1, answers1, truth1 = [], [], []
    questions0, answers0, truth0 = [], [], []
    questions, answers, truth = [], [], []

    for idx in range(1, len(data)):
        lines = data[idx]
        q_text = lines[1]
        a_text = lines[5]
        label = lines[6]

        if label == "0":
            bucket_q, bucket_a, bucket_t, flag = questions0, answers0, truth0, 0
        elif label == "1":
            bucket_q, bucket_a, bucket_t, flag = questions1, answers1, truth1, 1
        else:
            continue

        bucket_t.append(flag)
        bucket_q.append(np.vstack(embed(q_text)))
        bucket_a.append(np.vstack(embed(a_text)))

    questions1 = np.asarray(questions1)
    answers1 = np.asarray(answers1)
    truth1 = np.asarray(truth1)
    questions1, answers1, truth1 = shuffle(questions1, answers1, truth1)

    questions0 = np.asarray(questions0)
    answers0 = np.asarray(answers0)
    truth0 = np.asarray(truth0)
    questions0, answers0, truth0 = shuffle(questions0, answers0, truth0)

    quotas = (
        (questions1, answers1, truth1, 700),
        (questions0, answers0, truth0, 2100),
    )
    for src_q, src_a, src_t, quota in quotas:
        for k in range(0, quota):
            questions.append(src_q[k])
            answers.append(src_a[k])
            truth.append(src_t[k])

    questions, answers, truth = shuffle(questions, answers, truth)
    questions, answers, truth = shuffle(questions, answers, truth)

    questions = np.asarray(questions)
    answers = np.asarray(answers)
    truth = np.asarray(truth)
    print("processing data done ! ********************************")
    return questions, answers, truth
def _process_input(self, data_raw):
    """Convert raw samples into arrays padded to a common size.

    A first pass tokenises every sample, builds its input mask and
    records the batch maxima; a second pass vectorises and pads. The
    tokenised samples are cached between the passes, so ``data_raw`` is
    iterated only once.

    Args:
        data_raw: iterable of dicts with keys "C" (context), "Q"
            (question) and "A" (answer).

    Returns:
        ``(inputs, questions, answers, fact_counts, input_masks)`` as
        numpy arrays: inputs / questions cast to ``floatX`` and padded
        with ``self._empty_word_vector()``; answers, fact_counts and
        input_masks cast to int32, with masks padded with -1 up to
        ``self.max_fact_count``.

    Side effects:
        Sets ``self.max_fact_count`` to the largest fact count seen.

    Raises:
        Exception: if ``self.input_mask_mode`` is neither 'word' nor
            'sentence' (only checked when there is at least one sample).
    """
    def embed(token):
        return utils.process_word(word=token,
                                  word2vec=self.word2vec,
                                  vocab=self.vocab,
                                  ivocab=self.ivocab,
                                  word_vector_size=self.word_vector_size,
                                  to_return="word2vec")

    max_inp_len = 0
    max_q_len = 0
    self.max_fact_count = 0

    prepared = []
    for x in data_raw:
        inp = [w for w in x["C"].lower().split(' ') if w]
        q = [w for w in x["Q"].lower().split(' ') if w]

        if self.input_mask_mode == 'word':
            # A real list: it is padded below, and a Python 3 range
            # object has no append/extend.
            input_mask = list(range(len(inp)))
        elif self.input_mask_mode == 'sentence':
            input_mask = [index for index, w in enumerate(inp) if w == '.']
        else:
            raise Exception("unknown input_mask_mode")

        max_inp_len = max(max_inp_len, len(inp))
        max_q_len = max(max_q_len, len(q))
        self.max_fact_count = max(self.max_fact_count, len(input_mask))
        prepared.append((x, inp, q, input_mask))

    questions = []
    inputs = []
    answers = []
    fact_counts = []
    input_masks = []
    for x, inp, q, input_mask in prepared:
        # The fact count is the mask length before padding.
        fact_count = len(input_mask)

        inp_vector = [embed(w) for w in inp]
        while len(inp_vector) < max_inp_len:
            inp_vector.append(self._empty_word_vector())

        q_vector = [embed(w) for w in q]
        while len(q_vector) < max_q_len:
            q_vector.append(self._empty_word_vector())

        input_mask.extend([-1] * (self.max_fact_count - len(input_mask)))

        inputs.append(inp_vector)
        questions.append(q_vector)
        # NOTE: here we assume the answer is one word!
        answers.append(
            utils.process_word(word=x["A"],
                               word2vec=self.word2vec,
                               vocab=self.vocab,
                               ivocab=self.ivocab,
                               word_vector_size=self.word_vector_size,
                               to_return="index"))
        fact_counts.append(fact_count)
        input_masks.append(input_mask)

    inputs = np.array(inputs).astype(floatX)
    questions = np.array(questions).astype(floatX)
    answers = np.array(answers).astype(np.int32)
    fact_counts = np.array(fact_counts).astype(np.int32)
    input_masks = np.array(input_masks).astype(np.int32)
    return inputs, questions, answers, fact_counts, input_masks
def _process_input(self, data_raw):
    """Convert raw samples into padded word-index arrays, per sentence.

    A first pass splits each context into sentences with the NLTK punkt
    tokenizer and records the batch maxima; a second pass looks up word
    indices and pads with 0. ``data_raw`` is iterated twice.

    Args:
        data_raw: re-iterable collection of dicts with keys "C" (context
            string), "Q" (question string) and "A" (answer).

    Returns:
        ``(inputs, questions, answers, input_masks)``: inputs and
        questions are numpy arrays cast to ``floatX``; answers is a list
        of ``utils.process_word(..., "index")`` results; input_masks is a
        list of float32 arrays.

    Side effects:
        * ``x["C"]`` is replaced in place by its list of sentences, so
          the same samples cannot be processed a second time.
        * Sets ``self.max_fact_count``, ``self.max_inp_sent_len`` and
          ``self.max_q_len``.

    Raises:
        Exception: if ``self.input_mask_mode`` is neither 'word' nor
            'sentence'.
    """
    def to_index(word):
        return utils.process_word(word=word,
                                  word2vec=self.word2vec,
                                  vocab=self.vocab,
                                  ivocab=self.ivocab,
                                  word_vector_size=self.word_vector_size,
                                  to_return="index")

    # Integer counters: they are used as list multipliers and stored as
    # sizes, so they must never remain floats.
    max_inp_sent_len = 0
    max_inp_num_sents = 0
    max_q_len = 0
    self.max_fact_count = 0

    # Loaded once, and only when there is at least one sample.
    sent_detector = None
    for x in data_raw:
        if sent_detector is None:
            sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        # This splits the context into sentences (stored back on x).
        x["C"] = sent_detector.tokenize(x["C"])

        inp = []
        for sentence in x["C"]:
            words = [w for w in sentence.lower().split(' ') if len(w) > 0]
            inp.append(words)
            max_inp_sent_len = max(max_inp_sent_len, len(words))

        q = [w for w in x["Q"].lower().split(' ') if len(w) > 0]

        if self.input_mask_mode == 'word':
            fact_count = len(inp)
        elif self.input_mask_mode == 'sentence':
            # NOTE(review): entries of inp are word lists, so this
            # comparison with '.' never matches -- confirm intent.
            fact_count = len([0 for w in inp if w == '.'])
        else:
            raise Exception("unknown input_mask_mode")

        max_inp_num_sents = max(max_inp_num_sents, len(inp))
        max_q_len = max(max_q_len, len(q))
        self.max_fact_count = max(self.max_fact_count, fact_count)

    questions = []
    inputs = []
    answers = []
    input_masks = []
    for x in data_raw:
        # x["C"] already holds the sentence list from the first pass.
        inp = [
            [w for w in sentence.lower().split(' ') if len(w) > 0]
            for sentence in x["C"]
        ]
        q = [w for w in x["Q"].lower().split(' ') if len(w) > 0]

        inp_vector = []
        for words in inp:
            indices = [to_index(w) for w in words]
            # Pad each sentence to the longest sentence in the batch.
            indices.extend([0] * (max_inp_sent_len - len(indices)))
            inp_vector.append(indices)
        # Pad each sample to the largest sentence count in the batch.
        while len(inp_vector) < max_inp_num_sents:
            inp_vector.append([0] * max_inp_sent_len)

        q_vector = [to_index(w) for w in q]
        q_vector.extend([0] * (max_q_len - len(q_vector)))

        inputs.append(inp_vector)
        questions.append(q_vector)
        # NOTE: here we assume the answer is one word!
        answers.append(to_index(x["A"]))

        if self.input_mask_mode == 'word':
            input_masks.append(
                np.array([index for index, w in enumerate(inp)],
                         dtype=np.float32))
        elif self.input_mask_mode == 'sentence':
            # NOTE(review): same comparison as above; the result is
            # always empty for list entries.
            input_masks.append(
                np.array([index for index, w in enumerate(inp) if w == '.'],
                         dtype=np.float32))
        else:
            raise Exception("invalid input_mask_mode")

    inputs = np.array(inputs).astype(floatX)
    questions = np.array(questions).astype(floatX)
    self.max_inp_sent_len = max_inp_sent_len
    self.max_q_len = max_q_len
    return inputs, questions, answers, input_masks
def _process_input(self, data_raw):
    """Convert raw samples into padded word-vector arrays plus fact masks.

    Context ``x["C"]`` and question ``x["Q"]`` are lower-cased and split on
    single spaces; each token is passed through
    ``utils.process_word(..., to_return="word2vec")``.  Contexts and
    questions are padded with ``self._empty_word_vector()`` up to the longest
    context / question in ``data_raw``.  The input mask holds every token
    position (``'word'`` mode) or the positions of ``'.'`` tokens
    (``'sentence'`` mode) and is padded with ``-1`` up to the largest mask.

    Side effects:
        Sets ``self.max_fact_count`` to the largest unpadded mask length.

    Args:
        data_raw: sequence of dicts with keys ``"C"``, ``"Q"`` and ``"A"``.
            The answer is looked up as a single word.

    Returns:
        Tuple ``(inputs, questions, answers, fact_counts, input_masks)`` of
        numpy arrays; the first two are cast to ``floatX``, the remaining
        three to ``np.int32``.  ``fact_counts`` holds the unpadded mask
        lengths.

    Raises:
        Exception: if ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'``.
    """

    def tokens(text):
        # Drop the empty strings produced by consecutive spaces.
        return [w for w in text.lower().split(' ') if len(w) > 0]

    def lookup(word, to_return):
        return utils.process_word(word=word,
                                  word2vec=self.word2vec,
                                  vocab=self.vocab,
                                  ivocab=self.ivocab,
                                  word_vector_size=self.word_vector_size,
                                  to_return=to_return)

    max_inp_len = 0
    max_q_len = 0
    self.max_fact_count = 0

    # Pass 1: tokenize, build the unpadded masks and find the padding sizes.
    prepared = []
    for x in data_raw:
        inp = tokens(x["C"])
        q = tokens(x["Q"])

        if self.input_mask_mode == 'word':
            # A real list (not a lazy range) because it is extended below.
            input_mask = list(range(len(inp)))
        elif self.input_mask_mode == 'sentence':
            input_mask = [index for index, w in enumerate(inp) if w == '.']
        else:
            raise Exception("unknown input_mask_mode")

        max_inp_len = max(max_inp_len, len(inp))
        max_q_len = max(max_q_len, len(q))
        self.max_fact_count = max(self.max_fact_count, len(input_mask))
        prepared.append((inp, q, input_mask))

    questions = []
    inputs = []
    answers = []
    fact_counts = []
    input_masks = []

    # Pass 2: look words up and pad.  Call order per sample is unchanged:
    # context words, context padding, question words, question padding,
    # answer.
    for x, (inp, q, input_mask) in zip(data_raw, prepared):
        fact_count = len(input_mask)  # recorded before the -1 padding

        inp_vector = [lookup(w, "word2vec") for w in inp]
        while len(inp_vector) < max_inp_len:
            inp_vector.append(self._empty_word_vector())

        q_vector = [lookup(w, "word2vec") for w in q]
        while len(q_vector) < max_q_len:
            q_vector.append(self._empty_word_vector())

        input_mask.extend([-1] * (self.max_fact_count - fact_count))

        inputs.append(inp_vector)
        questions.append(q_vector)
        # NOTE: here we assume the answer is one word!
        answers.append(lookup(x["A"], "index"))
        fact_counts.append(fact_count)
        input_masks.append(input_mask)

    inputs = np.array(inputs).astype(floatX)
    questions = np.array(questions).astype(floatX)
    answers = np.array(answers).astype(np.int32)
    fact_counts = np.array(fact_counts).astype(np.int32)
    input_masks = np.array(input_masks).astype(np.int32)

    return inputs, questions, answers, fact_counts, input_masks
def _process_input(self, data_raw):
    """Vectorise samples that carry two candidate answers (``A1``, ``A2``).

    For every sample the context ``"C"`` is first extended in place with
    ``sequence_classifier.get_sequence(...)`` applied to the lower-cased
    question followed by both candidates.  Context, question and the two
    candidates (with ``'.'`` characters removed) are lower-cased, split on
    single spaces and passed word by word through
    ``utils.process_word(..., to_return="word2vec", silent=True)``.

    Returns:
        Tuple ``(inputs, questions, answers, choices, input_masks)`` of
        per-sample lists: stacked ``floatX`` context and question matrices,
        the raw ``x['A']`` values, a two-element list of ``floatX`` arrays
        per sample, and int32 position arrays (all positions in ``'word'``
        mode, positions of ``'.'`` tokens in ``'sentence'`` mode).

    Raises:
        Exception: if ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'``.
    """

    def split_words(text):
        # Empty strings (from repeated spaces) are discarded.
        return [tok for tok in text.split(' ') if len(tok) > 0]

    def embed(words):
        return [
            utils.process_word(word=tok,
                               word2vec=self.word2vec,
                               vocab=self.vocab,
                               ivocab=self.ivocab,
                               word_vector_size=self.word_vector_size,
                               to_return="word2vec",
                               silent=True) for tok in words
        ]

    inputs, questions, choices, answers, input_masks = [], [], [], [], []

    for sample in data_raw:
        # The context stored in the sample itself is extended here.
        sample["C"] += sequence_classifier.get_sequence(
            sample["Q"].lower() + ' ' + sample["A1"] + ' ' + sample["A2"])

        context_words = split_words(sample["C"].lower())
        question_words = split_words(sample["Q"].lower())
        candidate_words = [
            split_words(sample[key].replace('.', '').lower())
            for key in ("A1", "A2")
        ]

        context_matrix = np.vstack(embed(context_words)).astype(floatX)
        question_matrix = np.vstack(embed(question_words)).astype(floatX)
        candidate_arrays = [
            np.array(embed(words), dtype=floatX) for words in candidate_words
        ]

        inputs.append(context_matrix)
        questions.append(question_matrix)
        answers.append(sample['A'])
        choices.append(candidate_arrays)

        # TODO: here we assume the answer is one word!
        mode = self.input_mask_mode
        if mode == 'word':
            mask = np.arange(len(context_words), dtype=np.int32)
        elif mode == 'sentence':
            stops = [pos for pos, tok in enumerate(context_words) if tok == '.']
            mask = np.array(stops, dtype=np.int32)
        else:
            raise Exception("invalid input_mask_mode")
        input_masks.append(mask)

    return inputs, questions, answers, choices, input_masks
def process_input(data1, data2, num_pairs=167, pool_size=260, num_negatives=5):
    """Build (question, answer, label) training triples with sampled negatives.

    ``data1`` and ``data2`` are shuffled together (pairs stay aligned).  For
    each of the first ``num_pairs`` rows of ``data1`` the question
    (``row[0]``) and answer (``row[1]``) form a positive example labelled
    ``1``.  ``num_negatives`` indices are then drawn from
    ``range(pool_size)`` and ``data2[index][0]`` is paired with the same
    question and labelled ``0``.  An index equal to the current row index is
    re-drawn once (the replacement is not checked again).

    Text is stripped, passed through ``repl``, lower-cased, split on single
    spaces and looked up word by word with
    ``utils.process_word(..., to_return="word2vec", silent=True)`` using the
    module-level ``word2vec``, ``vocab``, ``ivocab`` and ``word_vector_size``.

    Args:
        data1: sequence of rows; ``row[0]`` is the question text and
            ``row[1]`` the answer text.
        data2: sequence of rows aligned with ``data1``; ``row[0]`` is used as
            negative answer text.
        num_pairs: number of leading (shuffled) rows to use as positives.
        pool_size: negatives are drawn from ``data2[0:pool_size]``.
        num_negatives: negatives generated per positive.

    Returns:
        Tuple ``(question, answer, truth)`` converted with ``np.asarray``.
    """

    def to_vectors(text):
        words = [w for w in repl(text.strip()).lower().split(' ') if len(w) > 0]
        return [
            utils.process_word(word=w,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True) for w in words
        ]

    question = []
    answer = []
    truth = []

    # zip() is lazy on Python 3 and cannot be shuffled in place; materialise it.
    combined = list(zip(data1, data2))
    random.shuffle(combined)
    data1, data2 = zip(*combined)

    for i in range(0, num_pairs):
        lines = data1[i]
        q = lines[0]
        a = lines[1]
        rand = random.sample(range(0, pool_size), num_negatives)

        # Positive example.
        truth.append(1)
        q_vector = to_vectors(q)
        question.append(np.vstack(q_vector))
        a_vector = to_vectors(a)
        answer.append(np.vstack(a_vector))

        # Negative examples.
        for candidate in rand:
            # The first candidate is looked up even when it is about to be
            # replaced, so the sequence of process_word calls is unchanged
            # (TODO confirm whether process_word has side effects that make
            # this matter).
            a_vector = to_vectors(data2[candidate][0])
            if candidate == i:
                replacement = random.sample(range(0, pool_size), 1)[0]
                a_vector = to_vectors(data2[replacement][0])
            question.append(np.vstack(q_vector))
            answer.append(np.vstack(a_vector))
            truth.append(0)

    question = np.asarray(question)
    answer = np.asarray(answer)
    truth = np.asarray(truth)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print("processing data done ! ********************************")
    return question, answer, truth
def _process_input(self, data_raw):
    """Turn raw samples into zero-padded word-index arrays, one row per sentence.

    The context ``x["C"]`` of every sample is split into sentences with the
    NLTK Punkt tokenizer.  Each sentence and the question ``x["Q"]`` are
    lower-cased, split on single spaces, and every token is passed through
    ``utils.process_word(..., to_return="index")``.  Sentences are padded
    with ``0`` up to the longest sentence seen in ``data_raw``; every sample
    is padded with all-zero sentences up to the largest sentence count;
    questions are padded with ``0`` up to the longest question.

    Side effects:
        * ``x["C"]`` is replaced in place by its list of sentences (a second
          call on the same ``data_raw`` would therefore hand the tokenizer a
          list instead of a string).
        * Sets ``self.max_fact_count``, ``self.max_inp_sent_len`` and
          ``self.max_q_len``.

    Args:
        data_raw: sequence of dicts with keys ``"C"``, ``"Q"`` and ``"A"``.
            The answer is looked up as a single word.

    Returns:
        Tuple ``(inputs, questions, answers, input_masks)``: ``inputs`` and
        ``questions`` are numpy arrays cast to ``floatX``, ``answers`` is a
        list of ``process_word`` results, ``input_masks`` is a list of
        float32 numpy arrays.

    Raises:
        Exception: if ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'``.
    """

    def tokens(text):
        # Splitting on a single space leaves '' between consecutive spaces;
        # those empty strings are dropped.
        return [w for w in text.lower().split(' ') if len(w) > 0]

    def to_index(word):
        return utils.process_word(word=word,
                                  word2vec=self.word2vec,
                                  vocab=self.vocab,
                                  ivocab=self.ivocab,
                                  word_vector_size=self.word_vector_size,
                                  to_return="index")

    # Loading the sentence tokenizer does not depend on the sample, so it is
    # done once instead of once per iteration.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    # Integer counters: they are later used as list-repeat factors, which a
    # float zero would break.
    max_inp_sent_len = 0
    max_inp_num_sents = 0
    max_q_len = 0
    self.max_fact_count = 0

    # Pass 1: tokenize everything once and measure the padding targets.
    tokenized = []
    for x in data_raw:
        x["C"] = sent_detector.tokenize(x["C"])
        inp = [tokens(sentence) for sentence in x["C"]]
        q = tokens(x["Q"])

        for sentence in inp:
            max_inp_sent_len = max(max_inp_sent_len, len(sentence))

        if self.input_mask_mode == 'word':
            fact_count = len(inp)
        elif self.input_mask_mode == 'sentence':
            # NOTE(review): ``inp`` holds token *lists* here, so comparing an
            # element with '.' never matches and the count is always 0.
            # Kept as in the original; confirm the intended behaviour.
            fact_count = len([0 for w in inp if w == '.'])
        else:
            raise Exception("unknown input_mask_mode")

        max_inp_num_sents = max(max_inp_num_sents, len(inp))
        max_q_len = max(max_q_len, len(q))
        self.max_fact_count = max(self.max_fact_count, fact_count)
        tokenized.append((inp, q))

    questions = []
    inputs = []
    answers = []
    input_masks = []

    # Pass 2: look words up and pad.  The order of process_word calls
    # (context, then question, then answer) is the same as before.
    for x, (inp, q) in zip(data_raw, tokenized):
        inp_vector = []
        for sentence in inp:
            row = [to_index(w) for w in sentence]
            row.extend([0] * (max_inp_sent_len - len(row)))
            inp_vector.append(row)
        while len(inp_vector) < max_inp_num_sents:
            inp_vector.append([0] * max_inp_sent_len)

        q_vector = [to_index(w) for w in q]
        q_vector.extend([0] * (max_q_len - len(q_vector)))

        inputs.append(inp_vector)
        questions.append(q_vector)
        # NOTE: here we assume the answer is one word!
        answers.append(to_index(x["A"]))

        if self.input_mask_mode == 'word':
            input_masks.append(np.arange(len(inp), dtype=np.float32))
        elif self.input_mask_mode == 'sentence':
            # See the NOTE(review) in pass 1: this selection is always empty.
            input_masks.append(
                np.array([index for index, w in enumerate(inp) if w == '.'],
                         dtype=np.float32))
        else:
            raise Exception("invalid input_mask_mode")

    inputs = np.array(inputs).astype(floatX)
    questions = np.array(questions).astype(floatX)

    self.max_inp_sent_len = max_inp_sent_len
    self.max_q_len = max_q_len

    return inputs, questions, answers, input_masks
def _process_input(self, data_raw):
    """Vectorise raw samples without any padding.

    Context ``"C"`` and question ``"Q"`` are lower-cased, split on single
    spaces and every non-empty token is passed through
    ``utils.process_word(..., to_return="word2vec")``.  The answer ``"A"`` is
    looked up with ``to_return="index"``.

    Returns:
        Tuple ``(inputs, questions, answers, fact_counts, input_masks)`` of
        plain per-sample lists.  The mask is ``range(len(context))`` in
        ``"word"`` mode or the positions of ``"."`` tokens in ``"sentence"``
        mode; ``fact_counts`` holds the mask lengths.

    Raises:
        Exception: if ``self.input_mask_mode`` is neither ``"word"`` nor
            ``"sentence"``.
    """

    def words_of(text):
        lowered = text.lower()
        return [tok for tok in lowered.split(" ") if len(tok) > 0]

    def lookup(word, kind):
        return utils.process_word(
            word=word,
            word2vec=self.word2vec,
            vocab=self.vocab,
            ivocab=self.ivocab,
            word_vector_size=self.word_vector_size,
            to_return=kind,
        )

    inputs, questions, answers = [], [], []
    fact_counts, input_masks = [], []

    for sample in data_raw:
        context_words = words_of(sample["C"])
        question_words = words_of(sample["Q"])

        context_vectors = [lookup(tok, "word2vec") for tok in context_words]
        question_vectors = [lookup(tok, "word2vec") for tok in question_words]

        if self.input_mask_mode == "word":
            mask = range(len(context_words))
        elif self.input_mask_mode == "sentence":
            mask = [pos for pos, tok in enumerate(context_words) if tok == "."]
        else:
            raise Exception("unknown input_mask_mode")

        inputs.append(context_vectors)
        questions.append(question_vectors)
        # NOTE: here we assume the answer is one word!
        answers.append(lookup(sample["A"], "index"))
        fact_counts.append(len(mask))
        input_masks.append(mask)

    return inputs, questions, answers, fact_counts, input_masks
def _process_input(self, data_raw):
    """Vectorise four-way multiple-choice samples.

    The lower-cased context ``"C"`` must contain the marker tokens ``a>``,
    ``b>``, ``c>`` and ``d>`` in that order.  The tokens after each marker
    (up to the next marker) form one choice, cut after its first ``'.'`` as
    located by ``self._find_first``; everything before ``a>`` is the actual
    context.  Context, question and choices are passed word by word through
    ``utils.process_word(..., to_return="word2vec", silent=True)``.

    Returns:
        Tuple ``(inputs, questions, answers, choices, input_masks)`` of
        per-sample lists: stacked ``floatX`` context and question matrices,
        the answer as ``ord(x['A'][0]) - ord('A')``, four ``floatX`` arrays
        per sample, and int32 position arrays over the truncated context
        (all positions in ``'word'`` mode, ``'.'`` positions in
        ``'sentence'`` mode).

    Raises:
        AssertionError: if a marker is missing or the markers are out of order.
        Exception: if ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'``.
    """

    def split_words(text):
        return [tok for tok in text.lower().split(' ') if len(tok) > 0]

    def embed(words):
        return [
            utils.process_word(word=tok,
                               word2vec=self.word2vec,
                               vocab=self.vocab,
                               ivocab=self.ivocab,
                               word_vector_size=self.word_vector_size,
                               to_return="word2vec",
                               silent=True) for tok in words
        ]

    inputs, questions, choices, answers, input_masks = [], [], [], [], []

    for sample in data_raw:
        context_words = split_words(sample["C"])
        question_words = split_words(sample["Q"])

        # Positions of the four choice markers inside the context.
        marks = [
            self._find_first(context_words, marker)
            for marker in ('a>', 'b>', 'c>', 'd>')
        ]
        pa, pb, pc, pd = marks
        assert -1 not in (pa, pb, pc, pd) and pa < pb < pc < pd

        # Each choice runs from just after its marker to the next marker
        # (or to the end for the last one), then is cut after its first '.'.
        ends = marks[1:] + [None]
        option_words = []
        for start, stop in zip(marks, ends):
            span = context_words[start + 1:stop]
            option_words.append(span[:self._find_first(span, '.') + 1])

        context_words = context_words[:pa]

        context_matrix = np.vstack(embed(context_words)).astype(floatX)
        question_matrix = np.vstack(embed(question_words)).astype(floatX)
        option_arrays = [
            np.array(embed(words), dtype=floatX) for words in option_words
        ]

        inputs.append(context_matrix)
        questions.append(question_matrix)
        answers.append(ord(sample['A'][0]) - ord('A'))
        choices.append(option_arrays)

        # TODO: here we assume the answer is one word!
        mode = self.input_mask_mode
        if mode == 'word':
            mask = np.arange(len(context_words), dtype=np.int32)
        elif mode == 'sentence':
            stops = [pos for pos, tok in enumerate(context_words) if tok == '.']
            mask = np.array(stops, dtype=np.int32)
        else:
            raise Exception("invalid input_mask_mode")
        input_masks.append(mask)

    return inputs, questions, answers, choices, input_masks
# Training script: load vocabulary and features, build the dev/test/train
# datasets, then set up, train and evaluate a WImpModel and finally start its
# interactive shell.
import os

from annotation import AnnotationDataset
from utils import (create_folders, load_vocab, get_word_vectors,
                   get_feat_vectors, process_word, process_tag)
from wimp import WImpModel
from config import config


def _load_split(path):
    # All three splits share the vocabulary and the word/tag processors.
    return AnnotationDataset(path, vocab_words, word_processor, tag_processor)


create_folders(config.model_out)

vocab_words = load_vocab(config.words_vocab_path)
feats = get_feat_vectors(config.feats_file)

word_processor = process_word(vocab_words)
tag_processor = process_tag()

# Same construction order as before: dev, test, train.
dev = _load_split(config.dev_data)
test = _load_split(config.test_data)
train = _load_split(config.train_data)

model = WImpModel(config, vocab_words, feats)
model.setup()
model.train(train, dev)
model.evaluate(test)
model.interactive_shell(word_processor)
def _process_input(self, data_raw):
    """Vectorise four-way multiple-choice samples.

    The lower-cased context ``"C"`` must contain the marker tokens ``a>``,
    ``b>``, ``c>`` and ``d>`` in that order.  The tokens after each marker
    (up to the next marker) form one choice, cut after its first ``'.'`` as
    located by ``self._find_first``; everything before ``a>`` is the actual
    context.  Context, question and choices are passed word by word through
    ``utils.process_word(..., to_return="word2vec", silent=True)``.

    Returns:
        Tuple ``(inputs, questions, answers, choices, input_masks)`` of
        per-sample lists: stacked ``floatX`` context and question matrices,
        the answer as ``ord(x['A'][0]) - ord('A')``, four ``floatX`` arrays
        per sample, and int32 position arrays over the truncated context
        (all positions in ``'word'`` mode, ``'.'`` positions in
        ``'sentence'`` mode).

    Raises:
        AssertionError: if a marker is missing or the markers are out of order.
        Exception: if ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'``.
    """

    def split_words(text):
        return [tok for tok in text.lower().split(' ') if len(tok) > 0]

    def embed(words):
        return [
            utils.process_word(word=tok,
                               word2vec=self.word2vec,
                               vocab=self.vocab,
                               ivocab=self.ivocab,
                               word_vector_size=self.word_vector_size,
                               to_return="word2vec",
                               silent=True) for tok in words
        ]

    inputs, questions, choices, answers, input_masks = [], [], [], [], []

    for sample in data_raw:
        context_words = split_words(sample["C"])
        question_words = split_words(sample["Q"])

        # Positions of the four choice markers inside the context.
        marks = [
            self._find_first(context_words, marker)
            for marker in ('a>', 'b>', 'c>', 'd>')
        ]
        pa, pb, pc, pd = marks
        assert -1 not in (pa, pb, pc, pd) and pa < pb < pc < pd

        # Each choice runs from just after its marker to the next marker
        # (or to the end for the last one), then is cut after its first '.'.
        ends = marks[1:] + [None]
        option_words = []
        for start, stop in zip(marks, ends):
            span = context_words[start + 1:stop]
            option_words.append(span[:self._find_first(span, '.') + 1])

        context_words = context_words[:pa]

        context_matrix = np.vstack(embed(context_words)).astype(floatX)
        question_matrix = np.vstack(embed(question_words)).astype(floatX)
        option_arrays = [
            np.array(embed(words), dtype=floatX) for words in option_words
        ]

        inputs.append(context_matrix)
        questions.append(question_matrix)
        answers.append(ord(sample['A'][0]) - ord('A'))
        choices.append(option_arrays)

        # TODO: here we assume the answer is one word!
        mode = self.input_mask_mode
        if mode == 'word':
            mask = np.arange(len(context_words), dtype=np.int32)
        elif mode == 'sentence':
            stops = [pos for pos, tok in enumerate(context_words) if tok == '.']
            mask = np.array(stops, dtype=np.int32)
        else:
            raise Exception("invalid input_mask_mode")
        input_masks.append(mask)

    return inputs, questions, answers, choices, input_masks
def process_input_2(data, num_positive=700, num_negative=2100):
    """Build a shuffled, class-balanced set of (question, answer, label) triples.

    Row 0 of ``data`` is skipped.  For every other row, column 1 is the
    question text, column 5 the answer text and column 6 the label string.
    Rows labelled ``"1"`` and ``"0"`` are collected separately (rows with any
    other label are ignored), each group is shuffled, and the first
    ``num_positive`` positives followed by the first ``num_negative``
    negatives are merged and shuffled twice.

    Text is stripped, passed through ``repl``, lower-cased, split on single
    spaces and looked up word by word with
    ``utils.process_word(..., to_return="word2vec", silent=True)`` using the
    module-level ``word2vec``, ``vocab``, ``ivocab`` and ``word_vector_size``.

    Args:
        data: indexable sequence of rows with at least seven columns.
        num_positive: number of label-``"1"`` examples to keep.
        num_negative: number of label-``"0"`` examples to keep.

    Returns:
        Tuple ``(questions, answers, truth)`` converted with ``np.asarray``.

    Raises:
        IndexError: if a group holds fewer examples than requested.
    """

    def to_matrix(text):
        words = [w for w in repl(text.strip()).lower().split(' ') if len(w) > 0]
        return np.vstack([
            utils.process_word(word=w,
                               word2vec=word2vec,
                               vocab=vocab,
                               ivocab=ivocab,
                               word_vector_size=word_vector_size,
                               to_return="word2vec",
                               silent=True) for w in words
        ])

    questions1, answers1, truth1 = [], [], []
    questions0, answers0, truth0 = [], [], []

    # Row 0 is skipped, as in the original loop.
    for i in range(1, len(data)):
        lines = data[i]
        q = lines[1]
        a = lines[5]
        t = lines[6]

        if t == "0":
            bucket_q, bucket_a, bucket_t, label = questions0, answers0, truth0, 0
        elif t == "1":
            bucket_q, bucket_a, bucket_t, label = questions1, answers1, truth1, 1
        else:
            continue

        bucket_t.append(label)
        bucket_q.append(to_matrix(q))
        bucket_a.append(to_matrix(a))

    # The shuffle order (positives, negatives, then the merged set twice) is
    # kept so a seeded run consumes random state in the same order.
    questions1 = np.asarray(questions1)
    answers1 = np.asarray(answers1)
    truth1 = np.asarray(truth1)
    questions1, answers1, truth1 = shuffle(questions1, answers1, truth1)

    questions0 = np.asarray(questions0)
    answers0 = np.asarray(answers0)
    truth0 = np.asarray(truth0)
    questions0, answers0, truth0 = shuffle(questions0, answers0, truth0)

    questions = []
    answers = []
    truth = []
    # Explicit indexing (not slicing) so a shortage of examples raises
    # instead of silently changing the class balance.
    for i in range(0, num_positive):
        questions.append(questions1[i])
        answers.append(answers1[i])
        truth.append(truth1[i])
    for i in range(0, num_negative):
        questions.append(questions0[i])
        answers.append(answers0[i])
        truth.append(truth0[i])

    questions, answers, truth = shuffle(questions, answers, truth)
    questions, answers, truth = shuffle(questions, answers, truth)

    questions = np.asarray(questions)
    answers = np.asarray(answers)
    truth = np.asarray(truth)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print("processing data done ! ********************************")
    return questions, answers, truth