def getVocab(self, vocab_path, max_document_length, filter_h_pad):
    """
    Return the vocabulary processor, restoring it from disk on first use.

    The restored processor is cached on ``self.vocab_processor``; later calls
    return the cached object and do not read ``vocab_path`` again.

    :param vocab_path: path passed to ``MyVocabularyProcessor.restore``.
    :param max_document_length: document length before padding is removed.
    :param filter_h_pad: amount subtracted from ``max_document_length`` when
        the processor is constructed.
    :return: the cached (or freshly restored) vocabulary processor.
    """
    # ``is None`` is an identity test and cannot be hijacked by a custom
    # ``__eq__``; ``getattr`` covers instances on which the attribute was
    # never assigned, which previously raised AttributeError.
    if getattr(self, "vocab_processor", None) is None:
        print('locading vocab')
        loader = MyVocabularyProcessor(max_document_length - filter_h_pad, min_frequency=0)
        self.vocab_processor = loader.restore(vocab_path)
    return self.vocab_processor
def getAquaintTestDataSet(self, data_path, vocab_path, max_document_length):
    """
    Load the Aquaint data and encode both text columns with a saved vocabulary.

    :param data_path: path handed to ``self.getAquaintData``.
    :param vocab_path: location of the vocabulary to restore.
    :param max_document_length: length passed to ``MyVocabularyProcessor``.
    :return: tuple ``(x1, x2, y)`` where ``x1``/``x2`` are the transformed
        texts and ``y`` is returned exactly as ``getAquaintData`` produced it.
    """
    first_texts, second_texts, y = self.getAquaintData(data_path)

    # The vocabulary is restored from disk here, not rebuilt.
    restored = MyVocabularyProcessor(max_document_length, min_frequency=0).restore(vocab_path)
    print (len(restored.vocabulary_))

    x1 = np.asarray(list(restored.transform(first_texts)))
    x2 = np.asarray(list(restored.transform(second_texts)))

    # Drop the processor before returning and trigger a collection pass.
    del restored
    gc.collect()
    return x1, x2, y
def getPCADataSet(self, data_path, vocab_path, max_document_length):
    """
    Load the PCA texts and encode them with a saved vocabulary.

    :param data_path: path handed to ``self.getJsonPCAData``.
    :param vocab_path: location of the vocabulary to restore.
    :param max_document_length: length passed to ``MyVocabularyProcessor``.
    :return: tuple ``(x1, x1, ones)``; the encoded texts are returned twice
        and the third item is ``np.ones(len(x1))``.
    """
    x1_temp = self.getJsonPCAData(data_path)

    # Restore (not rebuild) the vocabulary saved at training time.
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    # Call form of print: identical output on Python 2, valid on Python 3.
    print(len(vocab_processor.vocabulary_))

    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))

    # Release the processor before returning.
    del vocab_processor
    gc.collect()
    return x1, x1, np.ones(len(x1))
def getTestDataSet_infer(self, x1_infer, x2_infer, vocab_path, max_document_length):
    """
    Encode two in-memory text collections with a saved vocabulary.

    :param x1_infer: first collection of texts; converted with ``np.asarray``.
    :param x2_infer: second collection of texts; converted with ``np.asarray``.
    :param vocab_path: location of the vocabulary to restore.
    :param max_document_length: length passed to ``MyVocabularyProcessor``.
    :return: tuple ``(x1, x2)`` of transformed inputs.
    """
    left_texts = np.asarray(x1_infer)
    right_texts = np.asarray(x2_infer)

    # The vocabulary is restored from disk here, not rebuilt.
    restored = MyVocabularyProcessor(max_document_length, min_frequency=0).restore(vocab_path)
    print ('len vocab: ', len(restored.vocabulary_))

    x1 = np.asarray(list(restored.transform(left_texts)))
    x2 = np.asarray(list(restored.transform(right_texts)))

    # Release the processor before returning.
    del restored
    gc.collect()
    return x1, x2
def getTestDataSet(self, data_path, vocab_path, max_document_length):
    """
    Load labelled test data and encode both text columns with a saved vocabulary.

    :param data_path: path handed to ``self.getTsvTestData``.
    :param vocab_path: location of the vocabulary to restore.
    :param max_document_length: length passed to ``MyVocabularyProcessor``.
    :return: tuple ``(x1, x2, y)`` where ``x1``/``x2`` are the transformed
        texts and ``y`` is returned exactly as ``getTsvTestData`` produced it.
    """
    x1_temp, x2_temp, y = self.getTsvTestData(data_path)

    # Restore (not rebuild) the vocabulary saved at training time.
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    # Call form of print: identical output on Python 2, valid on Python 3.
    print(len(vocab_processor.vocabulary_))

    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))

    # Release the processor before returning.
    del vocab_processor
    gc.collect()
    return x1, x2, y
def getTestDataSet(self, data_path, ent_path, vocab_path, max_document_length):
    """
    Load labelled test data together with entity and additional features.

    :param data_path: path handed to ``self.getTsvTestData``.
    :param ent_path: path handed to ``self.getEntData``.
    :param vocab_path: location of the vocabulary to restore.
    :param max_document_length: length passed to ``MyVocabularyProcessor`` and
        to ``self.getEntData``.
    :return: tuple ``(x1, x2, ent_x1, ent_x2, y, x1_temp, x2_temp,
        add_fea_test)``: encoded texts, entity data, labels, the raw texts,
        and the output of ``self.getAdditionalFeature``.
    """
    x1_temp, x2_temp, y = self.getTsvTestData(data_path)
    ent_x1, ent_x2 = self.getEntData(x1_temp, x2_temp, ent_path, max_document_length)
    add_fea_test = self.getAdditionalFeature(x1_temp, x2_temp)

    # Restore (not rebuild) the vocabulary saved at training time.
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    # Call form of print: identical output on Python 2, valid on Python 3.
    print(len(vocab_processor.vocabulary_))

    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))

    # Release the processor before returning.
    del vocab_processor
    gc.collect()
    return x1, x2, ent_x1, ent_x2, y, x1_temp, x2_temp, add_fea_test
def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size, is_char_based, vocab_path):
    """
    Load training pairs, encode them and split them into train and dev sets.

    The vocabulary is restored from ``vocab_path`` when that path exists;
    otherwise it is fitted on both text columns and saved to ``vocab_path``.
    Shuffling uses a fixed NumPy seed (131), so the split is repeatable.

    :param training_paths: handed to ``self.getTsvData`` or
        ``self.getTsvDataCharBased``.
    :param max_document_length: length passed to ``MyVocabularyProcessor``.
    :param percent_dev: percentage of the samples placed in the dev split.
    :param batch_size: used only to compute the returned batch count.
    :param is_char_based: selects the character-based loader and is forwarded
        to ``MyVocabularyProcessor``.
    :param vocab_path: where the vocabulary is loaded from or written to.
    :return: ``(train_set, dev_set, vocab_processor, sum_no_of_batches)`` where
        each set is an ``(x1, x2, y)`` tuple.
    """
    if is_char_based:
        x1_text, x2_text, y = self.getTsvDataCharBased(training_paths)
    else:
        x1_text, x2_text, y = self.getTsvData(training_paths)

    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0, is_char_based=is_char_based)
    if os.path.exists(vocab_path):
        print("Loading vocab from: {}".format(vocab_path))
        vocab_processor = vocab_processor.restore(vocab_path)
    else:
        # Build the vocabulary from both columns, then persist it.
        print("Building vocabulary")
        vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
        print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
        vocab_processor.save(vocab_path)

    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))

    # Randomly shuffle data (fixed seed keeps the split reproducible).
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Negative offset counted from the end of the shuffled arrays. int() keeps
    # it usable as a slice index when percent_dev is given as a float.
    # NOTE: percent_dev == 0 gives dev_idx == 0, i.e. an empty training split.
    dev_idx = int(-1 * len(y_shuffled) * percent_dev // 100)
    del x1
    del x2

    # Split train/test set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))

    sum_no_of_batches = len(y_train) // batch_size
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches
def getWords(self, word1, word2, vocab_path, max_document_length):
    """
    Encode a single pair of words with a saved vocabulary.

    Both words are lower-cased before being transformed.

    :param word1: first word.
    :param word2: second word.
    :param vocab_path: location of the vocabulary to restore.
    :param max_document_length: length passed to ``MyVocabularyProcessor``.
    :return: tuple ``(x1, x2, label)`` where ``label`` is ``np.asarray(-1)``.
    """
    first = np.asarray([word1.lower()])
    second = np.asarray([word2.lower()])

    # The vocabulary is restored from disk here, not rebuilt.
    restored = MyVocabularyProcessor(max_document_length, min_frequency=0).restore(vocab_path)

    x1 = np.asarray(list(restored.transform(first)))
    x2 = np.asarray(list(restored.transform(second)))

    # Release the processor before returning.
    del restored
    gc.collect()
    return x1, x2, np.asarray(-1)
def toVocabularyIndexVector(self, datax1, datax2, vocab_path, max_document_length):
    """
    Transform two word lists into vocabulary-index vectors.

    Each input is passed through ``preprocess_arr`` and then through the
    restored vocabulary processor.

    :param datax1: first collection of texts.
    :param datax2: second collection of texts.
    :param vocab_path: location of the vocabulary to restore.
    :param max_document_length: length passed to ``MyVocabularyProcessor``.
    :return: tuple ``(x1, x2)`` of transformed inputs.
    """
    # The vocabulary is restored from disk here, not rebuilt.
    restored = MyVocabularyProcessor(max_document_length, min_frequency=0).restore(vocab_path)
    print(len(restored.vocabulary_))

    cleaned_first = preprocess_arr(datax1)
    cleaned_second = preprocess_arr(datax2)
    x1 = np.asarray(list(restored.transform(cleaned_first)))
    x2 = np.asarray(list(restored.transform(cleaned_second)))

    # Release the processor before returning.
    del restored
    gc.collect()
    return x1, x2
class InputHelper():
    """
    Load tab-separated sentence pairs and serve them as encoded batches.

    On construction the helper either builds a token dictionary from the
    input file or loads one from ``useWords.model.vec`` in ``data_dir``; when
    ``is_train`` is true it also encodes the input file and prepares
    sequential batches for ``next_batch``.
    """

    def __init__(self, data_dir, input_file, batch_size, sequence_length, is_train=True):
        """
        :param data_dir: directory holding the vocabulary and input files.
        :param input_file: input file name, relative to ``data_dir``.
        :param batch_size: number of samples returned by ``next_batch``.
        :param sequence_length: maximum sequence length.
        :param is_train: when true, batches are created immediately.
        """
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        # Defined up front so the attribute exists before batches are built.
        self.vocab_processor = None

        vocab_file = os.path.join(data_dir, 'useWords.model.vec')
        input_file = os.path.join(data_dir, input_file)

        if not os.path.exists(vocab_file):
            print('readling train file')
            self.preprocess(input_file, vocab_file)
        else:
            print('loading vocab file')
            self.load_vocab(vocab_file)

        if is_train:
            self.create_batches(input_file)
            self.reset_batch()

    def preprocess(self, input_file, vocab_file, min_freq=2):
        """
        Build a token-to-index dictionary from ``input_file`` and pickle it.

        Tokens seen fewer than ``min_freq`` times are dropped and ``'<pad>'``
        is appended as the last entry.

        :param input_file: tab-separated file with three fields per line.
        :param vocab_file: destination of the pickled dictionary.
        :param min_freq: minimum token frequency to keep a token.
        """
        token_freq = defaultdict(int)
        with open(input_file) as handle:
            for line in handle:
                # NOTE(review): three fields are expected here while
                # create_batches unpacks four from its file - confirm format.
                seq1, seq2, label = line.rstrip().split('\t')
                for token in (seq1 + ' ' + seq2).split(' '):
                    token_freq[token] += 1

        token_list = [w for w in token_freq if token_freq[w] >= min_freq]
        token_list.append('<pad>')
        token_dict = {token: index for index, token in enumerate(token_list)}

        # Pickle streams must be written to a binary-mode file.
        with open(vocab_file, 'wb') as f:
            cPickle.dump(token_dict, f)

        self.token_dictionary = token_dict
        self.vocab_size = len(self.token_dictionary)

    def load_vocab(self, vocab_file):
        """
        Read a whitespace-separated vector file into ``self.token_dictionary``.

        The first field of each line is the key; the remaining fields are
        stored as a NumPy array of strings. Blank lines are skipped.

        :param vocab_file: path of the file to read.
        """
        self.token_dictionary = dict()
        with open(vocab_file) as handle:
            for line in handle:
                parts = line.strip().split()
                if not parts:
                    continue
                token = parts[0]
                # Only byte strings need decoding (the Python 2 case).
                if isinstance(token, bytes):
                    token = token.decode('utf-8')
                self.token_dictionary[token] = np.asarray(parts[1:])

    def text_to_array(self, text, is_clip=True):
        """
        Segment ``text[0]`` with ``jieba`` and keep the first regex match of each word.

        :param text: indexable object; only ``text[0]`` is tokenised.
        :param is_clip: when true, truncate to ``self.sequence_length`` tokens.
        :return: list of tokens.
        """
        words = [w for w in jieba.cut(text[0]) if w.strip()]
        words1 = []
        for w in words:
            # Run the regex once per word instead of twice.
            matches = TOKENIZER_RE.findall(w)
            if matches:
                words1.append(matches[0])
        if is_clip:
            words1 = words1[:self.sequence_length]
        return words1

    def getTsvData(self, filepath):
        """
        Read labelled training pairs from a tab-separated file.

        Columns 1 and 2 hold the texts and column 3 the integer label. The
        two texts are swapped at random with probability one half. Lines with
        fewer than four fields are skipped.

        :param filepath: path of the file to read.
        :return: ``(x1, x2, y)`` as NumPy arrays.
        """
        print("Loading training data from " + filepath)
        x1 = []
        x2 = []
        y = []
        with open(filepath) as handle:
            for line in handle:
                fields = line.strip().split("\t")
                # fields[3] is read below, so four columns are required.
                if len(fields) < 4:
                    continue
                if random() > 0.5:
                    x1.append(fields[1])
                    x2.append(fields[2])
                else:
                    x1.append(fields[2])
                    x2.append(fields[1])
                y.append(int(fields[3]))
        return np.asarray(x1), np.asarray(x2), np.asarray(y)

    def getTsvTestData(self, filepath):
        """
        Read text pairs (columns 1 and 2) from a tab-separated file.

        Lines with fewer than three fields are skipped.

        :param filepath: path of the file to read.
        :return: ``(x1, x2)`` as NumPy arrays.
        """
        print("Loading testing/labelled data from " + filepath)
        x1 = []
        x2 = []
        with open(filepath) as handle:
            for line in handle:
                fields = line.strip().split("\t")
                # fields[2] is read below, so three columns are required.
                if len(fields) < 3:
                    continue
                x1.append(fields[1])
                x2.append(fields[2])
        return np.asarray(x1), np.asarray(x2)

    def getTestDataSet(self, data_path, vocab_path, max_document_length):
        """
        Load test pairs and encode them with a saved vocabulary.

        The restored processor is kept on ``self.vocab_processor``.

        :param data_path: path handed to ``getTsvTestData``.
        :param vocab_path: location of the vocabulary to restore.
        :param max_document_length: length passed to ``MyVocabularyProcessor``.
        :return: ``(x1, x2)`` of transformed inputs.
        """
        x1_temp, x2_temp = self.getTsvTestData(data_path)

        self.vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
        self.vocab_processor = self.vocab_processor.restore(vocab_path)
        print(len(self.vocab_processor.vocabulary_))

        x1 = np.asarray(list(self.vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(self.vocab_processor.transform(x2_temp)))
        return x1, x2

    def batch_iter(self, data, batch_size, num_epochs, shuffle=True):
        """
        Generates a batch iterator for a dataset.

        :param data: samples; converted with ``np.asarray``.
        :param batch_size: maximum number of samples per batch.
        :param num_epochs: number of passes over the data.
        :param shuffle: reshuffle the data at the start of every epoch.
        :return: generator yielding slices of the (shuffled) data.
        """
        data = np.asarray(data)
        print(data)
        print(data.shape)
        data_size = len(data)
        # Ceiling division: no trailing empty batch when batch_size divides
        # data_size exactly.
        num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
        for epoch in range(num_epochs):
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    def padding_seq(self, seq_array, padding_index):
        """
        Pad ``seq_array`` in place with ``padding_index`` up to ``self.sequence_length``.

        Sequences that are already long enough are left untouched.
        """
        missing = self.sequence_length - len(seq_array)
        if missing > 0:
            seq_array.extend([padding_index] * missing)

    def create_batches(self, text_file):
        """
        Read four-column training data, fit a vocabulary and store shuffled arrays.

        Sets ``self.vocab_processor``, ``self.num_samples``,
        ``self.num_batches``, ``self.x1``, ``self.x2`` and ``self.y``.

        :param text_file: tab-separated file; the first column is ignored,
            followed by the two texts and an integer label.
        """
        x1 = []
        x2 = []
        y = []
        with open(text_file) as handle:
            for line in handle:
                _, seq1, seq2, label = line.rstrip().split('\t')
                x1.append(seq1)
                x2.append(seq2)
                y.append(int(label))

        self.vocab_processor = MyVocabularyProcessor(self.sequence_length, min_frequency=0)
        self.vocab_processor.fit_transform(np.concatenate((x2, x1), axis=0))
        x1_1 = np.asarray(list(self.vocab_processor.transform(x1)))
        x2_1 = np.asarray(list(self.vocab_processor.transform(x2)))
        y = np.array(y)

        self.num_samples = len(y)
        # Floor division keeps the batch count an integer on Python 3 too.
        self.num_batches = self.num_samples // self.batch_size

        indices = np.random.permutation(self.num_samples)
        self.x1 = x1_1[indices]
        self.x2 = x2_1[indices]
        self.y = y[indices]

    def next_batch(self):
        """
        Return the next ``(x1, x2, y)`` batch and advance the read pointer.

        ``self.eos`` is set once the pointer would reach or pass the last
        sample; in that case the pointer is left where it is.
        """
        begin = self.pointer
        end = self.pointer + self.batch_size
        x1_batch = self.x1[begin:end]
        x2_batch = self.x2[begin:end]
        y_batch = self.y[begin:end]

        if end >= self.num_samples:
            self.eos = True
        else:
            self.pointer = end
        return x1_batch, x2_batch, y_batch

    def reset_batch(self):
        """Rewind the read pointer and clear the end-of-data flag."""
        self.pointer = 0
        self.eos = False
def getVocab(self, vocab_path, max_document_length, filter_h_pad):
    """
    Return the vocabulary processor, loading it from ``vocab_path`` once.

    After the first successful call the processor is cached on
    ``self.vocab_processor`` and returned directly.

    :param vocab_path: path passed to ``MyVocabularyProcessor.restore``.
    :param max_document_length: document length before padding is removed.
    :param filter_h_pad: amount subtracted from ``max_document_length`` when
        the processor is constructed.
    :return: the cached (or freshly restored) vocabulary processor.
    """
    cached = getattr(self, "vocab_processor", None)
    # Identity comparison with None; a missing attribute counts as "not loaded"
    # instead of raising AttributeError.
    if cached is None:
        print('locading vocab')
        loader = MyVocabularyProcessor(max_document_length - filter_h_pad, min_frequency=0)
        cached = loader.restore(vocab_path)
        self.vocab_processor = cached
    return cached
tf.flags.DEFINE_string(
    "vocab_filepath",
    "runs/1541748108/checkpoints/vocab",
    "Load training time vocabulary (Default: None)",
)
tf.flags.DEFINE_string(
    "model",
    "runs/1541748108/checkpoints/model-33000",
    "Load trained model checkpoint (Default: None)",
)

# Misc Parameters
tf.flags.DEFINE_boolean(
    "allow_soft_placement",
    True,
    "Allow device soft device placement",
)
tf.flags.DEFINE_boolean(
    "log_device_placement",
    False,
    "Log placement of ops on devices",
)

FLAGS = tf.flags.FLAGS

# Restore the vocabulary saved at training time (document length fixed at 30).
vocab_processor = MyVocabularyProcessor(30, min_frequency=0)
vocab_processor = vocab_processor.restore(FLAGS.vocab_filepath)


def char2vec(arr):
    """Encode ``arr`` with the module-level vocabulary processor."""
    encoded = list(vocab_processor.transform(arr))
    return np.asarray(encoded)


def get_test_data_set(text):
    """Encode ``text`` repeated once per entry of ``abbr_vec_arr``."""
    repeated = np.full([len(abbr_vec_arr)], text)
    return char2vec(repeated)


def batch_iter(data, batch_size):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.asarray(data)