def _get_extended_questions():
    with open('data/extend/extra_questions.txt', 'r', encoding='utf8') as f:
        raw = f.read().strip()
    question_frames = raw.split(
        "===================================================================================================="
    )
    question_frames = [qf.strip() for qf in question_frames[:-1]]

    def process(question_frame):
        # return the original question and its permutations
        lines = question_frame.split('\n')
        lines = [l.strip() for l in lines]
        if lines[0][:2] == "No":
            return None
        # drop the literal "Permutations of '" prefix and the trailing "':"
        original = lines[0][len("Permutations of '"):-2]
        permutations = [l for l in lines[1:] if l]
        return original, permutations

    pre_process = PreProcess()
    question_dict = {}
    for qf in question_frames:
        tmp = process(qf)
        if tmp:
            o, p = tmp
            k = " ".join(pre_process.process(o, remove_stop_words=False))
            question_dict[k] = [
                " ".join(pre_process.process(i, remove_stop_words=False))
                for i in p
            ]
    return question_dict
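# Hedged note on the expected file layout (inferred from the parser above, not from the
# data itself): each frame in extra_questions.txt appears to look like
#
#   Permutations of 'original question here':
#   permuted question 1
#   permuted question 2
#
# Frames are separated by a line of '=' characters, and frames whose first line starts
# with "No" (i.e. no permutations were generated) are skipped.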
def pre_process(self):
    # pre-process the data to fit the algorithm
    if self.processor is not None:
        # if we already ran this, ask the user whether to run it again
        result = tkMessageBox.askquestion(
            message="Pre-processing has already been run.\nAre you sure you want to run it again?",
            icon='warning',
            title=self.head_title)
        if result != 'yes':
            return
        self.processor = None
        self.is_pre_processed = False
    try:
        # verify the file can be pre-processed
        self.file_path = self.file_path_text.get()
        processor = PreProcess(self.file_path)
        if processor.verifications() is False:
            tkMessageBox.showerror(title=self.head_title,
                                   message=processor.error_message)
            return
        # process the data
        processor.pre_process()
        tkMessageBox.showinfo(title=self.head_title,
                              message='Preprocessing completed successfully')
        self.processor = processor
        self.is_pre_processed = True
    except Exception as err:
        template = "An exception of type {0} occurred. Arguments: {1}"
        message = template.format(type(err).__name__, err)
        print_exc(file=stdout)
        tkMessageBox.showerror(title=self.head_title, message=message)
def run(self):
    """Read the command-line arguments to decide whether to run
    pre-processing, train the model, or both."""
    config_json = self._open_config()
    my_parser = argparse.ArgumentParser(
        description='Model to classify whether audio is speech or music')
    my_parser.add_argument('-m', '--model', required=False, action='store_true')
    my_parser.add_argument('-d', '--data', required=False, action='store_true')
    my_parser.add_argument('-y', '--youtube', required=False, action='store_true')
    args = my_parser.parse_args()

    if args.data:
        preProcess = PreProcess(config_json)
        preProcess.run()
    if args.model:
        mlpClassifier = MlpClassifier(config_json)
        mlpClassifier.run()
    if args.youtube:
        preProcessYoutube = PreProcessYoutube(config_json)
        preProcessYoutube.run()
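# Hedged usage sketch: assuming this run() is called from a script entry point (the
# script name below is hypothetical), the flags defined above can be combined:
#
#   python main.py -d        # run PreProcess only
#   python main.py -d -m     # pre-process the data, then train the MLP classifier
#   python main.py -y        # run the YouTube pre-processing step only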
def prepare_dataset(self):
    dat_obj = PreProcess()
    dat_obj.prepare_dataset()
    test_df = dat_obj.test_df2
    test_dataset = SentimentDataset(test_df, max_length=100, mode='test')
    self.test_loader = DataLoader(test_dataset,
                                  batch_size=1,
                                  num_workers=0,
                                  shuffle=True)
def prepare_dataset(self):
    dat_obj = PreProcess()
    dat_obj.prepare_dataset()
    train_df = dat_obj.train_df
    val_df = dat_obj.val_df
    test_df = dat_obj.test_df1
    train_dataset = SentimentDataset(train_df, max_length=100)
    val_dataset = SentimentDataset(val_df, max_length=100)
    test_dataset = SentimentDataset(test_df, max_length=100, mode='test')
    self.train_loader = DataLoader(train_dataset, batch_size=32, num_workers=0, shuffle=True)
    self.val_loader = DataLoader(val_dataset, batch_size=32, num_workers=0, shuffle=True)
    self.test_loader = DataLoader(test_dataset, batch_size=32, num_workers=0, shuffle=True)
def __init__(self, model_name, dataset):
    self.model_name = TRAINED_MODELS + model_name + "/"
    self.dataset = dataset
    self.data = Dataset(self.dataset)
    self.data.tfidf_compressor.train()
    self.model = self._load_model()
    self.pre_process = PreProcess()
    idx = list(self.data.train_data.keys())
    idx.sort()
    self.train_c_word_set, self.train_c = self.data.get_all_c_word_set(
        self.data.train_data)
    self.all_train_contexts = np.array(
        [self.data.train_data[i]['context'] for i in idx])
    self.related_questions = np.array(
        [self.data.train_data[i]['qs'] for i in idx])
def _convert_data(self, data_obj):
    pre_process = PreProcess()
    train_data = {}
    dev_data = {}
    idx = 0
    for d in data_obj:
        # custom pre-process: drop the literal "Answer:" prefix
        # (str.strip would remove individual characters, not the prefix)
        if d['answer'].startswith("Answer:"):
            d['answer'] = d['answer'][len("Answer:"):]
        context = " ".join(pre_process.process(d['answer'], url_norm=True))
        if not context:
            continue
        original_question = " ".join(
            pre_process.process(d['question'], remove_stop_words=False))
        extended_questions = self.extend_question_dict.get(original_question, [])
        if extended_questions:
            # split train and dev by questions
            train_questions, dev_questions = train_test_split(
                extended_questions, test_size=0.1, random_state=42)
            train_data[idx] = {
                'context': d['answer'],
                'c': context,
                'qs': [original_question] + train_questions
            }
            dev_data[idx] = {
                'context': d['answer'],
                'c': context,
                'qs': dev_questions
            }
        else:
            train_data[idx] = {
                'context': d['answer'],
                'c': context,
                'qs': [original_question]
            }
        idx += 1
    return train_data, dev_data
def _convert_data(data_obj):
    pre_process = PreProcess()
    data = {}
    idx = 0
    for d in data_obj:
        # custom pre-process: drop the literal "Answer:" prefix and normalize whitespace
        if d['answer'].startswith("Answer:"):
            d['answer'] = d['answer'][len("Answer:"):]
        d['answer'] = re.sub(r"\s+", " ", d['answer'])
        context = " ".join(pre_process.process(d['answer'], url_norm=True))
        question = " ".join(
            pre_process.process(d['question'], remove_stop_words=False))
        if not (d['answer'] and context and question):
            continue
        data[idx] = {
            'context': d['answer'],
            'c': context,
            'qs': [question]
        }
        idx += 1
    return data
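# Hedged usage sketch: data_obj is assumed to be an iterable of QA records with
# 'question' and 'answer' keys (inferred from the field accesses above); the example
# record below is made up for illustration.
example_data_obj = [{
    'question': 'How do I reset my password?',
    'answer': 'Answer: Go to the account settings page and choose "Reset password".'
}]
converted = _convert_data(example_data_obj)
# converted maps an integer index to {'context', 'c', 'qs'} entries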
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # connect to MongoDB and fetch the documents to classify
    corpus_collection = MongoClient(
        "mongodb://39.108.180.114:27017")["ennews"]["news"]
    reviews_cursor = corpus_collection.find(no_cursor_timeout=True)

    # pre-process the data
    PreProcess(corpus_collection, reviews_cursor).data_filter()

    # classify
    classify = Classify(corpus_collection, reviews_cursor)
    classify.run()
    reviews_cursor.close()
def main():
    preprocess = PreProcess()
    X_train = preprocess.X_train
    y_train = preprocess.y_train
    X_test = preprocess.X_test
    y_test = preprocess.y_test

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    test_data = scaler.transform(preprocess.test)

    rmse_val = []
    for k in range(30):
        k = k + 1
        model = neighbors.KNeighborsRegressor(n_neighbors=k)
        model.fit(X_train, y_train)                      # fit the model
        pred = model.predict(X_test)                     # make prediction on test set
        error = sqrt(mean_squared_error(y_test, pred))   # calculate RMSE
        rmse_val.append(error)                           # store RMSE values
        print('RMSE value for k= ', k, 'is:', error)
    curve = pd.DataFrame(rmse_val)                       # elbow curve
    curve.plot()
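# Hedged follow-up sketch (would sit at the end of main() above): the loop only prints
# the RMSE per k, so one plausible next step is to refit with the k that minimised RMSE
# and predict on the unlabeled test_data prepared above. This is an assumption, not part
# of the original script.
best_k = rmse_val.index(min(rmse_val)) + 1   # k values start at 1
best_model = neighbors.KNeighborsRegressor(n_neighbors=best_k)
best_model.fit(X_train, y_train)
test_pred = best_model.predict(test_data)    # predictions for the unlabeled test split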
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    dictionary_path = "models/dictionary.dict"
    corpus_path = "models/corpus.lda-c"
    lda_model_path = "models/lda_model.lda"
    # topics = ["World", "Sport", "Business", "Technology", "Lifestyle", "Health"]
    lda_num_topics = 6

    # connect to MongoDB
    corpus_collection = MongoClient(
        "mongodb://39.108.180.114:27017")["ennews"]["news"]
    reviews_cursor = corpus_collection.find(no_cursor_timeout=True)

    # pre-process the data
    PreProcess(corpus_collection, reviews_cursor).data_filter()

    # build the dictionary
    dictionary = Dictionary(reviews_cursor, dictionary_path).build()

    # serialize the corpus
    Corpus(reviews_cursor, dictionary, corpus_path).serialize()
    reviews_cursor.close()

    # train the LDA model
    lda_model = Train.run(lda_model_path, corpus_path, lda_num_topics, dictionary)

    # print the topics
    # keep a copy of the original stdout route
    stdout_backup = sys.stdout
    # define the log file that receives the log info
    log_file = open(r".\lda_topics.log", "w")
    # redirect print output to the log file
    sys.stdout = log_file
    lda_model.print_topics()
    log_file.close()
    # restore the output to the initial pattern
    sys.stdout = stdout_backup
def main():
    preprocess = PreProcess()
    lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

    X_train = preprocess.X_train
    y_train = preprocess.y_train
    X_test = preprocess.X_test
    y_test = preprocess.y_test

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    test_data = scaler.transform(preprocess.test)

    print("")
    best_model = None
    max_testing_score = 0.0
    for learning_rate in lr_list:
        n_estimators = 800
        max_depth = 3
        gb_clf = GradientBoostingRegressor(n_estimators=n_estimators,
                                           learning_rate=learning_rate,
                                           min_samples_split=20,
                                           max_depth=max_depth,
                                           random_state=0)
        gb_clf.fit(X_train, y_train.ravel())

        print("Learning rate: ", learning_rate)
        # note: score() returns R^2 for regressors, not classification accuracy
        print("R^2 score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
        testing_score = gb_clf.score(X_test, y_test)
        print("R^2 score (validation): {0:.3f}".format(testing_score))
        if testing_score > max_testing_score:
            best_model = gb_clf
            max_testing_score = testing_score

    prediction = best_model.predict(test_data)
    filename = "predicted_gb_" + str(n_estimators) + "_" + str(
        max_depth) + "_" + str(learning_rate) + ".csv"
    format_to_csv(preprocess.test_instance, prediction, filename)
    print("CSV created")
def _get_extended_questions(self):
    with open(DATA + self.dataset + "/extra_questions.txt", 'r', encoding='utf8') as f:
        raw = f.read().strip()
    question_frames = raw.split(
        "===================================================================================================="
    )
    question_frames = [qf.strip() for qf in question_frames[:-1]]

    def process(question_frame):
        # return the original question and its permutations
        lines = question_frame.split('\n')
        lines = [l.strip() for l in lines]
        if lines[0][:2] == "No":
            return None
        # drop the literal "Permutations of '" prefix and the trailing "':"
        original = lines[0][len("Permutations of '"):-2]
        permutations = [l for l in lines[1:] if l]
        return original, permutations

    pre_process = PreProcess()
    question_dict = {}
    t = Timer()
    for qf in question_frames:
        tmp = process(qf)
        if tmp:
            t.start("", verbal=False)
            o, p = tmp
            k = " ".join(pre_process.process(o, remove_stop_words=False))
            question_dict[k] = [
                " ".join(pre_process.process(i, remove_stop_words=False))
                for i in p
            ]
            # select the most diverse question set
            self.tf_idf.train([k] + question_dict[k])
            del_num = len(question_dict[k]) // self.top_k
            if del_num == 0:
                t.remaining_time(t.stop(verbal=False), len(question_frames))
                continue
            selected = []
            while question_dict[k]:
                indices = self.tf_idf.distance(k, question_dict[k])
                q = question_dict[k].pop(indices[0])
                selected.append(q)
                if not question_dict[k]:
                    break
                # drop the del_num permutations closest to the one just selected
                close_q = self.tf_idf.distance(q, question_dict[k])[::-1][:del_num]
                question_dict[k] = [
                    question_dict[k][i] for i in range(len(question_dict[k]))
                    if i not in close_q
                ]
            question_dict[k] = selected
            t.remaining_time(t.stop(verbal=False), len(question_frames))
    return question_dict
def run(self, num_kernels=[25, 25], kernel_sizes=[(11, 11), (5, 5)],
        batch_size=256, epochs=100000, optimizer='RMSprop'):
    optimizerData = {}
    optimizerData['learning_rate'] = 0.0005 / 2
    optimizerData['rho'] = 0.9
    optimizerData['epsilon'] = 1e-2
    optimizerData['momentum'] = 0.9

    print '... Loading data'

    # load in and process data
    if data_size == 'large':
        preProcess = PreProcess()
        data = preProcess.run()
    elif data_size == 'medium':
        preProcess = Medium()
        data = preProcess.run()
    elif data_size == 'small':
        preProcess = Small()
        data = preProcess.run()
    else:
        print 'data_size must be small, medium or large.'
        exit()

    train_set_x, train_set_y = data[0], data[3]
    valid_set_x, valid_set_y = data[1], data[4]
    test_set_x, test_set_y = data[2], data[5]

    train_set_x = theano.tensor._shared(train_set_x, borrow=True)
    valid_set_x = theano.tensor._shared(valid_set_x, borrow=True)
    train_set_y = theano.tensor._shared(train_set_y, borrow=True)
    valid_set_y = theano.tensor._shared(valid_set_y, borrow=True)
    test_set_x = theano.tensor._shared(test_set_x, borrow=True)
    test_set_y = theano.tensor._shared(test_set_y, borrow=True)

    print '... Initializing network'

    # training parameters
    self.n_sports = 500

    # print error if batch size is too large
    if valid_set_y.get_value(borrow=True).size < batch_size:
        print 'Error: Batch size is larger than size of validation set.'

    # compute batch sizes for train/test/validation
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # symbolic variables
    x = T.matrix('x')   # input image data
    y = T.ivector('y')  # input label data

    self.model(batch_size, num_kernels, kernel_sizes, x, y)

    # Initialize parameters and functions
    cost = self.layer3.negative_log_likelihood(y)  # Cost function
    params = self.params                           # List of parameters
    grads = T.grad(cost, params)                   # Gradient
    index = T.lscalar()                            # Index

    # Initialize optimizer
    updates = self.init_optimizer(optimizer, cost, params, optimizerData)

    # Train function
    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Validation function
    validate_model = theano.function(
        [index],
        self.layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Test function
    test_model = theano.function(
        [index],
        self.layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    def solve():
        costs = []
        for i in xrange(n_train_batches):
            costs.append(train_model(i))
        return costs

    def shuffle(train_set_x, train_set_y):
        # print train_set_x.get_value(borrow=True).shape[0]
        rand = np.random.permutation(
            range(train_set_x.get_value(borrow=True).shape[0]))
        train_set_x.set_value(train_set_x.get_value(borrow=True)[rand], borrow=True)
        train_set_y.set_value(train_set_y.get_value(borrow=True)[rand], borrow=True)
        return train_set_x, train_set_y, train_model

    # Solver
    try:
        print '... Solving'
        start_time = time.time()
        for epoch in range(epochs):
            t1 = time.time()
            train_set_x, train_set_y, train_model = shuffle(train_set_x, train_set_y)
            costs = solve()
            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
            t2 = time.time()
            print "Epoch {} NLL {:.2} %err in validation set {:.1%} Time (epoch/total) {:.2}/{:.2} mins".format(
                epoch + 1, np.mean(costs), np.mean(validation_losses),
                (t2 - t1) / 60., (t2 - start_time) / 60.)
            # f = open('workfile', 'r+')
            # f.write("Epoch {} NLL {:.2} %err in validation set {:.1%} Time (epoch/total) {:.2}/{:.2} mins".format(epoch + 1, np.mean(costs), np.mean(validation_losses), (t2 - t1) / 60., (t2 - start_time) / 60.))
            # f.close()
            with open("workfile_batch_c", "a") as myfile:
                myfile.write(
                    "Epoch {} NLL {:.2} %err in validation set {:.1%} Time (epoch/total) {:.2}/{:.2} mins \n"
                    .format(epoch + 1, np.mean(costs), np.mean(validation_losses),
                            (t2 - t1) / 60., (t2 - start_time) / 60.))
            if epoch % 10 == 0:
                test_errors = [test_model(i) for i in range(n_test_batches)]
                print "test errors: {:.1%}".format(np.mean(test_errors))
                with open("workfile_batch_c2", "a") as myfile:
                    myfile.write("test errors: {:.1%}\n".format(np.mean(test_errors)))
    except KeyboardInterrupt:
        print '... Exiting solver'

    # Evaluate performance
    predict = theano.function(
        inputs=[index],
        outputs=self.layer3.prediction(),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size]
        })

    test_errors = [test_model(i) for i in range(n_test_batches)]
    print "test errors: {:.1%}".format(np.mean(test_errors))
    pred = [predict(i) for i in range(n_test_batches)]
    print pred[0].shape
def run(self, num_kernels=[125, 125], kernel_sizes=[(11, 11), (5, 5)],
        batch_size=50, epochs=100000, optimizer='RMSprop'):
    optimizerData = {}
    optimizerData['learning_rate'] = 0.0005 / 2
    optimizerData['rho'] = 0.9
    optimizerData['epsilon'] = 1e-2
    optimizerData['momentum'] = 0.9

    print '... Loading data'

    # load in and process data
    if data_size == 'large':
        preProcess = PreProcess()
        data = preProcess.run()
    elif data_size == 'medium':
        preProcess = Medium()
        data = preProcess.run()
    elif data_size == 'small':
        preProcess = Small()
        data = preProcess.run()
    else:
        print 'data_size must be small, medium or large.'
        exit()

    train_set_x, train_set_y = data[0], data[3]
    valid_set_x, valid_set_y = data[1], data[4]
    test_set_x, test_set_y = data[2], data[5]

    train_set_x = theano.tensor._shared(train_set_x, borrow=True)
    valid_set_x = theano.tensor._shared(valid_set_x, borrow=True)
    train_set_y = theano.tensor._shared(train_set_y, borrow=True)
    valid_set_y = theano.tensor._shared(valid_set_y, borrow=True)
    test_set_x = theano.tensor._shared(test_set_x, borrow=True)
    test_set_y = theano.tensor._shared(test_set_y, borrow=True)

    print '... Initializing network'

    # training parameters
    self.n_sports = 500

    # print error if batch size is too large
    if valid_set_y.get_value(borrow=True).size < batch_size:
        print 'Error: Batch size is larger than size of validation set.'

    # compute batch sizes for train/test/validation
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # symbolic variables
    x = T.matrix('x')   # input image data
    y = T.ivector('y')  # input label data

    self.model(batch_size, num_kernels, kernel_sizes, x, y)

    # Initialize parameters and functions
    cost = self.layer3.negative_log_likelihood(y)  # Cost function
    params = self.params                           # List of parameters
    grads = T.grad(cost, params)                   # Gradient
    index = T.lscalar()                            # Index

    # Initialize optimizer
    updates = self.init_optimizer(optimizer, cost, params, optimizerData)

    # Train function
    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Validation function
    validate_model = theano.function(
        [index],
        self.layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Test function
    test_model = theano.function(
        [index],
        self.layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Prediction function
    predict = theano.function(
        inputs=[index],
        outputs=self.layer3.prediction(),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size]
        })

    def solve():
        costs = []
        for i in xrange(n_train_batches):
            costs.append(train_model(i))
        return costs

    def shuffle(train_set_x, train_set_y):
        # print train_set_x.get_value(borrow=True).shape[0]
        rand = np.random.permutation(
            range(train_set_x.get_value(borrow=True).shape[0]))
        train_set_x.set_value(train_set_x.get_value(borrow=True)[rand], borrow=True)
        train_set_y.set_value(train_set_y.get_value(borrow=True)[rand], borrow=True)
        return train_set_x, train_set_y, train_model

    np.savetxt('y_vec_LARGE.txt', test_set_y.get_value(borrow=True), delimiter=',')
    length = test_set_y.get_value(borrow=True).shape[0]
    print length
    print 'saved'

    try:
        print '... Solving'
        start_time = time.time()
        for epoch in range(epochs):
            t1 = time.time()
            train_set_x, train_set_y, train_model = shuffle(train_set_x, train_set_y)
            costs = solve()
            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
            t2 = time.time()
            print "Epoch {} NLL {:.2} %err in validation set {:.1%} Time (epoch/total) {:.2}/{:.2} mins".format(
                epoch + 1, np.mean(costs), np.mean(validation_losses),
                (t2 - t1) / 60., (t2 - start_time) / 60.)
            # f = open('workfile', 'r+')
            # f.write("Epoch {} NLL {:.2} %err in validation set {:.1%} Time (epoch/total) {:.2}/{:.2} mins".format(epoch + 1, np.mean(costs), np.mean(validation_losses), (t2 - t1) / 60., (t2 - start_time) / 60.))
            # f.close()
            with open("workfile_BIG31", "a") as myfile:
                myfile.write(
                    "Epoch {} NLL {:.2} %err in validation set {:.1%} Time (epoch/total) {:.2}/{:.2} mins \n"
                    .format(epoch + 1, np.mean(costs), np.mean(validation_losses),
                            (t2 - t1) / 60., (t2 - start_time) / 60.))
            if epoch % 1 == 0:
                predictions = np.array([predict(i) for i in range(n_test_batches)])
                print predictions[0].shape
                print predictions.shape
                predictions = predictions.reshape((length - length % batch_size), 500)
                with file('workfile_BIG33', 'w') as outfile:
                    # Write a header just for the sake of readability;
                    # any line starting with "#" will be ignored by numpy.loadtxt
                    outfile.write('# Array shape: {0}\n'.format(len(predictions)))
                    # Iterating through an n-dimensional array produces slices along
                    # the last axis, equivalent to data[i, :, :] in this case
                    for data_slice in predictions:
                        # The format string writes the values in left-justified
                        # columns 7 characters wide with 2 decimal places
                        np.savetxt(outfile, data_slice, fmt='%-7.2f')
                        # Writing out a break to indicate different slices...
                        # outfile.write('\n')
                test_errors = [test_model(i) for i in range(n_test_batches)]
                print "test errors: {:.1%}".format(np.mean(test_errors))
                with open("workfile_BIG32", "a") as myfile:
                    myfile.write("test errors: {:.1%}\n".format(np.mean(test_errors)))
    except KeyboardInterrupt:
        print '... Exiting solver'

    # Evaluate performance
    test_errors = [test_model(i) for i in range(n_test_batches)]
    print "test errors: {:.1%}".format(np.mean(test_errors))
    pred = [predict(i) for i in range(n_test_batches)]
    print pred[0].shape
import tensorflow as tf
from pre_process import PreProcess
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
import time
import os
from model.seq2seq_attention import encoder_model, decoder_model
from tensorflow.keras.layers import Input

encoder_weights_path = 'models/encoder.h5'
decoder_weights_path = 'models/decoder.h5'

process = PreProcess('./data/qingyun.tsv', samples_num=3000)
samples_num = process.length

# define params
batch_size = 64
embedding_dim = 50
units = 256
steps_per_epoch = samples_num // batch_size

encoder_input = Input((process.q_lenght, ))
encoder = encoder_model(encoder_input, process.q_vocab_size, embedding_dim, units)
decoder_input, hidden_input, encoder_output_input = Input((1, )), Input(
    (units, )), Input((process.q_lenght, units))
decoder = decoder_model(decoder_input, hidden_input, encoder_output_input,
                        process.a_vocab_size, embedding_dim, units)

if os.path.exists(encoder_weights_path):
    # resume from previously saved weights (same load call as the inference script)
    encoder.load_weights(encoder_weights_path, by_name=True)
from pathlib import Path
from multiprocessing import cpu_count

from yaml import load, FullLoader

from pre_process import PreProcess
from model_manager import ModelManager

if __name__ == '__main__':
    # Load parameters
    with open(".\\data\\config.yaml") as config_file:
        config = load(config_file, Loader=FullLoader)

    # Define locals
    datasets: dict = dict()
    batch_data: dict = dict()
    num_features: int = (len(config["COLUMNS"]["CATEGORICAL"]["NUMERIC"])
                         + len(config["COLUMNS"]["CATEGORICAL"]["STRING"])
                         + len(config["COLUMNS"]["CONTINUOUS"])
                         + 1)  # 1 for the "Id" column
    pre_proc: PreProcess = PreProcess(columns=config["COLUMNS"],
                                      num_workers=cpu_count())

    # Path checks
    if Path(config["PATH"]["PROCESSED"]["TRAIN"]).is_file() and \
            Path(config["PATH"]["PROCESSED"]["TEST"]).is_file():
        # Load processed data
        print("Pre-Processed data exists!\nLoading data...")
        datasets["train"] = pre_proc.load_data(
            path=config["PATH"]["PROCESSED"]["TRAIN"])
        datasets["test"] = pre_proc.load_data(
            path=config["PATH"]["PROCESSED"]["TEST"])
        print("Data loaded!")
    else:
        # Load raw data
def __getitem__(self, index):
    row = self.df.iloc[index]
    text, label = row['pre_process'], row[0]
    if label != 0:
        label = 1
    out_dict = self.tokenizer.encode_plus(text=text,
                                          padding='max_length',
                                          max_length=200,
                                          return_tensors='pt')
    # print(out_dict)
    if self.mode != 'test':
        return [(out_dict['input_ids'][:, :self.max_length],
                 out_dict['attention_mask'][:, :self.max_length]), label]
    else:
        return [text, (out_dict['input_ids'][:, :self.max_length],
                       out_dict['attention_mask'][:, :self.max_length]), label]

def __len__(self):
    # return int(self.df.shape[0])
    return 2000


if __name__ == '__main__':
    from pre_process import PreProcess

    dat_obj = PreProcess()
    dat_obj.prepare_dataset()
    train_df = dat_obj.train_df
    dataset = SentimentDataset(train_df, 200)
    train_loader = DataLoader(dataset, batch_size=5, num_workers=8)
    for i, j in enumerate(train_loader, 0):
        print(i)
        print(j[0][0])
class Inference:
    def __init__(self, model_name, dataset):
        self.model_name = TRAINED_MODELS + model_name + "/"
        self.dataset = dataset
        self.data = Dataset(self.dataset)
        self.data.tfidf_compressor.train()
        self.model = self._load_model()
        self.pre_process = PreProcess()
        idx = list(self.data.train_data.keys())
        idx.sort()
        self.train_c_word_set, self.train_c = self.data.get_all_c_word_set(
            self.data.train_data)
        self.all_train_contexts = np.array(
            [self.data.train_data[i]['context'] for i in idx])
        self.related_questions = np.array(
            [self.data.train_data[i]['qs'] for i in idx])

    def _load_model(self):
        # load model
        num_chars = self.data.get_num_chars()
        embeddings = get_trimmed_embeddings(DATA + "embedding_data.npz")
        model = NtuModel(model_name=self.model_name,
                         embeddings=embeddings,
                         num_chars=num_chars,
                         batch_size=32,
                         early_stopping=False,
                         k_neg=0)
        model.build()
        saver = tf.train.Saver()
        saver.restore(model.sess, tf.train.latest_checkpoint(self.model_name))
        return model

    def get_answer(self, question):
        question_example = self.pre_process.process(question,
                                                    remove_stop_words=False)
        q_word_set = set(question_example)
        question_example = self.data.process_sent(" ".join(question_example))

        # keep only training contexts that share at least one word with the question
        filtered_idx = []
        for i in range(len(self.train_c_word_set)):
            if len(q_word_set.intersection(self.train_c_word_set[i])) > 0:
                filtered_idx.append(i)

        context_examples = [
            self.data.process_sent(self.data.tfidf_compressor.compress(c))
            for c in self.train_c[filtered_idx]
        ]
        scores = self.model.get_scores(question_example, context_examples)
        c_max = scores.argsort()[::-1][:10]
        if len(c_max) == 0:
            return "There is no answer for that.", ["None"]

        top_related_questions = self.related_questions[filtered_idx][c_max]
        top_original_context = self.all_train_contexts[filtered_idx][c_max]

        # process top related questions
        related_question_examples = [
            self.data.process_sent(i[0]) for i in top_related_questions
        ]
        q_closet = self._arg_closest_related_questions(
            question_example, related_question_examples)

        return top_original_context[q_closet], top_related_questions[q_closet]

    def _arg_closest_related_questions(self, question, related_questions):
        all_question = [question] + related_questions
        q_char_ids, q_word_ids = zip(*[zip(*zip(*x)) for x in all_question])

        padded_q_word_ids, q_sequence_lengths = pad_sequences(q_word_ids,
                                                              pad_tok=0)
        padded_q_char_ids, q_word_lengths = pad_sequences(q_char_ids,
                                                          pad_tok=0,
                                                          nlevels=2)
        feed_dict = {
            self.model.q_word_ids: padded_q_word_ids,
            self.model.q_char_ids: padded_q_char_ids,
            self.model.q_sequence_lengths: q_sequence_lengths,
            self.model.q_word_lengths: q_word_lengths,
            self.model.keep_op: 1.0,
            self.model.is_training: False
        }
        question_embeddings = self.model.sess.run(self.model.q_dense,
                                                  feed_dict=feed_dict)
        q = question_embeddings[0]   # (1, 300)
        rq = question_embeddings[1:]
        scores = np.sum(np.square(rq - q), axis=-1)
        q_min = scores.argsort()[0]
        return q_min
def run(self, num_kernels=[15, 15], kernel_sizes=[(11, 11), (5, 5)],
        batch_size=50, epochs=20, optimizer='RMSprop'):
    optimizerData = {}
    optimizerData['learning_rate'] = 0.001
    optimizerData['rho'] = 0.9
    optimizerData['epsilon'] = 1e-4
    optimizerData['momentum'] = 0.9

    print '... Loading data'

    # load in and process data
    preProcess = PreProcess()
    data = preProcess.run()
    train_set_x, train_set_y = data[0], data[3]
    valid_set_x, valid_set_y = data[1], data[4]
    test_set_x, test_set_y = data[2], data[5]
    print train_set_x.eval().shape

    print '... Initializing network'

    # training parameters
    self.n_sports = np.max(train_set_y.eval()) + 1

    # print error if batch size is too large
    if valid_set_y.eval().size < batch_size:
        print 'Error: Batch size is larger than size of validation set.'

    # compute batch sizes for train/test/validation
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_test_batches /= batch_size
    n_valid_batches /= batch_size

    # symbolic variables
    x = T.matrix('x')   # input image data
    y = T.ivector('y')  # input label data

    self.model(batch_size, num_kernels, kernel_sizes, x, y)

    # Initialize parameters and functions
    cost = self.layer3.negative_log_likelihood(y)  # Cost function
    params = self.params                           # List of parameters
    grads = T.grad(cost, params)                   # Gradient
    index = T.lscalar()                            # Index

    # Initialize optimizer
    updates = self.init_optimizer(optimizer, cost, params, optimizerData)

    # Training model
    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Validation function
    validate_model = theano.function(
        [index],
        self.layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Test function
    test_model = theano.function(
        [index],
        self.layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    def solve():
        costs = []
        for i in xrange(n_train_batches):
            costs.append(train_model(i))
            # if i % 1000 == 0:
            #     print i
        return costs

    # Solver
    try:
        print '... Solving'
        start_time = time.time()
        for epoch in range(epochs):
            t1 = time.time()
            costs = solve()
            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
            t2 = time.time()
            print "Epoch {} NLL {:.2} %err in validation set {:.1%} Time (epoch/total) {:.2}/{:.2} mins".format(
                epoch + 1, np.mean(costs), np.mean(validation_losses),
                (t2 - t1) / 60., (t2 - start_time) / 60.)
    except KeyboardInterrupt:
        print '... Exiting solver'

    # Evaluate performance
    test_errors = [test_model(i) for i in range(n_test_batches)]
    print "test errors: {:.1%}".format(np.mean(test_errors))
from model.seq2seq_attention import encoder_model, decoder_model, inference
from pre_process import PreProcess
from tensorflow.keras.layers import Input
import argparse
import os

process = PreProcess('./data/qingyun.tsv')

# define params
embedding_dim = 50
units = 256


def _main(args):
    sentence = args.sentence
    encoder_weights_path = args.encoder_weights_path
    decoder_weights_path = args.decoder_weights_path
    if not os.path.exists(encoder_weights_path) or not os.path.exists(decoder_weights_path):
        raise ValueError('weights path should exist')

    # get model
    encoder_input = Input((process.q_lenght,))
    encoder = encoder_model(encoder_input, process.q_vocab_size, embedding_dim, units)
    decoder_input, hidden_input, encoder_output_input = Input((1,)), Input((units,)), Input((process.q_lenght, units))
    decoder = decoder_model(decoder_input, hidden_input, encoder_output_input,
                            process.a_vocab_size, embedding_dim, units)

    # load model weights
    encoder.load_weights(encoder_weights_path, by_name=True)
    decoder.load_weights(decoder_weights_path, by_name=True)

    result, sentence = inference(process, encoder, decoder, sentence)
    print(sentence + '-->' + result.replace(' ', ''))
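# Hedged sketch of the missing entry point: the script above defines _main(args) but the
# argument parser is not shown. The wiring below is an assumption inferred from the
# attributes _main reads (sentence, encoder_weights_path, decoder_weights_path); the
# default paths reuse the ones from the training script and are hypothetical.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Chat with the trained seq2seq model')
    parser.add_argument('--sentence', required=True, help='input sentence to answer')
    parser.add_argument('--encoder_weights_path', default='models/encoder.h5')
    parser.add_argument('--decoder_weights_path', default='models/decoder.h5')
    _main(parser.parse_args())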