def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action must be one of: train, semi, test')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initialize model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            #print('Warning : testing without loading any model')
            print('args.action is %s' % (args.action))
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        #earlystopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        """
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='max')
        """
        # the model takes three parallel inputs: tweets, snippets, and targets
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]
        print("tweets's shape = ", tweets.shape)
        print("snippets's shape = ", snippets.shape)
        print("targets's shape = ", targets.shape)
        print("Y's shape = ", Y.shape)

        #model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output, auxiliary_output])
        history = model.fit([tweets, snippets, targets], Y,
                            validation_data=([X_val[0, :], X_val[1, :], X_val[2, :]], Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size)
                            #callbacks=[checkpoint, earlystopping])

        predictions = model.predict([tweets, snippets, targets])
        #print(predictions.shape)
        #print(predictions)
        model.save(save_path)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]
        #print("tweets.shape =", tweets.shape)
        #print("snippets.shape =", snippets.shape)
        #print("targets.shape =", targets.shape)

        predictions = model.predict([tweets, snippets, targets])
        predictions = predictions.reshape(-1)
        #print(predictions)
        #print(Y.shape)
        #scores = np.sum((predictions - Y)**2)/len(Y)
        scores = model.evaluate([tweets, snippets, targets], Y)
        print("test data mse by keras = %f" % scores[1])
        print("test data mse by sklearn = %f" % mean_squared_error(Y, predictions))

        # map regression outputs and labels to {-1, 0, 1} before F1 scoring
        for idx, value in enumerate(predictions):
            if value > 0:
                predictions[idx] = 1
            elif value == 0:
                predictions[idx] = 0
            elif value < 0:
                predictions[idx] = -1
        for idx, value in enumerate(Y):
            if value > 0:
                Y[idx] = 1
            elif value == 0:
                Y[idx] = 0
            elif value < 0:
                Y[idx] = -1
        print("test data micro f1 score by sklearn = %f" % f1_score(Y, predictions, average='micro'))
        print("test data macro f1 score by sklearn = %f" % f1_score(Y, predictions, average='macro'))
        #print("test data scores[1](loss = mse) = %f" % scores[1])
        #raise Exception('Implement your testing function')

        # evaluate on the training split as well
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]
        predictions = model.predict([tweets, snippets, targets])
        predictions = predictions.reshape(-1)
        #scores = np.sum((predictions - Y)**2)/len(Y)
        scores = model.evaluate([tweets, snippets, targets], Y)
        print("train data mse by keras = %f" % scores[1])
        print("train data mse by sklearn = %f" % mean_squared_error(Y, predictions))
        for idx, value in enumerate(predictions):
            if value > 0:
                predictions[idx] = 1
            elif value == 0:
                predictions[idx] = 0
            elif value < 0:
                predictions[idx] = -1
        for idx, value in enumerate(Y):
            if value > 0:
                Y[idx] = 1
            elif value == 0:
                Y[idx] = 0
            elif value < 0:
                Y[idx] = -1
        print("train data micro f1 score by sklearn = %f" % f1_score(Y, predictions, average='micro'))
        print("train data macro f1 score by sklearn = %f" % f1_score(Y, predictions, average='macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        # val_loss should be minimized, so monitor it with mode='min'
        earlystopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initialize model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3, verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # create a LossHistory instance to record the training curves
        history = LossHistory()
        hist = model.fit(X, Y,
                         validation_data=(X_val, Y_val),
                         epochs=args.nb_epoch,
                         batch_size=args.batch_size,
                         callbacks=[checkpoint, earlystopping, history])
        # plot the acc/loss curves
        history.loss_plot('epoch')

    # testing
    elif args.action == 'test':
        id = dm.data['test_data'][1]
        out = model.predict(dm.data['test_data'][0])
        out = np.squeeze(out)
        out[out <= 0.5] = 0
        out[out > 0.5] = 1
        out = out.astype(int)
        print("pred shape:", np.array(out).shape)
        print("id shape:", np.array(id).shape)
        result = pd.concat(
            [pd.DataFrame({'id': id}), pd.DataFrame({'sentiment': out})],
            axis=1)
        wd = pd.DataFrame(result)
        wd.to_csv("submission.csv", index=False)
        newZip = zipfile.ZipFile('submission.zip', 'w')
        newZip.write('submission.csv', compress_type=zipfile.ZIP_DEFLATED)
        newZip.close()

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3, verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            history = LossHistory()
            # train
            hist = model.fit(semi_X, semi_Y,
                             validation_data=(X_val, Y_val),
                             epochs=2,
                             batch_size=args.batch_size,
                             callbacks=[checkpoint, earlystopping, history])
            history.loss_plot('epoch')
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
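# Note: LossHistory is referenced above but not defined in this section. The class
# below is a hypothetical, minimal sketch consistent with how it is used (a Keras
# callback that records per-epoch metrics and exposes loss_plot('epoch')); the
# project's actual implementation may differ.
import matplotlib.pyplot as plt
from keras.callbacks import Callback


class LossHistory(Callback):
    """Minimal sketch: record loss/accuracy during training and plot them."""

    def on_train_begin(self, logs=None):
        self.losses = {'batch': [], 'epoch': []}
        self.accuracy = {'batch': [], 'epoch': []}
        self.val_loss = {'epoch': []}
        self.val_acc = {'epoch': []}

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        self.losses['batch'].append(logs.get('loss'))
        self.accuracy['batch'].append(logs.get('acc'))

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.losses['epoch'].append(logs.get('loss'))
        self.accuracy['epoch'].append(logs.get('acc'))
        self.val_loss['epoch'].append(logs.get('val_loss'))
        self.val_acc['epoch'].append(logs.get('val_acc'))

    def loss_plot(self, loss_type):
        # loss_type is 'batch' or 'epoch'
        iters = range(len(self.losses[loss_type]))
        plt.figure()
        plt.plot(iters, self.losses[loss_type], label='train loss')
        plt.plot(iters, self.accuracy[loss_type], label='train acc')
        if loss_type == 'epoch':
            plt.plot(iters, self.val_loss['epoch'], label='val loss')
            plt.plot(iters, self.val_acc['epoch'], label='val acc')
        plt.xlabel(loss_type)
        plt.legend(loc='best')
        plt.show()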
def main():
    parser = argparse.ArgumentParser(description='Text OHCA recognition')
    parser.add_argument('model')
    parser.add_argument('action', choices=['train', 'test'])

    # training arguments
    parser.add_argument('--batch_size', default=256, type=int)  # batch size must be an integer
    parser.add_argument('--nb_epoch', default=2000, type=int)
    parser.add_argument('--val_ratio', default=0.1, type=float)
    parser.add_argument('--gpu_fraction', default=0.6, type=float)
    parser.add_argument('--vocab_size', default=50000, type=int)
    parser.add_argument('--max_length', default=400, type=int)
    parser.add_argument('--patience', default=30, type=int)

    # model parameters
    parser.add_argument('--loss_function', default='binary_crossentropy')
    parser.add_argument('--cell', default='LSTM', choices=['LSTM', 'GRU'])
    parser.add_argument('-num_lay', '--num_layers', default=2, type=int)
    parser.add_argument('-emb_dim', '--embedding_dim', default=256, type=int)
    parser.add_argument('-hid_siz', '--hidden_size', default=400, type=int)
    parser.add_argument('--pretrain_emb', default=True, type=bool)
    parser.add_argument('--emb_matrix', default='cbowemb.npz')
    # parser.add_argument('--dropout_rate', default=0.3, type=float)
    parser.add_argument('--keep_prob', default=1.0, type=float)
    parser.add_argument('-lr', '--learning_rate', default=0.013, type=float)
    parser.add_argument('--threshold', default=0.5, type=float)

    # output path for your prediction
    parser.add_argument('--result_path', default='result.csv')

    # put model in the same directory
    parser.add_argument('--load_model', default=None)
    parser.add_argument('--load_token', default=True, type=bool)
    parser.add_argument('--save_dir', default='model/')

    # log dir for tensorboard
    parser.add_argument('--log_dir', default='log_dir/')

    # testing output
    parser.add_argument('--testfile', default='data/ohca_scripts.txt')
    parser.add_argument('--testout', default='data/script_test.txt')
    args = parser.parse_args()

    train_path = 'data/ohca_scripts.txt'
    test_path = args.testfile
    save_path = 'token/'

    # load token path
    if args.load_token is not None:
        load_path = os.path.join(save_path)

    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    sess = get_session(args.gpu_fraction)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, with_label=True)
    else:
        # the test data also carries labels here
        dm.add_data('test_data', test_path, with_label=True)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_token is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # create the graph object
    tf.reset_default_graph()

    # initialize model
    print('initial model...')
    rnnmodel = simpleRNN(args)
    #print(model.summary())

    with tf.name_scope('inputs'):
        # create placeholders for training (testing) data
        X_ = tf.placeholder(tf.int32, [None, args.max_length], name='X')
        y_ = tf.placeholder(tf.int32, [args.batch_size], name='y_')
        keep_prob = tf.placeholder_with_default(1.0, shape=(), name="keep_prob")

    y_predict = rnnmodel.model(args, X_, keep_prob)

    # prepare the variables needed to evaluate a saved model
    train_var = [X_, y_, keep_prob, y_predict]
    tf.add_to_collection('train_var', train_var[0])
    tf.add_to_collection('train_var', train_var[1])
    tf.add_to_collection('train_var', train_var[2])
    tf.add_to_collection('train_var', train_var[3])

    # loss (MSE)
    mse = rnnmodel.loss(y_, y_predict)
    # optimizer
    train_op = rnnmodel.optimizer(args, mse)
    # accuracy for validation
    accuracy = rnnmodel.accuracy(y_, y_predict)
    # initial state of the LSTM
    init_state = rnnmodel.initial_state
    # merge the written-out histogram plots (tensorboard)
    merged = tf.summary.merge_all()
    # check outputs of the LSTM
    routputs = rnnmodel.outputs

    if args.load_model is not None:
        load_path = os.path.join(args.save_dir)
        # the checkpoint path is also needed later for saver.restore()
        path = os.path.join(load_path, 'Sentimen_rnn_final')
        if args.action == 'train':
            print('Warning : loading existing model variables and continuing training')
        if os.path.exists(path + ".meta"):
            print('load model from %s' % path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')
        raise Exception('Not loading model for testing...')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        print("Shape of X is {}, and y is {}".format(
            np.array(X).shape, np.array(Y).shape))
    elif args.action == 'test':
        (X, Y) = dm.get_labeldata('test_data')
        print("Load test data (shape {})".format(X.shape))
        #raise Exception('Implement your testing function')

    init = tf.global_variables_initializer()

    # prepare to save the model
    save_vars = tf.trainable_variables()
    saver = tf.train.Saver(save_vars, max_to_keep=7,
                           keep_checkpoint_every_n_hours=1)

    last_loss = 1000000.0
    with tf.Session() as sess:
        init.run()

        # if pre-trained, load the embedding matrix
        if args.pretrain_emb:
            emb_npfn = save_path + args.emb_matrix
            emb_matrix = np.load(emb_npfn)['embed_m']
            if (emb_matrix.shape[0] != args.vocab_size
                    or emb_matrix.shape[1] != args.embedding_dim):
                print("Imported embedding matrix shape {} does not match shape of ({},{})..."
                      .format(emb_matrix.shape, args.vocab_size, args.embedding_dim))
                sys.exit(1)
            else:
                print("Loading embedding matrix.....")
                sess.run(rnnmodel.embedding_mat.assign(emb_matrix))

        train_writer = tf.summary.FileWriter(args.log_dir + 'train', sess.graph)
        valid_writer = tf.summary.FileWriter(args.log_dir + 'valid', sess.graph)

        # load variables of the graph if a model is assigned
        if args.load_model is not None:
            saver.restore(sess, path)

        if args.action == 'train':
            # training
            early_stop_counter = 0
            generation_num = 0
            # repeat nb_epoch times
            for e in range(args.nb_epoch):
                # reset the initial LSTM state every epoch
                state = sess.run([init_state])
                semi_preds = []
                if e == 0:
                    # hard copy
                    X_train = X.copy()
                    Y_train = Y.copy()
                n_batches = len(X) // args.batch_size
                for ix, (X_batch, y_batch) in enumerate(
                        get_batches(X_train, Y_train, args.batch_size), 1):
                    generation_num += 1
                    # for each training generation, reload zero initial states
                    train_dict = {
                        X_: X_batch,
                        y_: y_batch,
                        keep_prob: args.keep_prob,
                        init_state: state
                    }
                    _, summary, mse_train, accu_train = sess.run(
                        [train_op, merged, mse, accuracy], feed_dict=train_dict)
                    train_writer.add_summary(summary, generation_num)
                    outputs_ = routputs.eval(feed_dict=train_dict)
                    if ix == 1:
                        print(X_batch.shape)
                        #print("shape of outputs is {}".format(outputs_[:,-1].shape))
                    if generation_num % 10 == 0:
                        print("Epoch: {}/{}".format(e, args.nb_epoch),
                              "Iteration: {}".format(generation_num),
                              "Train loss: {:.3f}".format(mse_train))

                    # validate every 50 generations and at the end of each epoch
                    if generation_num % 50 == 0 or ix == n_batches:
                        val_acc = []
                        val_loss = []
                        val_state = sess.run([init_state])
                        for iv, (X_batch, y_batch) in enumerate(
                                get_batches(X_val, Y_val, args.batch_size), 1):
                            val_dict = {
                                X_: X_batch,
                                y_: y_batch,
                                keep_prob: 1,
                                init_state: val_state
                            }
                            summary, batch_acc, batch_loss = sess.run(
                                [merged, accuracy, mse], feed_dict=val_dict)
                            # print out some answers for checking
                            val_predict = sess.run(y_predict, feed_dict=val_dict)
                            #print("shape of val_predict is {}".format(np.array(val_predict).shape))
                            # last ten elements of each batch
                            for y_true, y_pre in zip(y_batch[-9:], val_predict[-9:]):
                                print("y_true: {}, y_predict: {}".format(y_true, y_pre))
                            val_loss.append(batch_loss)
                            val_acc.append(batch_acc)
                            sys.stdout.flush()
                        print("Iteration: {}".format(generation_num),
                              "Val acc: {:.3f}".format(np.mean(val_acc)),
                              "Val mse: {:.3f}".format(np.mean(val_loss)))
                        valid_writer.add_summary(summary, generation_num)
                        loss_val_avg = np.mean(val_loss)
                        # save variables every 50 generations
                        saver.save(sess,
                                   os.path.join(args.save_dir, "Sentimen_rnn"),
                                   global_step=generation_num)
                        if ix == n_batches:
                            # early-stopping bookkeeping at the end of each epoch
                            if last_loss > loss_val_avg:
                                last_loss = loss_val_avg
                                early_stop_counter = 0
                            else:
                                early_stop_counter += 1
                if early_stop_counter >= args.patience or e == (args.nb_epoch - 1):
                    # save the final model
                    saver.save(sess, os.path.join(args.save_dir, "Sentimen_rnn_final"))
                    saver.export_meta_graph(
                        os.path.join(args.save_dir, "Sentimen_rnn_final.meta"),
                        collection_list=['train_var'])
                    break
            print("End of training.....")

        # testing
        elif args.action == 'test':
            # hard copy
            X_test = X.copy()
            Y_test = Y.copy()
            state = sess.run([init_state])
            with open(args.testout, 'w+') as outfile:
                for ix, (X_batch, y_batch) in enumerate(
                        get_batches(X_test, Y_test, args.batch_size), 1):
                    test_dict = {
                        X_: X_batch,
                        y_: y_batch,
                        keep_prob: args.keep_prob,
                        init_state: state
                    }
                    # evaluation only: do not run train_op on the test data
                    y_prebatch, accu_test = sess.run(
                        [y_predict, accuracy], feed_dict=test_dict)
                    for y_true, y_pre in zip(y_batch, y_prebatch):
                        strout = "%d\t%f\n" % (y_true, y_pre)
                        outfile.write(strout)
            print("Testing finish, write out file {}".format(args.testout))
            #raise Exception('Implement your testing function')
    return
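# Note: get_batches is called above but not defined in this section. Below is a
# minimal sketch of a generator matching the call sites: it yields consecutive
# (X_batch, y_batch) pairs of exactly batch_size rows and drops the final partial
# batch, since the placeholders use a fixed batch_size. The project's actual
# helper may differ.
def get_batches(x, y, batch_size):
    """Yield consecutive (x_batch, y_batch) pairs of exactly batch_size rows."""
    n_batches = len(x) // batch_size
    # truncate to a whole number of batches
    x, y = x[:n_batches * batch_size], y[:n_batches * batch_size]
    for start in range(0, len(x), batch_size):
        yield x[start:start + batch_size], y[start:start + batch_size]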
def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action must be one of: train, semi, test')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # prepare GloVe embedding
    embedding_matrix = preEB(dm)

    # initialize model
    print('initial model...')
    model = simpleRNN(args, embedding_matrix, dm.tokenizer.word_index)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            #print('Warning : testing without loading any model')
            print('args.action is %s' % (args.action))
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        #earlystopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        """
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='max')
        """
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size)
                            #callbacks=[checkpoint, earlystopping])
        model.save(save_path)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        pred = model.predict(X)
        scores = model.evaluate(X, Y)
        print("test data scores(loss = mse) = %f" % scores[1])
        print("mse: ", evaluation(pred, Y, 'mse'))
        print("micro: ", evaluation(pred, Y, 'f1_micro'))
        print("macro: ", evaluation(pred, Y, 'f1_macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        # val_loss should be minimized, so monitor it with mode='min'
        earlystopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
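# Note: evaluation(pred, Y, metric) is used above but not defined in this section.
# The sketch below is a hypothetical helper consistent with the metric names
# passed in ('mse', 'f1_micro', 'f1_macro'); mapping regression outputs to
# {-1, 0, 1} via the sign mirrors the thresholding used in the earlier main() and
# is an assumption about how the real helper works.
from sklearn.metrics import mean_squared_error, f1_score


def evaluation(pred, y, metric):
    pred = np.asarray(pred).reshape(-1)
    y = np.asarray(y).reshape(-1)
    if metric == 'mse':
        return mean_squared_error(y, pred)
    # map continuous scores and labels to class labels {-1, 0, 1} before F1 scoring
    pred_cls = np.sign(pred).astype(int)
    y_cls = np.sign(y).astype(int)
    if metric == 'f1_micro':
        return f1_score(y_cls, pred_cls, average='micro')
    if metric == 'f1_macro':
        return f1_score(y_cls, pred_cls, average='macro')
    raise ValueError('Unknown metric %s' % metric)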
def cbow_main():
    parser = argparse.ArgumentParser(description='CBOW word embedding')

    # training arguments
    parser.add_argument('--vocab_size', default=50000, type=int)
    parser.add_argument('-emb_dim', '--embedding_dim', default=256, type=int)
    parser.add_argument('--gpu_fraction', default=0.8, type=float)
    parser.add_argument('--skip_window', default=2, type=int)
    parser.add_argument('--num_skips', default=4, type=int)
    parser.add_argument('--batch_size', default=512, type=int)
    parser.add_argument('--learning_rate', default=0.01, type=float)
    parser.add_argument('--log_dir', default='log_embdir/')
    parser.add_argument('--nsteps', default=5000000, type=int)

    # put model in the same directory
    parser.add_argument('--load_model', default=None)
    parser.add_argument('--load_token', default=None, type=bool)
    parser.add_argument('--save_embed', default='cbowemb.npz')
    args = parser.parse_args()

    mlclass_path = 'data/all_sents.txt'
    script_path = 'data/simu_script.txt'
    pylady_path = 'data/corpusclean_news_pylady.txt'
    pttgossi_path = 'data/ptt_gossiping_201611_post_cleanf.csv'
    #semi_path = 'data/training_nolabel.txt'
    save_path = 'token/'

    # load token path
    if args.load_token is not None:
        load_path = os.path.join(save_path)

    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    sess = get_session(args.gpu_fraction)

    # read all data for the tokenizer (train, semi, test)
    dm = DataManager()
    print('Loading training data...')
    dm.add_data('ml_data', mlclass_path, False, False)
    dm.add_data('script_data', script_path, False, False)
    dm.add_data('pylady_data', pylady_path, False, False)
    dm.add_data('pttgossi_data', pttgossi_path, False, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_token is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # prepare the sequence-id-to-word dict
    reverse_word_dict = dict(map(reversed, dm.tokenizer.word_index.items()))

    # CBOW embedding [skip_window target skip_window]
    # context_size = args.skip_window*2

    # convert to sequences without pre-padding (list, not np.array)
    #dm.to_sequence(args.max_length)
    dm.to_sequence_nopad()

    # fill all sequence data into a list
    seq_data = []
    seq_data.extend(dm.get_data('ml_data')[0])
    seq_data.extend(dm.get_data('script_data')[0])
    seq_data.extend(dm.get_data('pylady_data')[0])
    seq_data.extend(dm.get_data('pttgossi_data')[0])
    #seq_data.extend(dm.get_data('test_data')[0])

    # Create the graph object
    tf.reset_default_graph()

    # Pick a validation set to sample nearest neighbors. Here we limit the
    # validation samples to words with a low numeric ID, which by construction
    # are also the most frequent. These variables are used only for displaying
    # model accuracy; they don't affect the calculation.
    valid_size = 16     # set of words to evaluate similarity on
    valid_window = 100  # only pick dev samples in the head of the distribution
    #valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    valid_text = [
        "喘", "呼吸", "白沫", "沒有", "意識", "倒下", "電話", "臉色",
        "起伏", "睡著", "昏倒", "溺水", "清醒", "不", "微弱", "很"
    ]
    #print(dm.tokenizer.texts_to_sequences(valid_text))
    valid_examples = np.array([
        words[0] for words in dm.tokenizer.texts_to_sequences(valid_text)
        if len(words) > 0
    ])
    #print(valid_examples)
    #valid_examples = np.array(random.sample(range(valid_window), valid_size))

    with tf.name_scope('inputs'):
        # create placeholders for training (testing) data
        X_ = tf.placeholder(tf.int32, [args.batch_size, args.num_skips], name='X_')
        y_ = tf.placeholder(tf.int32, [args.batch_size, 1], name='y_')
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # embedding here
    with tf.name_scope("embeddings"):
        embedding_mat = tf.get_variable('embedding_mat',
                                        [args.vocab_size, args.embedding_dim],
                                        tf.float32,
                                        tf.random_normal_initializer())
        # sum the embeddings of the num_skips context words
        embedding = tf.zeros([args.batch_size, args.embedding_dim])
        for j in range(args.num_skips):
            embedding += tf.nn.embedding_lookup(embedding_mat, X_[:, j])

    with tf.name_scope("softmax"):
        soft_weights = tf.get_variable('soft_weights',
                                       [args.vocab_size, args.embedding_dim],
                                       tf.float32,
                                       tf.random_normal_initializer())
        soft_biases = tf.get_variable('soft_biases', [args.vocab_size],
                                      tf.float32,
                                      tf.constant_initializer(0.0))

    num_sampled = 64

    # Compute the loss
    with tf.name_scope('loss'):
        # tf.nn.nce_loss
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=soft_weights,
                           biases=soft_biases,
                           labels=y_,
                           inputs=embedding,
                           num_sampled=num_sampled,
                           num_classes=args.vocab_size))
        # Add the loss value as a scalar to the summary.
        tf.summary.scalar('loss', loss)

    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdagradOptimizer(args.learning_rate).minimize(loss)

    # Compute the similarity between minibatch examples and all embeddings.
    # Normalize the embedding matrix by the square root of the sum of squared elements.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_mat), 1, keep_dims=True))
    normalized_embeddings = embedding_mat / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # variable initializer
    init = tf.global_variables_initializer()

    # tensorflow model saver
    saver = tf.train.Saver(tf.global_variables())

    writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    average_loss = 0.0
    data_index = 0
    seq_index = 0
    with tf.Session() as sess:
        # start training
        sess.run(init)
        for step in range(args.nsteps):
            batch_X, batch_y, data_index, seq_index = generate_batch_cbow(
                seq_data, data_index, seq_index, args.batch_size,
                args.num_skips, args.skip_window)
            feed_dict = {X_: batch_X, y_: batch_y}
            op, lo = sess.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += lo

            if step % 2000 == 0:
                if step > 0:
                    average_loss = average_loss / 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step %d: %f' % (step, average_loss))
                average_loss = 0

            # note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    try:
                        valid_word = reverse_word_dict[valid_examples[i]]
                    except KeyError:
                        print("Skip word...")
                        continue
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        try:
                            close_word = reverse_word_dict[nearest[k]]
                            log = '%s %s,' % (log, close_word)
                        except KeyError:
                            print("Skip nearest {}-th word".format(k))
                    # print once for each word
                    print(log)

        #final_embeddings = normalized_embeddings.eval()
        final_embeddings = embedding_mat.eval()

        # Save the model for checkpoints.
        saver.save(sess, os.path.join(args.log_dir, 'embmodel.ckpt'))
        writer.close()

    # save the embedding mapping matrix
    save_fn = save_path + args.save_embed
    np.savez(save_fn, embed_m=final_embeddings)
    return
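# Note: generate_batch_cbow is called in cbow_main() but not defined in this
# section. The function below is a hypothetical sketch matching the call site:
# it walks through seq_data (a list of word-id sequences), and for each target
# word returns its num_skips = 2*skip_window surrounding context words as one
# CBOW training row, carrying (data_index, seq_index) across calls. The real
# batching logic in the project may differ.
def generate_batch_cbow(seq_data, data_index, seq_index, batch_size,
                        num_skips, skip_window):
    batch_X = np.zeros((batch_size, num_skips), dtype=np.int32)
    batch_y = np.zeros((batch_size, 1), dtype=np.int32)
    filled = 0
    while filled < batch_size:
        seq = seq_data[seq_index]
        # need a full window [skip_window, target, skip_window] inside this sequence
        if len(seq) < 2 * skip_window + 1 or data_index + 2 * skip_window >= len(seq):
            # move on to the next sequence
            seq_index = (seq_index + 1) % len(seq_data)
            data_index = 0
            continue
        target_pos = data_index + skip_window
        context = (seq[data_index:target_pos]
                   + seq[target_pos + 1:data_index + 2 * skip_window + 1])
        batch_X[filled, :] = context[:num_skips]
        batch_y[filled, 0] = seq[target_pos]
        filled += 1
        data_index += 1
    return batch_X, batch_y, data_index, seq_index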