def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action must be one of: train, semi, test')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # load an existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create a tokenizer from the new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initialize model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('args.action is %s' % args.action)
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        save_path = os.path.join(save_path, 'model.h5')

        # the three model inputs are stored row-wise in X
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]
        print("tweets shape =", tweets.shape)
        print("snippets shape =", snippets.shape)
        print("targets shape =", targets.shape)
        print("Y shape =", Y.shape)

        # note: checkpoint / earlystopping callbacks are disabled in this version
        history = model.fit(
            [tweets, snippets, targets], Y,
            validation_data=([X_val[0, :], X_val[1, :], X_val[2, :]], Y_val),
            epochs=args.nb_epoch,
            batch_size=args.batch_size)

        predictions = model.predict([tweets, snippets, targets])
        model.save(save_path)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]

        predictions = model.predict([tweets, snippets, targets])
        predictions = predictions.reshape(-1)
        scores = model.evaluate([tweets, snippets, targets], Y)
        print("test data mse by keras = %f" % scores[1])
        print("test data mse by sklearn = %f" % mean_squared_error(Y, predictions))

        # map regression outputs and labels to {-1, 0, 1} (sign) for F1 scoring
        predictions = np.sign(predictions)
        Y = np.sign(Y)
        print("test data micro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='micro'))
        print("test data macro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='macro'))

        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]

        predictions = model.predict([tweets, snippets, targets])
        predictions = predictions.reshape(-1)
        scores = model.evaluate([tweets, snippets, targets], Y)
        print("train data mse by keras = %f" % scores[1])
        print("train data mse by sklearn = %f" % mean_squared_error(Y, predictions))

        predictions = np.sign(predictions)
        Y = np.sign(Y)
        print("train data micro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='micro'))
        print("train data macro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')

        earlystopping = EarlyStopping(monitor='val_loss', patience=3,
                                      verbose=1, mode='min')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-supervised data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
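# --- Hypothetical sketch (not part of the original source): one way the three-input
# simpleRNN(args) builder used above could be written with the Keras functional API,
# matching the [tweets, snippets, targets] inputs and the MSE metric read as scores[1].
# Layer choices, sizes, and the args fields (embedding_dim, hidden_size) are assumptions.
def simpleRNN_sketch(args):
    from keras.layers import Input, Embedding, GRU, Dense, concatenate
    from keras.models import Model

    # two padded word-index sequences plus one scalar side input
    tweet_in = Input(shape=(args.max_length,), name='tweet')
    snippet_in = Input(shape=(args.max_length,), name='snippet')
    target_in = Input(shape=(1,), name='target')

    # shared embedding and recurrent encoders for the two text inputs
    shared_emb = Embedding(args.vocab_size, args.embedding_dim)
    tweet_vec = GRU(args.hidden_size)(shared_emb(tweet_in))
    snippet_vec = GRU(args.hidden_size)(shared_emb(snippet_in))

    merged = concatenate([tweet_vec, snippet_vec, target_in])
    out = Dense(1, activation='linear')(merged)  # regression output (MSE is used above)

    model = Model(inputs=[tweet_in, snippet_in, target_in], outputs=out)
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    return model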
def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # load an existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create a tokenizer from the new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initialize model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')

        # create a LossHistory instance to record the training curves
        history = LossHistory()
        hist = model.fit(X, Y,
                         validation_data=(X_val, Y_val),
                         epochs=args.nb_epoch,
                         batch_size=args.batch_size,
                         callbacks=[checkpoint, earlystopping, history])
        # plot the accuracy / loss curves
        history.loss_plot('epoch')

    # testing
    elif args.action == 'test':
        id = dm.data['test_data'][1]
        out = model.predict(dm.data['test_data'][0])
        out = np.squeeze(out)
        # threshold the sigmoid outputs into binary labels
        out[out <= 0.5] = 0
        out[out > 0.5] = 1
        out = out.astype(int)
        print("pred shape:", np.array(out).shape)
        print("id shape:", np.array(id).shape)

        result = pd.concat(
            [pd.DataFrame({'id': id}), pd.DataFrame({'sentiment': out})],
            axis=1)
        wd = pd.DataFrame(result)
        wd.to_csv("submission.csv", index=None)

        newZip = zipfile.ZipFile('submission.zip', 'w')
        newZip.write('submission.csv', compress_type=zipfile.ZIP_DEFLATED)
        newZip.close()

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-supervised data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))

            history = LossHistory()
            # train
            hist = model.fit(semi_X, semi_Y,
                             validation_data=(X_val, Y_val),
                             epochs=2,
                             batch_size=args.batch_size,
                             callbacks=[checkpoint, earlystopping, history])
            history.loss_plot('epoch')

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
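# --- Hypothetical sketch (not part of the original source): a minimal LossHistory
# callback compatible with the history.loss_plot('epoch') calls above. The exact
# quantities the original class records and how it plots them are assumptions.
import matplotlib.pyplot as plt
from keras.callbacks import Callback

class LossHistorySketch(Callback):
    def on_train_begin(self, logs=None):
        # per-epoch records of loss/accuracy on the train and validation sets
        self.history = {'loss': [], 'val_loss': [], 'acc': [], 'val_acc': []}

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        for key in self.history:
            self.history[key].append(logs.get(key))

    def loss_plot(self, loss_type='epoch'):
        # loss_type is kept only for interface compatibility with the code above;
        # this sketch records per-epoch values only
        iters = range(len(self.history['loss']))
        for key, values in self.history.items():
            if any(v is not None for v in values):
                plt.plot(iters, values, label=key)
        plt.xlabel(loss_type)
        plt.legend(loc='best')
        plt.savefig('train_history.png')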
def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action must be one of: train, semi, test')

    # the tokenizer preparation used in the other versions is disabled here;
    # word2vec embeddings are trained (or loaded) instead
    if not os.path.isdir(save_path):
        os.makedirs(save_path)

    # convert to a token corpus
    token_corpus = dm.to_token_corpus(args.max_length)
    if args.action == "train":
        word2vec = to_word2vec(token_corpus)
        save_path_word2vec_model = os.path.join(save_path, 'word2vec.model')
        word2vec.save(save_path_word2vec_model)
    elif args.action == "test":
        path = os.path.join(load_path, 'word2vec.model')
        if os.path.exists(path):
            print('load model from %s' % path)
            word2vec = Word2Vec.load(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    word2vec = word2vec.wv

    # pad sentences and map words to their word2vec vectors
    dm.padding_sent(args.max_length)
    dm.sent_to_word2vec(word2vec)

    # initialize model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('args.action is %s' % args.action)
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        save_path_model_h5 = os.path.join(save_path, 'model.h5')
        # note: checkpoint / earlystopping callbacks are disabled in this version
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size)
        model.save(save_path_model_h5)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        predictions = model.predict(X)
        predictions = predictions.reshape(-1)
        scores = model.evaluate(X, Y)
        print("test data mse by keras = %f" % scores[1])
        print("test data mse by sklearn = %f" % mean_squared_error(Y, predictions))

        # map regression outputs and labels to {-1, 0, 1} (sign) for F1 scoring
        predictions = np.sign(predictions)
        Y = np.sign(Y)
        print("test data micro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='micro'))
        print("test data macro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='macro'))

        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        predictions = model.predict(X)
        predictions = predictions.reshape(-1)
        scores = model.evaluate(X, Y)
        print("train data mse by keras = %f" % scores[1])
        print("train data mse by sklearn = %f" % mean_squared_error(Y, predictions))

        predictions = np.sign(predictions)
        Y = np.sign(Y)
        print("train data micro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='micro'))
        print("train data macro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_loss', patience=3,
                                      verbose=1, mode='min')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-supervised data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
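# --- Hypothetical sketch (not part of the original source): one way the
# to_word2vec(token_corpus) helper used above could be implemented with gensim.
# The vector size, window, and min_count values are assumptions; gensim >= 4.0
# argument names are used (older releases take size= instead of vector_size=).
from gensim.models import Word2Vec

def to_word2vec_sketch(token_corpus, dim=256):
    # token_corpus is assumed to be a list of tokenized sentences,
    # each sentence a list of word strings
    return Word2Vec(sentences=token_corpus,
                    vector_size=dim,
                    window=5,
                    min_count=1,
                    workers=4)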
def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action must be one of: train, semi, test')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # load an existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create a tokenizer from the new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # prepare the GloVe embedding matrix
    embedding_matrix = preEB(dm)

    # initialize model
    print('initial model...')
    model = simpleRNN(args, embedding_matrix, dm.tokenizer.word_index)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('args.action is %s' % args.action)
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        save_path = os.path.join(save_path, 'model.h5')
        # note: checkpoint / earlystopping callbacks are disabled in this version
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size)
        model.save(save_path)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        pred = model.predict(X)
        scores = model.evaluate(X, Y)
        print("test data scores (loss = mse) = %f" % scores[1])
        print("mse: ", evaluation(pred, Y, 'mse'))
        print("micro: ", evaluation(pred, Y, 'f1_micro'))
        print("macro: ", evaluation(pred, Y, 'f1_macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_loss', patience=3,
                                      verbose=1, mode='min')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-supervised data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
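# --- Hypothetical sketch (not part of the original source): a typical way the
# preEB(dm) helper used above could build a GloVe embedding matrix from the
# tokenizer's word index. The GloVe file path and embedding dimension are assumptions.
import numpy as np

def preEB_sketch(dm, glove_path='glove.6B.100d.txt', emb_dim=100):
    # read pre-trained GloVe vectors into a word -> vector dict
    embeddings_index = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # row i of the matrix holds the vector for the word with tokenizer index i
    word_index = dm.tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index) + 1, emb_dim))
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector  # words missing from GloVe stay all-zero
    return embedding_matrix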
def main():
    parser = argparse.ArgumentParser(description='Text OHCA recognition')
    parser.add_argument('--batch_size', default=1, type=int)  # set to 1 for a single sentence
    parser.add_argument('--vocab_size', default=50000, type=int)

    # model parameters
    parser.add_argument('--loss_function', default='binary_crossentropy')
    parser.add_argument('--cell', default='LSTM', choices=['LSTM', 'GRU'])
    parser.add_argument('-num_lay', '--num_layers', default=2, type=int)
    parser.add_argument('-emb_dim', '--embedding_dim', default=256, type=int)
    parser.add_argument('-hid_siz', '--hidden_size', default=400, type=int)
    parser.add_argument('--pretrain_emb', default=True, type=bool)
    parser.add_argument('--emb_matrix', default='cbowemb.npz')
    parser.add_argument('--keep_prob', default=1.0, type=float)
    parser.add_argument('--max_length', default=400, type=int)
    parser.add_argument('--threshold', default=0.6, type=float)

    # output path for the prediction
    parser.add_argument('--result_path', default='evalresult.txt')
    # input testing file name
    parser.add_argument('--test_file', default="data/ohca_test1.txt")
    # output testing result
    parser.add_argument('--outfile', default="data/ohca_testout.txt")
    # put the model in the same directory
    parser.add_argument('--load_model', default=True)
    parser.add_argument('--load_token', default=True, type=bool)
    parser.add_argument('--save_dir', default='model/')
    # log dir for TensorBoard
    parser.add_argument('--log_dir', default='log_dir/')
    args = parser.parse_args()

    test_path = args.test_file
    save_path = 'token/'

    # token load path
    if args.load_token is not None:
        load_path = os.path.join(save_path)

    sess = tf.Session()

    ##### read data #####
    dm = DataManager()
    print('Loading test data...')
    dm.add_data('test_data', test_path, with_label=False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_token is not None:
        # load an existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        raise Exception("Word token is not loaded...")

    # convert to sequences
    dm.to_sequence(args.max_length)

    # create the graph object
    tf.reset_default_graph()

    # initialize model
    print('initial model...')
    rnnmodel = simpleRNN(args)

    with tf.name_scope('inputs'):
        # placeholders for the test data
        X_ = tf.placeholder(tf.int32, [None, args.max_length], name='X')
        keep_prob = tf.placeholder_with_default(1.0, shape=(), name="keep_prob")
        y_predict = rnnmodel.model(args, X_, keep_prob)
        # initial state of the LSTM
        init_state = rnnmodel.initial_state
        # LSTM outputs (kept for inspection)
        routputs = rnnmodel.outputs

    if args.load_model is not None:
        load_path = os.path.join(args.save_dir)
        path = os.path.join(load_path, 'Sentimen_rnn_final')
        if os.path.exists(path + ".meta"):
            # the weights are restored below through tf.train.Saver
            print('load model from %s' % path)
        else:
            raise ValueError("Can't find the file %s" % path)

    X = dm.get_data('test_data')
    print("Load test data (shape {})".format(X.shape))

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # if pre-trained, the embedding matrix could be loaded and assigned here
        # (disabled in this version):
        # if args.pretrain_emb:
        #     emb_npfn = save_path + args.emb_matrix
        #     emb_matrix = np.load(emb_npfn)['embed_m']
        #     if (emb_matrix.shape[0] != args.vocab_size
        #             or emb_matrix.shape[1] != args.embedding_dim):
        #         print("Imported embedding matrix shape {} does not match ({}, {})..."
        #               .format(emb_matrix.shape, args.vocab_size, args.embedding_dim))
        #         exit(1)
        #     else:
        #         print("Loading embedding matrix.....")
        #         sess.run(rnnmodel.embedding_mat.assign(emb_matrix))

        saver.restore(sess, path)

        test_state = sess.run([init_state])
        test_dict = {X_: X, keep_prob: 1, init_state: test_state}
        test_predict = sess.run(y_predict, feed_dict=test_dict)

        if test_predict > args.threshold:
            print("Predicted result is OHCA.")
        else:
            print("Predicted result is not OHCA.")

        with open(args.outfile, 'w+') as outfile:
            outstr = "%f" % test_predict[0]
            print(outstr)
            outfile.write(outstr)
    return
def cbow_main():
    parser = argparse.ArgumentParser(description='CBOW word embedding')
    # training arguments
    parser.add_argument('--vocab_size', default=50000, type=int)
    parser.add_argument('-emb_dim', '--embedding_dim', default=256, type=int)
    parser.add_argument('--gpu_fraction', default=0.8, type=float)
    parser.add_argument('--skip_window', default=2, type=int)
    parser.add_argument('--num_skips', default=4, type=int)
    parser.add_argument('--batch_size', default=512, type=int)
    parser.add_argument('--learning_rate', default=0.01, type=float)
    parser.add_argument('--log_dir', default='log_embdir/')
    parser.add_argument('--nsteps', default=5000000, type=int)
    # put the model in the same directory
    parser.add_argument('--load_model', default=None)
    parser.add_argument('--load_token', default=None, type=bool)
    parser.add_argument('--save_embed', default='cbowemb.npz')
    args = parser.parse_args()

    mlclass_path = 'data/all_sents.txt'
    script_path = 'data/simu_script.txt'
    pylady_path = 'data/corpusclean_news_pylady.txt'
    pttgossi_path = 'data/ptt_gossiping_201611_post_cleanf.csv'
    save_path = 'token/'

    # token load path
    if args.load_token is not None:
        load_path = os.path.join(save_path)

    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    sess = get_session(args.gpu_fraction)

    # read all data for the tokenizer (train, semi, test)
    dm = DataManager()
    print('Loading training data...')
    dm.add_data('ml_data', mlclass_path, False, False)
    dm.add_data('script_data', script_path, False, False)
    dm.add_data('pylady_data', pylady_path, False, False)
    dm.add_data('pttgossi_data', pttgossi_path, False, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_token is not None:
        # load an existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create a tokenizer from the new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # dictionary mapping sequence IDs back to words
    reverse_word_dict = dict(map(reversed, dm.tokenizer.word_index.items()))

    # CBOW embedding: [skip_window target skip_window]
    # context_size = args.skip_window * 2

    # convert to sequences without pre-padding (lists, not np.array)
    dm.to_sequence_nopad()

    # collect all sequence data into one list
    seq_data = []
    seq_data.extend(dm.get_data('ml_data')[0])
    seq_data.extend(dm.get_data('script_data')[0])
    seq_data.extend(dm.get_data('pylady_data')[0])
    seq_data.extend(dm.get_data('pttgossi_data')[0])

    # create the graph object
    tf.reset_default_graph()

    # Pick a validation set of words to sample nearest neighbors. These variables
    # are only used to display model quality; they don't affect the computation.
    valid_size = 16   # number of words to evaluate similarity on
    valid_window = 100  # only used by the disabled random-sampling alternative
    valid_text = [
        "喘", "呼吸", "白沫", "沒有", "意識", "倒下", "電話", "臉色",
        "起伏", "睡著", "昏倒", "溺水", "清醒", "不", "微弱", "很"
    ]
    valid_examples = np.array([
        words[0] for words in dm.tokenizer.texts_to_sequences(valid_text)
        if len(words) > 0
    ])

    with tf.name_scope('inputs'):
        # placeholders for the training data
        X_ = tf.placeholder(tf.int32, [args.batch_size, args.num_skips], name='X_')
        y_ = tf.placeholder(tf.int32, [args.batch_size, 1], name='y_')
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # embedding
    with tf.name_scope("embeddings"):
        embedding_mat = tf.get_variable('embedding_mat',
                                        [args.vocab_size, args.embedding_dim],
                                        tf.float32,
                                        tf.random_normal_initializer())
        # sum the embeddings of the num_skips context words
        embedding = tf.zeros([args.batch_size, args.embedding_dim])
        for j in range(args.num_skips):
            embedding += tf.nn.embedding_lookup(embedding_mat, X_[:, j])

    with tf.name_scope("softmax"):
        soft_weights = tf.get_variable('soft_weights',
                                       [args.vocab_size, args.embedding_dim],
                                       tf.float32,
                                       tf.random_normal_initializer())
        soft_biases = tf.get_variable('soft_biases',
                                      [args.vocab_size],
                                      tf.float32,
                                      tf.constant_initializer(0.0))

    num_sampled = 64
    # compute the NCE loss
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=soft_weights,
                           biases=soft_biases,
                           labels=y_,
                           inputs=embedding,
                           num_sampled=num_sampled,
                           num_classes=args.vocab_size))
        # add the loss value as a scalar to the summary
        tf.summary.scalar('loss', loss)

    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdagradOptimizer(args.learning_rate).minimize(loss)

    # compute the similarity between the validation words and all embeddings:
    # normalize each row of the embedding matrix by its L2 norm
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_mat), 1, keep_dims=True))
    normalized_embeddings = embedding_mat / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

    # merge all summaries
    merged = tf.summary.merge_all()
    # variable initializer
    init = tf.initialize_all_variables()
    # TensorFlow model saver
    saver = tf.train.Saver(tf.global_variables())
    writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    average_loss = 0.0
    data_index = 0
    seq_index = 0

    with tf.Session() as sess:
        # start training
        sess.run(init)
        for step in range(args.nsteps):
            batch_X, batch_y, data_index, seq_index = generate_batch_cbow(
                seq_data, data_index, seq_index,
                args.batch_size, args.num_skips, args.skip_window)
            feed_dict = {X_: batch_X, y_: batch_y}
            op, lo = sess.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += lo

            if step % 2000 == 0:
                if step > 0:
                    # the average loss is an estimate over the last 2000 batches
                    average_loss = average_loss / 2000
                print('Average loss at step %d: %f' % (step, average_loss))
                average_loss = 0

            # note: this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    try:
                        valid_word = reverse_word_dict[valid_examples[i]]
                    except KeyError:
                        print("Skip word...")
                        continue
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        try:
                            close_word = reverse_word_dict[nearest[k]]
                            log = '%s %s,' % (log, close_word)
                        except KeyError:
                            print("Skip nearest {}-th word".format(k))
                    # print one line per validation word
                    print(log)

        final_embeddings = embedding_mat.eval()
        # save the model checkpoint
        saver.save(sess, os.path.join(args.log_dir, 'embmodel.ckpt'))

    writer.close()

    # save the embedding mapping matrix
    save_fn = save_path + args.save_embed
    np.savez(save_fn, embed_m=final_embeddings)
    return
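# --- Hypothetical sketch (not part of the original source): a minimal
# generate_batch_cbow compatible with the placeholder shapes used above
# (X_: [batch_size, num_skips], y_: [batch_size, 1]). It assumes
# num_skips == 2 * skip_window and that seq_data holds plain Python lists of
# word indices; the handling of sentence boundaries is an assumption.
import numpy as np

def generate_batch_cbow_sketch(seq_data, data_index, seq_index,
                               batch_size, num_skips, skip_window):
    assert num_skips == 2 * skip_window
    batch_X = np.zeros((batch_size, num_skips), dtype=np.int32)
    batch_y = np.zeros((batch_size, 1), dtype=np.int32)
    filled = 0
    while filled < batch_size:
        sent = seq_data[seq_index]
        # skip sentences too short to supply a full context window
        if len(sent) < 2 * skip_window + 1:
            seq_index = (seq_index + 1) % len(seq_data)
            data_index = 0
            continue
        center = data_index + skip_window
        # context words on both sides of the center (target) word
        context = sent[data_index:center] + sent[center + 1:center + skip_window + 1]
        batch_X[filled, :] = context
        batch_y[filled, 0] = sent[center]
        filled += 1
        # advance; move to the next sentence once the window would run off the end
        data_index += 1
        if data_index + 2 * skip_window >= len(sent):
            seq_index = (seq_index + 1) % len(seq_data)
            data_index = 0
    return batch_X, batch_y, data_index, seq_index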