def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action must be one of: train, semi, test')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read the existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create a tokenizer on the new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initialize model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('args.action is %s' % args.action)
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        save_path = os.path.join(save_path, 'model.h5')
        # checkpoint/early-stopping callbacks are disabled for this run:
        # earlystopping = EarlyStopping(monitor='val_loss', patience=3,
        #                               verbose=1, mode='min')
        # checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
        #                              save_best_only=True,
        #                              save_weights_only=True,
        #                              monitor='val_loss', mode='min')

        # the model takes three inputs: tweets, snippets, and target entities
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]
        print("tweets shape =", tweets.shape)
        print("snippets shape =", snippets.shape)
        print("targets shape =", targets.shape)
        print("Y shape =", Y.shape)

        history = model.fit([tweets, snippets, targets], Y,
                            validation_data=([X_val[0, :], X_val[1, :],
                                              X_val[2, :]], Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size)
        predictions = model.predict([tweets, snippets, targets])
        model.save(save_path)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]

        predictions = model.predict([tweets, snippets, targets])
        predictions = predictions.reshape(-1)  # fixed typo: was 'preidctions'
        scores = model.evaluate([tweets, snippets, targets], Y)
        print("test data mse by keras = %f" % scores[1])
        print("test data mse by sklearn = %f"
              % mean_squared_error(Y, predictions))

        # map regression outputs and labels to the sign classes {-1, 0, 1}
        for idx, value in enumerate(predictions):
            if value > 0:
                predictions[idx] = 1
            elif value < 0:
                predictions[idx] = -1
        for idx, value in enumerate(Y):
            if value > 0:
                Y[idx] = 1
            elif value < 0:
                Y[idx] = -1
        print("test data micro f1 score by sklearn = %f"
              % f1_score(Y, predictions, average='micro'))
        print("test data macro f1 score by sklearn = %f"
              % f1_score(Y, predictions, average='macro'))

        # repeat the evaluation on the training data
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]
        predictions = model.predict([tweets, snippets, targets])
        predictions = predictions.reshape(-1)
        scores = model.evaluate([tweets, snippets, targets], Y)
        print("train data mse by keras = %f" % scores[1])
        print("train data mse by sklearn = %f"
              % mean_squared_error(Y, predictions))
        for idx, value in enumerate(predictions):
            if value > 0:
                predictions[idx] = 1
            elif value < 0:
                predictions[idx] = -1
        for idx, value in enumerate(Y):
            if value > 0:
                Y[idx] = 1
            elif value < 0:
                Y[idx] = -1
        print("train data micro f1 score by sklearn = %f"
              % f1_score(Y, predictions, average='micro'))
        print("train data macro f1 score by sklearn = %f"
              % f1_score(Y, predictions, average='macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        # note: val_loss should be minimized, so mode is 'min' (was 'max')
        earlystopping = EarlyStopping(monitor='val_loss', patience=3,
                                      verbose=1, mode='min')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss', mode='min')
        # self-training: repeat 10 times
        for i in range(10):
            # label the semi-supervised data with the current model
            semi_pred = model.predict(semi_all_X, batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train on the labeled + pseudo-labeled data
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                # fixed: this previously referenced an undefined 'path'
                raise ValueError("Can't find the file %s" % save_path)
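# The sign-thresholding loops above can be collapsed into one vectorized
# call. A minimal sketch, assuming `predictions` and `Y` are 1-D numpy
# float arrays (as after reshape(-1)); `to_sign_classes` is an
# illustrative helper, not part of the original code:
import numpy as np

def to_sign_classes(values):
    # np.sign maps each element to -1, 0, or 1
    return np.sign(values).astype(int)

# usage:
#   predictions = to_sign_classes(predictions)
#   Y = to_sign_classes(Y)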
def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        # test data is loaded too so the word2vec corpus also covers it
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action must be one of: train, semi, test')

    # the Keras tokenizer is replaced by word2vec in this variant:
    # print('get Tokenizer...')
    # if args.load_model is not None:
    #     dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    # else:
    #     dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    # if not os.path.exists(os.path.join(save_path, 'token.pk')):
    #     dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to token sequences and build/load the word2vec embedding
    token_corpus = dm.to_token_corpus(args.max_length)
    if args.action == 'train':
        word2vec = to_word2vec(token_corpus)
        save_path_word2vec_model = os.path.join(save_path, 'word2vec.model')
        word2vec.save(save_path_word2vec_model)
    elif args.action == 'test':
        path = os.path.join(load_path, 'word2vec.model')
        if os.path.exists(path):
            print('load model from %s' % path)
            word2vec = Word2Vec.load(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    # note: the 'semi' action is not handled here, so word2vec would be
    # undefined on the next line in that case
    word2vec = word2vec.wv

    # pad sentences and map words to their word2vec embeddings
    dm.padding_sent(args.max_length)
    dm.sent_to_word2vec(word2vec)

    # initialize model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('args.action is %s' % args.action)
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        save_path_model_h5 = os.path.join(save_path, 'model.h5')
        # checkpoint/early-stopping callbacks are disabled for this run
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size)
        model.save(save_path_model_h5)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        predictions = model.predict(X)
        predictions = predictions.reshape(-1)
        scores = model.evaluate(X, Y)
        print("test data mse by keras = %f" % scores[1])
        print("test data mse by sklearn = %f"
              % mean_squared_error(Y, predictions))
        # map regression outputs and labels to the sign classes {-1, 0, 1}
        for idx, value in enumerate(predictions):
            if value > 0:
                predictions[idx] = 1
            elif value < 0:
                predictions[idx] = -1
        for idx, value in enumerate(Y):
            if value > 0:
                Y[idx] = 1
            elif value < 0:
                Y[idx] = -1
        print("test data micro f1 score by sklearn = %f"
              % f1_score(Y, predictions, average='micro'))
        print("test data macro f1 score by sklearn = %f"
              % f1_score(Y, predictions, average='macro'))

        # repeat the evaluation on the training data
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        predictions = model.predict(X)
        predictions = predictions.reshape(-1)
        scores = model.evaluate(X, Y)
        print("train data mse by keras = %f" % scores[1])
        print("train data mse by sklearn = %f"
              % mean_squared_error(Y, predictions))
        for idx, value in enumerate(predictions):
            if value > 0:
                predictions[idx] = 1
            elif value < 0:
                predictions[idx] = -1
        for idx, value in enumerate(Y):
            if value > 0:
                Y[idx] = 1
            elif value < 0:
                Y[idx] = -1
        print("train data micro f1 score by sklearn = %f"
              % f1_score(Y, predictions, average='micro'))
        print("train data macro f1 score by sklearn = %f"
              % f1_score(Y, predictions, average='macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        # note: val_loss should be minimized, so mode is 'min' (was 'max')
        earlystopping = EarlyStopping(monitor='val_loss', patience=3,
                                      verbose=1, mode='min')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss', mode='min')
        # self-training: repeat 10 times
        for i in range(10):
            # label the semi-supervised data with the current model
            semi_pred = model.predict(semi_all_X, batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train on the labeled + pseudo-labeled data
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                # fixed: this previously referenced an undefined 'path'
                raise ValueError("Can't find the file %s" % save_path)
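# `to_word2vec` is called above but not defined in this file. A minimal
# sketch of what it might look like with the gensim 3.x API (gensim 4
# renames `size` to `vector_size`); the hyperparameters are illustrative,
# not the project's actual settings, and `token_corpus` is assumed to be
# a list of token lists:
from gensim.models import Word2Vec

def to_word2vec(token_corpus, size=128, window=5, min_count=1):
    # train a skip-gram (sg=1) word2vec model over the tokenized corpus
    return Word2Vec(sentences=token_corpus, size=size, window=window,
                    min_count=min_count, sg=1, workers=4)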
def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read the existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create a tokenizer on the new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initialize model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()  # summary() prints itself; wrapping it in print() shows 'None'

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        # create a LossHistory instance to record loss/accuracy
        history = LossHistory()
        hist = model.fit(X, Y,
                         validation_data=(X_val, Y_val),
                         epochs=args.nb_epoch,
                         batch_size=args.batch_size,
                         callbacks=[checkpoint, earlystopping, history])
        # plot the acc-loss curve
        history.loss_plot('epoch')

    # testing
    elif args.action == 'test':
        id = dm.data['test_data'][1]
        out = model.predict(dm.data['test_data'][0])
        out = np.squeeze(out)
        # binarize sigmoid outputs at 0.5
        out[out <= 0.5] = 0
        out[out > 0.5] = 1
        out = out.astype(int)
        print("pred shape:", np.array(out).shape)
        print("id shape:", np.array(id).shape)
        result = pd.concat([pd.DataFrame({'id': id}),
                            pd.DataFrame({'sentiment': out})], axis=1)
        result.to_csv("submission.csv", index=False)
        newZip = zipfile.ZipFile('submission.zip', 'w')
        newZip.write('submission.csv', compress_type=zipfile.ZIP_DEFLATED)
        newZip.close()

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc', mode='max')
        # self-training: repeat 10 times
        for i in range(10):
            # label the semi-supervised data with the current model
            semi_pred = model.predict(semi_all_X, batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            history = LossHistory()
            # train on the labeled + pseudo-labeled data
            hist = model.fit(semi_X, semi_Y,
                             validation_data=(X_val, Y_val),
                             epochs=2,
                             batch_size=args.batch_size,
                             callbacks=[checkpoint, earlystopping, history])
            history.loss_plot('epoch')
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                # fixed: this previously referenced an undefined 'path'
                raise ValueError("Can't find the file %s" % save_path)
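# `LossHistory` is referenced above but not defined in this file. A minimal
# sketch of a Keras callback with the interface this code assumes: it
# records loss/accuracy per batch and per epoch, and loss_plot('epoch')
# draws the acc-loss curve with matplotlib. The log keys ('acc',
# 'val_acc') match the Keras 2.x era this code targets:
import keras
import matplotlib.pyplot as plt

class LossHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        self.losses = {'batch': [], 'epoch': []}
        self.accuracy = {'batch': [], 'epoch': []}
        self.val_loss = {'batch': [], 'epoch': []}
        self.val_acc = {'batch': [], 'epoch': []}

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        self.losses['batch'].append(logs.get('loss'))
        self.accuracy['batch'].append(logs.get('acc'))

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.losses['epoch'].append(logs.get('loss'))
        self.accuracy['epoch'].append(logs.get('acc'))
        self.val_loss['epoch'].append(logs.get('val_loss'))
        self.val_acc['epoch'].append(logs.get('val_acc'))

    def loss_plot(self, loss_type):
        # plot accuracy and loss over batches or epochs
        iters = range(len(self.losses[loss_type]))
        plt.figure()
        plt.plot(iters, self.accuracy[loss_type], 'r', label='train acc')
        plt.plot(iters, self.losses[loss_type], 'g', label='train loss')
        if loss_type == 'epoch':
            plt.plot(iters, self.val_acc[loss_type], 'b', label='val acc')
            plt.plot(iters, self.val_loss[loss_type], 'k', label='val loss')
        plt.grid(True)
        plt.xlabel(loss_type)
        plt.ylabel('acc-loss')
        plt.legend(loc='upper right')
        plt.show()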
def main():
    # limit GPU memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action must be one of: train, semi, test')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read the existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create a tokenizer on the new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # prepare the pre-trained GloVe embedding matrix
    embedding_matrix = preEB(dm)

    # initialize model
    print('initial model...')
    model = simpleRNN(args, embedding_matrix, dm.tokenizer.word_index)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('args.action is %s' % args.action)
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        save_path = os.path.join(save_path, 'model.h5')
        # checkpoint/early-stopping callbacks are disabled for this run
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size)
        model.save(save_path)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        pred = model.predict(X)
        scores = model.evaluate(X, Y)
        print("test data scores (loss = mse) = %f" % scores[1])
        print("mse: ", evaluation(pred, Y, 'mse'))
        print("micro: ", evaluation(pred, Y, 'f1_micro'))
        print("macro: ", evaluation(pred, Y, 'f1_macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        # note: val_loss should be minimized, so mode is 'min' (was 'max')
        earlystopping = EarlyStopping(monitor='val_loss', patience=3,
                                      verbose=1, mode='min')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path, verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss', mode='min')
        # self-training: repeat 10 times
        for i in range(10):
            # label the semi-supervised data with the current model
            semi_pred = model.predict(semi_all_X, batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train on the labeled + pseudo-labeled data
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                # fixed: this previously referenced an undefined 'path'
                raise ValueError("Can't find the file %s" % save_path)
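# `preEB` is called above but not defined in this file. A minimal sketch of
# building a GloVe embedding matrix from the fitted tokenizer, assuming a
# standard glove.6B-style text file; the path and dimension below are
# hypothetical, and the actual preEB may differ:
import numpy as np

def preEB(dm, glove_path='glove.6B.100d.txt', embedding_dim=100):
    # read GloVe vectors into a word -> vector dict
    embeddings_index = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            embeddings_index[values[0]] = np.asarray(values[1:],
                                                     dtype='float32')
    # rows follow the tokenizer's word index; unknown words stay all-zero
    word_index = dm.tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix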