def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'train_corpus':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists('./model/token_25k.pk'):
        dm.save_tokenizer('./model/token_25k.pk')

    # pre-trained embedding weights and padded index sequences
    embedding_w = dm.get_vec_model('emb_1.npy', args.embedding_dim)
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args, embedding_w)
    model.summary()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train_corpus':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/' + '{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            verbose=1,
                            shuffle=True,
                            callbacks=[checkpoint, earlystopping])
        # plot_figure(history)

    # semi-supervised training
    elif args.action == 'semi':
        earlystopping = EarlyStopping(monitor='val_acc', patience=10,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/semi/' + '{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max')
        # single pass over the semi-data (no pseudo-label loop in this variant)
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)

        dm.clean_data()
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, False)
        dm.to_sequence(args.max_length)
        semi_X, semi_Y = dm.get_semi_data('test_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        print('-- semi_data size: %d' % (len(semi_X)))

        # re-initialize and train on labeled + pseudo-labeled data
        model = simpleRNN(args, embedding_w)
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=40,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        plot_figure(history)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'test':
        dm.add_data('test_data', test_path, False)
    else:
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.action == 'token':
        dm.tokenize()
    else:
        # read existing tokenizer
        dm.load_tokenizer(args.token)
    '''else:
        # create tokenizer on new data
        dm.tokenize()'''
    dm.save_tokenizer(args.token)

    # convert to sequences
    if args.action != 'token':
        dm.to_sequence(args.max_length)

    # initial model
    if args.action != 'token':
        print('initial model...')
        model = simpleRNN(args)
        print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=11,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')[0]
        predict = model.predict(X)
        result = [['id', 'label']]
        for i in range(len(predict)):
            a = [i]
            if predict[i][0] > 0.5:
                a.append(1)
            else:
                a.append(0)
            # a.append(predict[i][0])  # test
            # a.append(predict[i])
            result.append(a)
        cout = csv.writer(open(args.result_path, 'w'))
        cout.writerows(result)
        # implement after confirming the output format

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=11,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times (loop commented out, so this runs a single pass)
        # for i in range(10):
        # label the semi-data
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        # print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=20,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        # reload the best checkpoint after training
        if os.path.exists(save_path):
            print('load model from %s' % save_path)
            model.load_weights(save_path)
        else:
            raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    # save_path = os.path.join(args.save_dir, args.model)
    save_path = args.save_dir  # used below when saving the word2vec model and checkpoints
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)
        print('load_path:', load_path)

    ##### read data #####
    dm = DataManager()
    w2v_path = os.path.join(args.save_dir, 'word2vec')
    print(w2v_path)

    if args.action == 'train':
        print('Loading data...')
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', args.test_path)
        test_data = dm.get_test_data('test_data')
        train_data = dm.get_data('train_data')
        semi_data = dm.get_data('semi_data')

        all_text = np.concatenate((train_data[0], semi_data[0], test_data), axis=0)
        print('Number of all_text:', all_text.shape[0])
        # print('Text sample:', all_text[0])

        print('Converting texts to word sequences...')
        text2word = []
        with_filter = 0
        if with_filter:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(
                        text,
                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                        lower=True, split=" "))
        if not with_filter:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(text, filters='', lower=True, split=" "))
        print('Word sequence sample:', text2word[0])

        # word2vec model: load it if it exists, otherwise train and save it
        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
        else:
            print('Building word2vec model...')
            word_vec = gensim.models.Word2Vec(text2word, size=128, min_count=15)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            if not os.path.isdir(save_path):
                os.makedirs(save_path)
            if not os.path.exists(os.path.join(save_path, 'word2vec')):
                word_vec.save(os.path.join(save_path, 'word2vec'))

        print('Converting train_data to vectors...')
        index_data = []
        i = 0
        for line in train_data[0]:
            index_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    # print(word, word_vec.wv.vocab[word].index)
                    index_data[i].append(word_vec.wv.vocab[word].index)
            i += 1

        embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))
        for i in range(len(word_vec.wv.vocab)):
            embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        index_data = pad_sequences(index_data, args.max_length)
    else:
        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))
            for i in range(len(word_vec.wv.vocab)):
                embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
        else:
            print('Cannot load the w2v model, please train the w2v model first!')

    # print('get Tokenizer...')
    # if args.load_model is not None:
    #     # read existing tokenizer
    #     dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    # else:
    #     # create tokenizer on new data
    #     dm.tokenize(args.vocab_size)
    #
    # if not os.path.isdir(save_path):
    #     os.makedirs(save_path)
    # if not os.path.exists(os.path.join(save_path, 'token.pk')):
    #     dm.save_tokenizer(os.path.join(save_path, 'token.pk'))
    #
    # mat_train_data = dm.tokenizer.texts_to_matrix(train_data[0], mode='count')
    # mat_test_data = dm.tokenizer.texts_to_matrix(test_data, mode='count')

    # convert to sequences
    # dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    # model = bow_model(args, mat_train_data)
    model = simpleRNN(args, embedding_matrix)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        # path = os.path.join(load_path, 'model.h5')
        if os.path.exists(load_path):
            print('load model from %s' % load_path)
            model.load_weights(load_path)
        else:
            raise ValueError("Can't find the file %s" % load_path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        # (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        X, X_val, Y, Y_val = train_test_split(index_data, train_data[1],
                                              test_size=0.33, random_state=42)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        print(history.history.keys())
        print('Val_acc:', history.history['val_acc'])
        print('Train_acc:', history.history['acc'])

    # testing
    elif args.action == 'test':
        dm.add_test_data('test_data', args.test_path)
        test_data = dm.get_test_data('test_data')

        # convert to index vectors
        index_test_data = []
        i = 0
        for line in test_data:
            index_test_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    # print(word, word_vec.wv.vocab[word].index)
                    index_test_data[i].append(word_vec.wv.vocab[word].index)
            i += 1
        index_test_data = pad_sequences(index_test_data, args.max_length)

        if not os.path.exists(args.result_path):
            os.makedirs(args.result_path)
        csv_path = os.path.join(args.result_path, 'prediction.csv')

        print("Predicting testing data...")
        Y_pred = model.predict(index_test_data)
        Y_pred = np.round(Y_pred)
        print('Saving result csv to', csv_path)
        with open(csv_path, 'w') as f:
            f.write('id,label\n')
            for i, v in enumerate(Y_pred):
                f.write('%d,%d\n' % (i, v))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 5 times
        for i in range(5):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=256,
                                callbacks=[checkpoint, earlystopping])
            # reload the best checkpoint before the next iteration
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
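# --- Illustrative sketch (not part of the original scripts) -----------------
# The word2vec variant above builds `embedding_matrix` from gensim and hands
# it to simpleRNN(). The layer stack of simpleRNN is defined elsewhere and is
# not shown here; the snippet below is only a minimal sketch of how a
# pre-trained gensim matrix is commonly wired into a Keras Embedding layer.
# The GRU/Dense sizes and dropout values are assumptions, not the author's
# confirmed architecture.
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense


def build_rnn_sketch(embedding_matrix, max_length):
    vocab_size, emb_dim = embedding_matrix.shape
    model = Sequential()
    # Freeze the pre-trained vectors so training only updates the RNN/Dense weights.
    model.add(Embedding(vocab_size, emb_dim,
                        weights=[embedding_matrix],
                        input_length=max_length,
                        trainable=False))
    model.add(GRU(128, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model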
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)
        # raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)
    # dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        plot(history, args.model)
        # plot_model(model, to_file='./img/structure.png')

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')
        print('Predict testing data...')
        result = model.predict(X)
        print('Save result...')
        saveResult(result, args.result_path)
        # raise Exception('Implement your testing function')

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            # reload the best checkpoint before the next iteration
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = args.save_dir
    if args.load_model is not None:
        load_path = args.save_dir

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', test_path)
    else:
        dm.add_test_data('test_data', test_path)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        print(model.summary())
        [test_x] = dm.get_data('test_data')
        classes = model.predict(test_x, batch_size=32)
        with open(args.output_path, "w", encoding='utf-8') as f:
            spamwriter = csv.writer(f, delimiter=',')
            spamwriter.writerow(['id', 'label'])
            for i in range(len(classes)):
                if classes[i][0] < 0.5:
                    result = 0
                else:
                    result = 1
                spamwriter.writerow([str(i), str(result)])

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        [test_x] = dm.get_data('test_data')
        # also treat the unlabeled test set as semi-supervised data
        semi_all_X = np.concatenate((semi_all_X, test_x), axis=0)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 16 times
        for i in range(16):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            # reload the best checkpoint before the next iteration
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    dm = DataManager()
    dm.add_data('train_data', train_path, True)
    dm.add_data('semi_data', semi_path, False)

    print('Get Tokenizer...')
    dm.load_tokenizer('./token/token.pk')
    embedding_mat = dm.to_sequence(40, action)

    print('Initial model...')
    if action == 'train':
        model = RNN(embedding_mat)
        print(model.summary())
    elif action == 'semi':
        model = load_model('./model/model1.hdf5')
        print(model.summary())

    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        earlystopping = EarlyStopping(monitor='val_acc', patience=30,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        model.fit(X, Y,
                  validation_data=(X_val, Y_val),
                  epochs=80,
                  batch_size=512,
                  callbacks=[checkpoint, earlystopping])

    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model_semi.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        for i in range(10):
            # label the semi-data with the current model
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=1)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred, 0.1)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            model.fit(semi_X, semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=512,
                      callbacks=[checkpoint, earlystopping])
            # reload the best checkpoint before the next iteration
            print('load model from ./model/model_semi.hdf5')
            model = load_model('./model/model_semi.hdf5')
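# --- Illustrative sketch (not part of the original scripts) -----------------
# Every semi-supervised branch above relies on dm.get_semi_data() to turn the
# model's predictions on unlabeled text into pseudo-labels. That helper lives
# in the DataManager class, which is not shown here; the function below is
# only a hedged sketch of the usual thresholding logic (keep confident
# predictions, assign hard 0/1 labels), not the author's exact implementation.
import numpy as np


def pseudo_label_sketch(semi_X, semi_pred, threshold=0.1):
    """Return (X, Y) for samples whose prediction is confidently 0 or 1.

    A prediction p is kept if p >= 1 - threshold (label 1) or
    p <= threshold (label 0); everything in between is discarded.
    """
    p = semi_pred.reshape(-1)
    keep = (p >= 1 - threshold) | (p <= threshold)
    X = semi_X[keep]
    Y = (p[keep] >= 0.5).astype('int32')
    return X, Y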