def main():
    params = {'batch_size': 64}
    modelname = argv[1]

    # Datasets
    partition = load_partition()
    print(len(partition['train']))
    print(len(partition['validation']))
    training_generator = DataGenerator(partition['train'], **params)
    validation_generator = DataGenerator(partition['validation'], **params)

    dm = DataManager()
    dm.load_tokenizer('/mnt/data/b04901058/recsys/token0_Xfull.pk')
    word_index, embedding_matrix = dm.embedding_matrix()

    cnn_model = cnn0(word_index, embedding_matrix)
    cnn_model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

    checkpoint = [
        ModelCheckpoint(
            modelname,            # model filename
            monitor='val_loss',   # quantity to monitor
            verbose=0,            # verbosity - 0 or 1
            save_best_only=True,  # the latest best model will not be overwritten
            mode='auto'),         # decision to overwrite is made automatically
        EarlyStopping(monitor='val_loss', patience=3, verbose=0)
    ]

    cnn_model.fit_generator(generator=training_generator,
                            validation_data=validation_generator,
                            callbacks=checkpoint,
                            verbose=1,
                            use_multiprocessing=True,
                            epochs=12,
                            workers=3)
def main():
    """
    Main function of test.py

    Arguments:
        modelname: String, name of the model
        datapath: the testing file
        subtask: String, "A" or "B" or "C"

    Outputs:
        subtask + [subtask]/result/[modelname]/res.pred
    """
    modelname = args.modelname
    datapath = args.datapath
    subtask = args.subtask

    dm = DataManager(subtask)
    dm.load_tokenizer(
        os.path.join("subtask" + subtask, "models", modelname, "word2idx.pkl"),
        os.path.join("subtask" + subtask, "models", modelname, "idx2word.pkl"))
    dm.add_data("test", datapath)
    dm.to_sequence(40, 40)

    (test_Q, test_C), qidlist = dm.get_data("test")
    print("test_Q", test_Q[0:2])
    print("test_C", test_C[0:2])
    print("qidlist", qidlist[0:2])

    model = load_model(
        os.path.join("subtask" + subtask, "models", modelname, "model.h5"))
    result = model.predict([test_Q, test_C], batch_size=128, verbose=1)
    print("result", result[0:2])

    if subtask == "A":
        outputA(qidlist, result, modelname)
    elif subtask == "B":
        outputB(qidlist, result, modelname)
    elif subtask == "C":
        outputC(qidlist, result, modelname)
def main():
    path_pfx = ''
    max_len = 37

    dm = DataManager()
    dm.add_data('test', os.path.join(sys.argv[1]), False, True)
    print(len(dm.data['test'][0]))
    dm.preprocessing()
    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    #dm.load_tokenizer(os.path.join(path_pfx, 'token.pkl'))
    dm.to_sequence(max_len, use_pretrain=True)

    result = predict(dm.data['test'][0], path_pfx)
    write(sys.argv[2], result)
    print('finished')
class input:
    path = DataManager.get_path()
    if os.path.exists(path + 'test_images/'):
        path += 'test_images/'
    else:
        path += 'train_images/'
    print("Path to images: ", path)

    if glob.glob(path + '*.jpeg'):
        tiff_format = False
    else:
        tiff_format = True
    print("Image tiff-format: ", tiff_format)

    tiff_level = 1         # only if tiff_format is True
    resize_ratio = 1       # 1 (N x N) or 2 (-> N//2 x N//2)
    input_shape = (1280, 1280, 3)
    patch_size = 256
    sample_size = 25
    preprocess_mode = 'float'
    objective = 'bce'
    label_smoothing = 0.0  # only if objective is 'cce'
def main():
    voc_size = None
    max_len = 39
    path_pfx = ''

    dm = DataManager()
    dm.add_data('train', sys.argv[1])
    #dm.add_data('semi', os.path.join(path_pfx, 'training_nolabel.txt'), False)
    #dm.add_data('test', os.path.join(path_pfx, 'testing_data.txt'), False, True)
    dm.preprocessing()
    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    #dm.load_embedding_matrix(os.path.join(path_pfx, 'word2vec.wv.vectors.npy'))
    dm.to_sequence(max_len, use_pretrain=True)
    #dm.to_bow()
    print(max_len)

    #emb_mat = dm.get_embedding_matrix()
    emb_mat = None
    train(dm, voc_size=voc_size, max_len=max_len, emb_mat=emb_mat)
def main(argv):
    filename = argv[1]
    output_path = argv[2]
    output_path = output_path.replace('\r', '')
    output_path = output_path.replace('\r\n', '')

    dm = DataManager()
    dm.add_data('test_data', filename, False)
    dm.load_tokenizer('./model/token_25k.pk')
    dm.to_sequence(40)

    model = load_model('./model/00017-0.82720.h5')
    model.summary()

    val_proba = model.predict(dm.data['test_data'])
    val_classes = [1 if value > 0.5 else 0 for value in val_proba]

    out = pd.DataFrame(val_classes, columns=['label'])
    out.to_csv(output_path, index_label='id')
def argument_parser(L):
    token = L[1]

    dm = DataManager()
    dm.add_data('data/data.csv')
    X = dm.get_data('data')
    Y = dm.get_data('label')
    data = X[0]
    label = Y[0]

    logpath = os.path.join('log')
    if not os.path.exists(logpath):
        os.makedirs(logpath)

    if token == 'LinR':
        MSE, MAE = train(data, label, token)
        with open('log/LinR.csv', 'w') as f:
            f.write('MSE,MAE\n')
            f.write('{},{}\n'.format(MSE, MAE))
    else:
        bin_size = int(L[2])
        acc, pre, rec, f_score = train(data, label, token, bin_size=bin_size)
        with open('log/' + token + '-bins-' + str(bin_size) + '.csv', 'w') as f:
            f.write('accuracy,precision,recall,f-score\n')
            f.write('{},{},{},{}\n'.format(acc, pre, rec, f_score))
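# A minimal invocation sketch (an assumption, not part of the original file):
# the argument list L is taken to be sys.argv, so L[1] is the model token
# ('LinR' for the regression branch above) and L[2] the optional bin size.
if __name__ == '__main__':
    import sys
    argument_parser(sys.argv)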
def get_similar(self):
    return dm.get_model('model')
def get_ref_files(self):
    return dm.get_csv('indices', index=0), dm.get_csv('titles')
fold = Config.train.fold
batch_size = Config.train.batch_size
epochs = Config.train.epochs
lr_max = Config.train.learning_rate.max
lr_min = Config.train.learning_rate.min
lr_decay_epochs = Config.train.learning_rate.decay_epochs
lr_warmup_epochs = Config.train.learning_rate.warmup_epochs
lr_power = Config.train.learning_rate.power
units = Config.model.units
dropout = Config.model.dropout
activation = Config.model.activation

train_data, valid_data = DataManager.get_train_data(split=True,
                                                    test_size=test_size,
                                                    random_state=random_state,
                                                    add_image_size_info=True)
lr_steps_per_epoch = math.ceil(len(train_data) / Config.train.batch_size)

train_dataset = get_dataset(
    dataframe=train_data,
    input_path=input_path,
    batch_size=batch_size,
    training=True,
    augment='heavy',
    tta=1,
    input_size=input_shape,
    objective=objective,
    buffer_size=8192,
    cache=False,
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'test':
        dm.add_data('test_data', test_path, False)
    else:
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.action == 'token':
        dm.tokenize()
    else:
        # read existing tokenizer
        dm.load_tokenizer(args.token)
    '''else:
        # create tokenizer on new data
        dm.tokenize()'''
    dm.save_tokenizer(args.token)

    # convert to sequences
    if args.action != 'token':
        dm.to_sequence(args.max_length)

    # initial model
    if args.action != 'token':
        print('initial model...')
        model = simpleRNN(args)
        print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=11,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')[0]
        predict = model.predict(X)
        result = [['id', 'label']]
        for i in range(len(predict)):
            a = [i]
            if predict[i][0] > 0.5:
                a.append(1)
            else:
                a.append(0)
            #a.append(predict[i][0])  # test
            #a.append(predict[i])
            result.append(a)
        cout = csv.writer(open(args.result_path, 'w'))
        cout.writerows(result)  # implement after ensuring output format

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=11,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        #for i in range(10):
        # label the semi-data
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        #print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=20,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        if os.path.exists(save_path):
            print('load model from %s' % save_path)
            model.load_weights(save_path)
        else:
            raise ValueError("Can't find the file %s" % save_path)
def main():
    dm = DataManager()
    dm.add_data('train_data', train_path, True)
    dm.add_data('semi_data', semi_path, False)

    print('Get Tokenizer...')
    dm.load_tokenizer('./token/token.pk')
    embedding_mat = dm.to_sequence(40, action)

    print('Initial model...')
    if action == 'train':
        model = RNN(embedding_mat)
        print(model.summary())
    elif action == 'semi':
        model = load_model('./model/model1.hdf5')
        print(model.summary())

    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        earlystopping = EarlyStopping(monitor='val_acc', patience=30,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        model.fit(X, Y,
                  validation_data=(X_val, Y_val),
                  epochs=80,
                  batch_size=512,
                  callbacks=[checkpoint, earlystopping])

    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model_semi.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        for i in range(10):
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=1)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred, 0.1)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            model.fit(semi_X, semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=512,
                      callbacks=[checkpoint, earlystopping])
            print('load model from ./model/model_semi.hdf5')
            model = load_model('./model/model_semi.hdf5')
def main():
    # limit gpu memory usage
    train_path = argv[1]
    semi_path = argv[2]
    #K.set_session(get_session(gpu_fraction))

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if action == 'train':
        dm.add_data('train_data', train_path, True)
        #dm.add_data('semi_data', semi_path, False)
    elif action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if not os.path.exists(tokenizer_save_path):
        dm.tokenize(20000)
        dm.save_tokenizer(tokenizer_save_path)
    else:
        dm.load_tokenizer(tokenizer_save_path)

    # Word2Vec
    print('get Word2Vec...')
    data_dic = dm.get_data()
    tokenizer = dm.get_tokenizer()
    #vocab_size = len(tokenizer.word_index) + 1
    #data_list = data_dic['train_data'][2] + data_dic['semi_data'][1]
    #data_list = data_dic['train_data']
    #w2v_model = Word2Vec(data_list, size=256, min_count=5, iter=16, workers=16)
    #w2v_model.save(word2vec_save_path)
    #w2v_model = Word2Vec.load(word2vec_save_path)
    w2v_model = pk.load(open('emb.pkl', 'rb'))

    # convert to sequences
    dm.to_sequence(max_length)
    #dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN()
    print(model.summary())

    labelnum = []

    # training
    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', val_ratio)
        X = embedding_vector(X, w2v_model, tokenizer)
        X_val = embedding_vector(X_val, w2v_model, tokenizer)
        earlystopping = EarlyStopping(monitor='val_acc', patience=15,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath=model_save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=nb_epoch,
                            batch_size=batch_size,
                            callbacks=[checkpoint, earlystopping])

    # semi-supervised training
    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', val_ratio)
        semi_all_X = dm.get_data()['semi_data'][0]
        X = embedding_vector(X, w2v_model, tokenizer)
        X_val = embedding_vector(X_val, w2v_model, tokenizer)
        semi_all_X = embedding_vector(semi_all_X, w2v_model, tokenizer)
        X = np.array(X)
        X_val = np.array(X_val)
        semi_all_X = np.array(semi_all_X)
        earlystopping = EarlyStopping(monitor='val_acc', patience=5,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath=model_save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = getsemidata(semi_all_X, semi_pred, threshold)
            labelnum.append(semi_X.shape)
            semi_X = np.concatenate((semi_X, X), axis=0)
            semi_Y = np.concatenate((semi_Y, Y), axis=0)
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(model_save_path):
                print('load model from %s' % model_save_path)
                model.load_weights(model_save_path)
            else:
                raise ValueError("Can't find the file %s" % model_save_path)
def new_process_xy(tokenpath, path2x, path2y):
    dm = DataManager()
    dm.add_data('seed', '0samples.csv')
    dm.add_data('truth', '0samples.csv')
    dm.tokenize(230000)           # vocab size
    dm.save_tokenizer(tokenpath)
    dm.to_sequence(1)             # max length
    dm.save_sequence(path2x)
    dm.tosave_label(path2y)
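# A minimal usage sketch (assumption: the three paths below are hypothetical
# placeholders chosen for illustration; new_process_xy writes the fitted
# tokenizer, the padded sequences, and the labels to whatever paths it is given).
if __name__ == '__main__':
    new_process_xy(tokenpath='token/token.pk',  # pickled tokenizer output
                   path2x='data/x.npy',         # saved sequence output
                   path2y='data/y.npy')         # saved label output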
import sys
import keras
import _pickle as pk
import numpy as np
from keras.models import Model, Sequential, load_model
from util import DataManager

# argv settings
test_path = sys.argv[1]
output_path = sys.argv[2]
mode = sys.argv[3]

# load data
dm = DataManager()
dm.add_data('test_data', test_path, False)

if mode == 'private':
    # tokenizer
    dm.load_tokenizer('./token/token.pk')
    # load model
    model = load_model('./model/model1.hdf5')
elif mode == 'public':
    # tokenizer
    dm.load_tokenizer('./token/token_filter.pk')
    # load model
    model = load_model('./model/model2.hdf5')

dm.to_sequence(40, 'test')
test_all_x = dm.get_data('test_data')
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'train_corpus':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists('./model/token_25k.pk'):
        dm.save_tokenizer('./model/token_25k.pk')

    embedding_w = dm.get_vec_model('emb_1.npy', args.embedding_dim)

    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args, embedding_w)
    model.summary()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train_corpus':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(
            filepath='./model/' + '{epoch:05d}-{val_acc:.5f}.h5',
            verbose=1,
            save_best_only=True,
            save_weights_only=False,
            monitor='val_acc',
            mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            verbose=1,
                            shuffle=True,
                            callbacks=[checkpoint, earlystopping])
        # plot_figure(history)

    # semi-supervised training
    elif args.action == 'semi':
        earlystopping = EarlyStopping(monitor='val_acc', patience=10,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(
            filepath='./model/semi/' + '{epoch:05d}-{val_acc:.5f}.h5',
            verbose=1,
            save_best_only=True,
            save_weights_only=False,
            monitor='val_acc',
            mode='max')
        # label the semi-data with the current model
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)

        dm.clean_data()
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, False)
        dm.to_sequence(args.max_length)

        semi_X, semi_Y = dm.get_semi_data('test_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        print('-- semi_data size: %d' % (len(semi_X)))

        model = simpleRNN(args, embedding_w)
        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=40,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        plot_figure(history)
import torch
from torch.utils.data import DataLoader
from util import DataManager, AutoEncoder, AEDataset
import argparse

parser = argparse.ArgumentParser(description='DLCV HW5')
#parser.add_argument('-p', '--problem', dest='problem', type=int, required=True)
args = parser.parse_args()

TENSORBOARD_DIR = './runs/train'
dm = DataManager(tensorboard_dir=TENSORBOARD_DIR)

EPOCH = 50
BATCH_SIZE = 128
LABEL_DIM = 11
DROPOUT = 0.5
LEARNING_RATE = 1E-3
PRETRAIN = True
OUTPUT_PATH = './model/pretrained.pt'
OUTPUT_CHARACTER = 'data/character.txt'

train_path = ['./data/trainx.npy', './data/trainy.npy']
val_path = ['./data/valx.npy', './data/valy.npy']

val_data = dm.readfile('./dataset/val', './dataset/val_id.txt',
                       save_path=val_path)
train_data = dm.readfile('./dataset/train/', './dataset/train_id.txt',
                         save_path=train_path)
#dm.character.save(OUTPUT_CHARACTER)
import jieba
jieba.dt.cache_file = 'jieva.cache.new'
import numpy as np
from util import DataManager, Vocabulary

max_word_len = 14
word_dim_list = [50, 100, 150, 200, 250, 300, 350, 400]
test = np.zeros((5060, 6))

for word_dim in word_dim_list:
    print('word dim=', word_dim)
    dm = DataManager()
    voc = Vocabulary()
    dm.word_dim = word_dim
    dm.word_len = max_word_len
    voc.word2vec('data/w2v_model/w2v_model_{}'.format(word_dim))

    print("reading data...", end='')
    dm.read_test_data('data/testing_data.csv', 'test_question', 'test_option')
    print("\rreading data...finish")

    print("construct data...")
    dm.construct_data_seq2seq('test_question', voc, 'data/test_question.npy')
    dm.construct_data_seq2seq('test_option', voc, 'data/test_option.npy',
                              multi_seq=True)
    print("construct data...finish")

    print('test_question_seq.shape: ' + str(dm.data['test_question'].shape))
    print('test_option.shape: ' + str(dm.data['test_option'].shape))
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    #save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)
        print('load_path:', load_path)

    ##### read data #####
    dm = DataManager()
    w2v_path = os.path.join(args.save_dir, 'word2vec')
    print(w2v_path)

    if args.action == 'train':
        print('Loading data...')
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', args.test_path)

        test_data = dm.get_test_data('test_data')
        train_data = dm.get_data('train_data')
        semi_data = dm.get_data('semi_data')
        all_text = np.concatenate((train_data[0], semi_data[0], test_data),
                                  axis=0)
        print('Number of all_text:', all_text.shape[0])
        #print('Text sample:', all_text[0])

        print('Converting texts to words sequence...')
        text2word = []
        with_filter = 0
        if with_filter:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(
                        text,
                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                        lower=True,
                        split=" "))
        if not with_filter:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(text, filters='', lower=True,
                                          split=" "))
        print('Word sequence sample:', text2word[0])

        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
        else:
            print('Building word2vec model...')
            word_vec = gensim.models.Word2Vec(text2word, size=128, min_count=15)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            if not os.path.isdir(save_path):
                os.makedirs(save_path)
            if not os.path.exists(os.path.join(save_path, 'word2vec')):
                word_vec.save(os.path.join(save_path, 'word2vec'))

        print('Converting train_data to vector...')
        index_data = []
        i = 0
        for line in train_data[0]:
            index_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    #print(word, word_vec.wv.vocab[word].index)
                    index_data[i].append(word_vec.wv.vocab[word].index)
            i += 1

        embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))
        for i in range(len(word_vec.wv.vocab)):
            embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        index_data = pad_sequences(index_data, args.max_length)

    else:
        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))
            for i in range(len(word_vec.wv.vocab)):
                embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
        else:
            print('Cannot load w2v model, please train the w2v model first!')

    #print('get Tokenizer...')
    #if args.load_model is not None:
    #    # read existing tokenizer
    #    dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    #else:
    #    # create tokenizer on new data
    #    dm.tokenize(args.vocab_size)
    #
    #if not os.path.isdir(save_path):
    #    os.makedirs(save_path)
    #if not os.path.exists(os.path.join(save_path, 'token.pk')):
    #    dm.save_tokenizer(os.path.join(save_path, 'token.pk'))
    #
    #mat_train_data = dm.tokenizer.texts_to_matrix(train_data[0], mode='count')
    #mat_test_data = dm.tokenizer.texts_to_matrix(test_data, mode='count')

    # convert to sequences
    #dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    #model = bow_model(args, mat_train_data)
    model = simpleRNN(args, embedding_matrix)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        #path = os.path.join(load_path, 'model.h5')
        if os.path.exists(load_path):
            print('load model from %s' % load_path)
            model.load_weights(load_path)
        else:
            raise ValueError("Can't find the file %s" % load_path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        #(X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        X, X_val, Y, Y_val = train_test_split(index_data, train_data[1],
                                              test_size=0.33, random_state=42)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        print(history.history.keys())
        print('Val_acc:', history.history['val_acc'])
        print('Train_acc:', history.history['acc'])

    # testing
    elif args.action == 'test':
        dm.add_test_data('test_data', args.test_path)
        test_data = dm.get_test_data('test_data')

        # convert to vector
        index_test_data = []
        i = 0
        for line in test_data:
            index_test_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    #print(word, word_vec.wv.vocab[word].index)
                    index_test_data[i].append(word_vec.wv.vocab[word].index)
            i += 1
        index_test_data = pad_sequences(index_test_data, args.max_length)

        if not os.path.exists(args.result_path):
            os.makedirs(args.result_path)
        csv_path = os.path.join(args.result_path, 'prediction.csv')

        print("Predicting testing data...")
        Y_pred = model.predict(index_test_data)
        Y_pred = np.round(Y_pred)

        print('Saving result csv to', csv_path)
        with open(csv_path, 'w') as f:
            f.write('id,label\n')
            for i, v in enumerate(Y_pred):
                f.write('%d,%d\n' % (i, v))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 5 times
        for i in range(5):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=256,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)
        # raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)
    #dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        plot(history, args.model)
        # plot_model(model, to_file='./img/structure.png')

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')
        print('Predict testing data...')
        result = model.predict(X)
        print('Save result...')
        saveResult(result, args.result_path)
        # raise Exception('Implement your testing function')

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = args.save_dir
    if args.load_model is not None:
        load_path = args.save_dir

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', test_path)
    else:
        dm.add_test_data('test_data', test_path)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        print(model.summary())
        [test_x] = dm.get_data('test_data')
        classes = model.predict(test_x, batch_size=32)
        with open(args.output_path, "w", encoding='utf-8') as f:
            spamwriter = csv.writer(f, delimiter=',')
            spamwriter.writerow(['id', 'label'])
            for i in range(len(classes)):
                if classes[i][0] < 0.5:
                    result = 0
                else:
                    result = 1
                spamwriter.writerow([str(i), str(result)])

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        [test_x] = dm.get_data('test_data')
        semi_all_X = np.concatenate((semi_all_X, test_x), axis=0)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 16 times
        for i in range(16):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])
            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
import jieba
import numpy as np
import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
from util import DataManager, Vocabulary

assert jieba and np

'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
''' setting option '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
n_batch = 1024
n_epoch = 30
max_word_len = 13
word_dim = 300
adam = keras.optimizers.Adam(clipnorm=0.0001)
adamax = keras.optimizers.Adamax(clipnorm=0.0001)

'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
''' create model '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
dm = DataManager()
voc = Vocabulary()
dm.word_dim = word_dim
dm.word_len = max_word_len
voc.word2vec('data/w2v_model')

print("reading data...", end='')
dm.read_train_data('data/training_data/1_train.txt', 'train1')
dm.read_train_data('data/training_data/2_train.txt', 'train2')
dm.read_train_data('data/training_data/3_train.txt', 'train3')
dm.read_train_data('data/training_data/4_train.txt', 'train4')
dm.read_train_data('data/training_data/5_train.txt', 'train5')
dm.read_test_data('data/testing_data.csv', 'test_question', 'test_option')
print("\rreading data...finish")
print(dm.data['train1'][:3])
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
''' setting option '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
parser = argparse.ArgumentParser(description='Handle input model.')
parser.add_argument('--model', dest='model', type=str, required=True)
args = parser.parse_args()
continue_file = args.model

n_batch = 4096
max_word_len = 14
word_dim = 300
adam = keras.optimizers.Adam(clipnorm=0.0001)
adamax = keras.optimizers.Adamax(clipnorm=0.0001)

'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
''' create model '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
dm = DataManager()
voc = Vocabulary()
dm.word_dim = word_dim
dm.word_len = max_word_len
voc.word2vec('data/w2v_model')

print("reading data...", end='')
dm.read_train_data('data/training_data/1_train.txt', 'train1')
dm.read_test_data('data/testing_data.csv', 'test_question', 'test_option')
print("\rreading data...finish")
print(dm.data['test_question'][:6])

print("construct data...")
dm.construct_data_seq2seq('train1', voc, 'data/train1.npy')
dm.construct_data_seq2seq('test_question', voc, 'data/test_question.npy')
    img_masked = remove_penmarks(img_path)
    img_masked = Image.fromarray(img_masked)
    #img_masked.save(img_path, subsampling=0, quality=100)


def main(path, marked_images):
    paths = path + marked_images + '.jpeg'
    with multiprocessing.Pool() as pool:
        for c in tqdm.tqdm(pool.imap(save_masked_image, paths),
                           total=len(paths)):
            pass


if __name__ == '__main__':
    marked_images = np.load('input_/marked_images.npy', allow_pickle=True)
    path = DataManager.get_path() + 'train_images/'
    print(
        f"[Old] images with marks in {path} ({len(marked_images)} of them) "
        "will be overwritten by the newly generated images (with marks removed)\n"
        "Are you sure you want to continue? (y/n)")
    c = input()
    if c.lower() == 'y' or c.lower() == 'yes':
        if Config.input.tiff_format:
            print("Requires images to be .jpeg format")
            print("Script cancelled")
        else:
            main(path, marked_images)
    else:
        print("Script cancelled")
import jieba
import numpy as np
import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
from util import DataManager, Vocabulary

assert jieba and np

'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
''' setting option '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
n_batch = 1024
n_epoch = 100
max_word_len = 14
word_dim = 300
adam = keras.optimizers.Adam(clipnorm=0.0001)
adamax = keras.optimizers.Adamax(clipnorm=0.0001)

'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
''' create model '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
dm = DataManager()
voc = Vocabulary()
dm.word_dim = word_dim
dm.word_len = max_word_len
voc.word2vec('data/w2v_model')

print("reading data...", end='')
dm.read_train_data('data/training_data/1_train.txt', 'train1')
dm.read_train_data('data/training_data/2_train.txt', 'train2')
dm.read_train_data('data/training_data/3_train.txt', 'train3')
dm.read_train_data('data/training_data/4_train.txt', 'train4')
dm.read_train_data('data/training_data/5_train.txt', 'train5')
#dm.read_test_data('data/testing_data.csv', 'test_question', 'test_option')
print("\rreading data...finish")
print(dm.data['train1'][:3])