def main(): """ Main function of test.py Arguments: modelname: String, name of the model datapath: The testing file subtask: String, "A" or "B" or "C" Outputs: subtask + [subtask]/result/[modelname]/res.pred """ modelname = args.modelname datapath = args.datapath subtask = args.subtask dm = DataManager(subtask) dm.load_tokenizer( os.path.join("subtask" + subtask, "models", modelname, "word2idx.pkl"), os.path.join("subtask" + subtask, "models", modelname, "idx2word.pkl")) dm.add_data("test", datapath) dm.to_sequence(40, 40) (test_Q, test_C), qidlist = dm.get_data("test") print("test_Q", test_Q[0:2]) print("test_C", test_C[0:2]) print("qidlist", qidlist[0:2]) model = load_model( os.path.join("subtask" + subtask, "models", modelname, "model.h5")) result = model.predict([test_Q, test_C], batch_size=128, verbose=1) print("result", result[0:2]) if subtask == "A": outputA(qidlist, result, modelname) elif subtask == "B": outputB(qidlist, result, modelname) elif subtask == "C": outputC(qidlist, result, modelname)
def new_process_xy(tokenpath, path2x, path2y):
    dm = DataManager()
    dm.add_data('seed', '0samples.csv')
    dm.add_data('truth', '0samples.csv')
    dm.tokenize(230000)        # vocab size
    dm.save_tokenizer(tokenpath)
    dm.to_sequence(1)          # max length
    dm.save_sequence(path2x)
    dm.tosave_label(path2y)
def main():
    path_pfx = ''
    max_len = 37

    dm = DataManager()
    dm.add_data('test', os.path.join(sys.argv[1]), False, True)
    print(len(dm.data['test'][0]))
    dm.preprocessing()
    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    # dm.load_tokenizer(os.path.join(path_pfx, 'token.pkl'))
    dm.to_sequence(max_len, use_pretrain=True)

    result = predict(dm.data['test'][0], path_pfx)
    write(sys.argv[2], result)
    print('finished')
def main(argv):
    filename = argv[1]
    output_path = argv[2].replace('\r\n', '').replace('\r', '')

    dm = DataManager()
    dm.add_data('test_data', filename, False)
    dm.load_tokenizer('./model/token_25k.pk')
    dm.to_sequence(40)

    model = load_model('./model/00017-0.82720.h5')
    model.summary()

    val_proba = model.predict(dm.data['test_data'])
    val_classes = [1 if value > 0.5 else 0 for value in val_proba]

    out = pd.DataFrame(val_classes, columns=['label'])
    out.to_csv(output_path, index_label='id')
def main():
    voc_size = None
    max_len = 39
    path_pfx = ''

    dm = DataManager()
    dm.add_data('train', sys.argv[1])
    # dm.add_data('semi', os.path.join(path_pfx, 'training_nolabel.txt'), False)
    # dm.add_data('test', os.path.join(path_pfx, 'testing_data.txt'), False, True)
    dm.preprocessing()
    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    # dm.load_embedding_matrix(os.path.join(path_pfx, 'word2vec.wv.vectors.npy'))
    dm.to_sequence(max_len, use_pretrain=True)
    # dm.to_bow()
    print(max_len)

    # emb_mat = dm.get_embedding_matrix()
    emb_mat = None
    train(dm, voc_size=voc_size, max_len=max_len, emb_mat=emb_mat)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'train_corpus':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists('./model/token_25k.pk'):
        dm.save_tokenizer('./model/token_25k.pk')

    embedding_w = dm.get_vec_model('emb_1.npy', args.embedding_dim)

    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args, embedding_w)
    model.summary()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    # training
    if args.action == 'train_corpus':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/' + '{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            verbose=1,
                            shuffle=True,
                            callbacks=[checkpoint, earlystopping])
        # plot_figure(history)

    # semi-supervised training
    elif args.action == 'semi':
        earlystopping = EarlyStopping(monitor='val_acc', patience=10,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/semi/' + '{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max')

        # originally repeated 10 times; here run as a single pass
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')

        # label the semi-data with the current model
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)

        dm.clean_data()
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, False)
        dm.to_sequence(args.max_length)

        semi_X, semi_Y = dm.get_semi_data('test_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        print('-- semi_data size: %d' % (len(semi_X)))

        model = simpleRNN(args, embedding_w)

        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=40,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        plot_figure(history)
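# simpleRNN(args, embedding_w) above consumes the pretrained embedding matrix
# returned by dm.get_vec_model(), but its definition is not shown in this file.
# The function below is only a minimal sketch of the usual Keras pattern for
# plugging such a matrix into a model; it assumes embedding_w is a
# (vocab_size, embedding_dim) numpy array and is not the original simpleRNN.
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense

def build_rnn_sketch(embedding_w, max_length):
    vocab_size, embedding_dim = embedding_w.shape
    model = Sequential()
    # Freeze the pretrained vectors so training only updates the RNN/Dense weights.
    model.add(Embedding(vocab_size, embedding_dim,
                        weights=[embedding_w],
                        input_length=max_length,
                        trainable=False))
    model.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model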
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'test':
        dm.add_data('test_data', test_path, False)
    else:
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.action == 'token':
        dm.tokenize()
    else:
        # read existing tokenizer
        dm.load_tokenizer(args.token)
    # else:
    #     # create tokenizer on new data
    #     dm.tokenize()
    dm.save_tokenizer(args.token)

    # convert to sequences
    if args.action != 'token':
        dm.to_sequence(args.max_length)

    # initial model
    if args.action != 'token':
        print('initial model...')
        model = simpleRNN(args)
        print(model.summary())

        if args.load_model is not None:
            if args.action == 'train':
                print('Warning : loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=11,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')[0]
        predict = model.predict(X)
        result = [['id', 'label']]
        for i in range(len(predict)):
            a = [i]
            if predict[i][0] > 0.5:
                a.append(1)
            else:
                a.append(0)
            # a.append(predict[i][0])  # test
            # a.append(predict[i])
            result.append(a)
        cout = csv.writer(open(args.result_path, 'w'))
        cout.writerows(result)
        # finalize once the output format is confirmed

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=11,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')

        # originally repeated 10 times; here a single pass
        # label the semi-data
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        print('-- semi_data size: %d' % (len(semi_X)))

        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=20,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        if os.path.exists(save_path):
            print('load model from %s' % save_path)
            model.load_weights(save_path)
        else:
            raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit gpu memory usage
    train_path = argv[1]
    semi_path = argv[2]
    # K.set_session(get_session(gpu_fraction))

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if action == 'train':
        dm.add_data('train_data', train_path, True)
        # dm.add_data('semi_data', semi_path, False)
    elif action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if not os.path.exists(tokenizer_save_path):
        dm.tokenize(20000)
        dm.save_tokenizer(tokenizer_save_path)
    else:
        dm.load_tokenizer(tokenizer_save_path)

    # Word2Vec
    print('get Word2Vec...')
    data_dic = dm.get_data()
    tokenizer = dm.get_tokenizer()
    # vocab_size = len(tokenizer.word_index) + 1
    # data_list = data_dic['train_data'][2] + data_dic['semi_data'][1]
    # data_list = data_dic['train_data']
    # w2v_model = Word2Vec(data_list, size=256, min_count=5, iter=16, workers=16)
    # w2v_model.save(word2vec_save_path)
    # w2v_model = Word2Vec.load(word2vec_save_path)
    w2v_model = pk.load(open('emb.pkl', 'rb'))

    # convert to sequences
    dm.to_sequence(max_length)
    # dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN()
    print(model.summary())

    labelnum = []

    # training
    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', val_ratio)
        X = embedding_vector(X, w2v_model, tokenizer)
        X_val = embedding_vector(X_val, w2v_model, tokenizer)
        earlystopping = EarlyStopping(monitor='val_acc', patience=15,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath=model_save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=nb_epoch,
                            batch_size=batch_size,
                            callbacks=[checkpoint, earlystopping])

    # semi-supervised training
    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', val_ratio)
        semi_all_X = dm.get_data()['semi_data'][0]
        X = embedding_vector(X, w2v_model, tokenizer)
        X_val = embedding_vector(X_val, w2v_model, tokenizer)
        semi_all_X = embedding_vector(semi_all_X, w2v_model, tokenizer)
        X = np.array(X)
        X_val = np.array(X_val)
        semi_all_X = np.array(semi_all_X)
        earlystopping = EarlyStopping(monitor='val_acc', patience=5,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath=model_save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')

        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = getsemidata(semi_all_X, semi_pred, threshold)
            labelnum.append(semi_X.shape)
            semi_X = np.concatenate((semi_X, X), axis=0)
            semi_Y = np.concatenate((semi_Y, Y), axis=0)
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))

            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(model_save_path):
                print('load model from %s' % model_save_path)
                model.load_weights(model_save_path)
            else:
                raise ValueError("Can't find the file %s" % model_save_path)
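# The semi-supervised loops above call getsemidata / dm.get_semi_data to turn
# sigmoid probabilities on the unlabeled pool into pseudo-labels, keeping only
# confident predictions. Those helpers are defined elsewhere; the function
# below is only a sketch of that thresholding idea (assuming predictions are
# probabilities in [0, 1] and semi_X is a numpy array), not the original code.
import numpy as np

def pseudo_label_sketch(semi_X, semi_pred, threshold=0.1):
    """Keep samples whose predicted probability is within `threshold` of 0 or 1."""
    probs = np.squeeze(np.asarray(semi_pred))
    confident = (probs > 1 - threshold) | (probs < threshold)
    labels = (probs > 0.5).astype('int32')
    return semi_X[confident], labels[confident]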
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)
        # raise Exception('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)
    # dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])
        plot(history, args.model)
        # plot_model(model, to_file='./img/structure.png')

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')
        print('Predict testing data...')
        result = model.predict(X)
        print('Save result...')
        saveResult(result, args.result_path)
        # raise Exception('Implement your testing function')

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')

        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))

            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = args.save_dir
    if args.load_model is not None:
        load_path = args.save_dir

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', test_path)
    else:
        dm.add_test_data('test_data', test_path)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        print(model.summary())
        [test_x] = dm.get_data('test_data')
        classes = model.predict(test_x, batch_size=32)
        with open(args.output_path, 'w', encoding='utf-8') as f:
            spamwriter = csv.writer(f, delimiter=',')
            spamwriter.writerow(['id', 'label'])
            for i in range(len(classes)):
                result = 0 if classes[i][0] < 0.5 else 1
                spamwriter.writerow([str(i), str(result)])

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        [test_x] = dm.get_data('test_data')
        semi_all_X = np.concatenate((semi_all_X, test_x), axis=0)
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')

        # repeat 16 times
        for i in range(16):
            # label the semi-data
            semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold, args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))

            # train
            history = model.fit(semi_X, semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
def main():
    dm = DataManager()
    dm.add_data('train_data', train_path, True)
    dm.add_data('semi_data', semi_path, False)

    print('Get Tokenizer...')
    dm.load_tokenizer('./token/token.pk')
    embedding_mat = dm.to_sequence(40, action)

    print('Initial model...')
    if action == 'train':
        model = RNN(embedding_mat)
        print(model.summary())
    elif action == 'semi':
        model = load_model('./model/model1.hdf5')
        print(model.summary())

    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        earlystopping = EarlyStopping(monitor='val_acc', patience=30,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        model.fit(X, Y,
                  validation_data=(X_val, Y_val),
                  epochs=80,
                  batch_size=512,
                  callbacks=[checkpoint, earlystopping])

    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc', patience=3,
                                      verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model_semi.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        for i in range(10):
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=1)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred, 0.1)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d semi_data size: %d' % (i + 1, len(semi_X)))
            model.fit(semi_X, semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=512,
                      callbacks=[checkpoint, earlystopping])
            print('load model from ./model/model_semi.hdf5')
            model = load_model('./model/model_semi.hdf5')
# load data
dm = DataManager()
dm.add_data('test_data', test_path, False)

if mode == 'private':
    # tokenizer
    dm.load_tokenizer('./token/token.pk')
    # load model
    model = load_model('./model/model1.hdf5')
elif mode == 'public':
    # tokenizer
    dm.load_tokenizer('./token/token_filter.pk')
    # load model
    model = load_model('./model/model2.hdf5')

dm.to_sequence(40, 'test')
test_all_x = dm.get_data('test_data')
print(model.summary())

predict = model.predict(test_all_x, batch_size=1024, verbose=1)
predict[predict <= 0.5] = 0
predict[predict > 0.5] = 1

with open(output_path, 'w') as f:
    f.write('id,label\n')
    for i in range(len(predict)):
        f.write(str(i) + ',' + str(int(predict[i])) + '\n')