def main():
    params = {'batch_size': 64}
    modelname = argv[1]
    #Datasets
    partition = load_partition()
    print(len(partition['train']))
    print(len(partition['validation']))
    training_generator = DataGenerator(partition['train'], **params)
    validation_generator = DataGenerator(partition['validation'], **params)

    dm = DataManager()
    dm.load_tokenizer('/mnt/data/b04901058/recsys/token0_Xfull.pk')
    word_index, embedding_matrix = dm.embedding_matrix()
    cnn_model = cnn0(word_index, embedding_matrix)
    cnn_model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

    checkpoint = [
        ModelCheckpoint(
            modelname,  # model filename
            monitor='val_loss',  # quantity to monitor
            verbose=0,  # verbosity - 0 or 1
            save_best_only=True,  # The latest best model will not be overwritten
            mode='auto'),  # the decision to overwrite is made automatically from the monitored quantity
        EarlyStopping(monitor='val_loss', patience=3, verbose=0)
    ]
    cnn_model.fit_generator(generator=training_generator,
                            validation_data=validation_generator,
                            callbacks=checkpoint,
                            verbose=1,
                            use_multiprocessing=True,
                            epochs=12,
                            workers=3)
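
# A minimal sketch of the kind of keras.utils.Sequence generator that fit_generator()
# expects above. This is an assumption for illustration only: the real DataGenerator,
# load_partition() and the sample shapes are project-specific and not shown here.
import numpy as np
from keras.utils import Sequence

class MinimalDataGenerator(Sequence):
    def __init__(self, list_ids, batch_size=64):
        self.list_ids = list_ids
        self.batch_size = batch_size

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.list_ids) / self.batch_size))

    def __getitem__(self, index):
        batch_ids = self.list_ids[index * self.batch_size:(index + 1) * self.batch_size]
        # placeholder data: a real generator would load each sample by its ID here
        X = np.zeros((len(batch_ids), 40), dtype='int32')
        y = np.zeros(len(batch_ids), dtype='int32')
        return X, y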
Example No. 2
def main():
    """ 
        Main function of test.py
        Arguments:
            modelname: String, name of the model
            datapath: The testing file
            subtask: String, "A" or "B" or "C"
        Outputs:
            predictions written to subtask[subtask]/result/[modelname]/res.pred
    """
    modelname = args.modelname
    datapath = args.datapath
    subtask = args.subtask
    dm = DataManager(subtask)
    dm.load_tokenizer(
        os.path.join("subtask" + subtask, "models", modelname, "word2idx.pkl"),
        os.path.join("subtask" + subtask, "models", modelname, "idx2word.pkl"))
    dm.add_data("test", datapath)
    dm.to_sequence(40, 40)
    (test_Q, test_C), qidlist = dm.get_data("test")
    print("test_Q", test_Q[0:2])
    print("test_C", test_C[0:2])
    print("qidlist", qidlist[0:2])
    model = load_model(
        os.path.join("subtask" + subtask, "models", modelname, "model.h5"))
    result = model.predict([test_Q, test_C], batch_size=128, verbose=1)
    print("result", result[0:2])
    if subtask == "A":
        outputA(qidlist, result, modelname)
    elif subtask == "B":
        outputB(qidlist, result, modelname)
    elif subtask == "C":
        outputC(qidlist, result, modelname)
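
# A hedged sketch of the argparse setup that would supply args.modelname, args.datapath
# and args.subtask used in main() above; the flag names are an assumption, since the
# parser is defined elsewhere in the original script.
import argparse

parser = argparse.ArgumentParser(description='Predict with a trained subtask model')
parser.add_argument('--modelname', required=True, help='name of the trained model directory')
parser.add_argument('--datapath', required=True, help='path to the testing file')
parser.add_argument('--subtask', choices=['A', 'B', 'C'], required=True, help='subtask to run')
args = parser.parse_args()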
Example No. 3
def main():
    path_pfx = ''
    max_len = 37

    dm = DataManager()
    dm.add_data('test', os.path.join(sys.argv[1]), False, True)
    print(len(dm.data['test'][0]))
    dm.preprocessing()
    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    #dm.load_tokenizer(os.path.join(path_pfx, 'token.pkl'))
    dm.to_sequence(max_len, use_pretrain=True)
    result = predict(dm.data['test'][0], path_pfx)
    write(sys.argv[2], result)
    print('finished')
Example No. 4
    class input:

        path = DataManager.get_path()

        if os.path.exists(path + 'test_images/'):
            path += 'test_images/'
        else:
            path += 'train_images/'
        print("Path to images: ", path)

        if glob.glob(path+'*.jpeg'):
            tiff_format = False
        else:
            tiff_format = True
        print("Image tiff-format: ", tiff_format)


        tiff_level = 1                   # only if tiff_format is True
        resize_ratio = 1                 # 1 (N x N) or 2 (-> N//2 x N//2)
        input_shape = (1280, 1280, 3)
        patch_size = 256
        sample_size = 25
        preprocess_mode = 'float'
        objective = 'bce'
        label_smoothing = 0.0            # only if objective is 'cce'
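
# A hedged sketch (assumption, not part of the original repo) of the kind of patch
# extraction the config above could drive: cut `sample_size` patches of size
# `patch_size` out of an image shaped like `input_shape`.
import numpy as np

def extract_patches(image, patch_size=256, sample_size=25, seed=0):
    """Randomly sample square patches from an H x W x C image."""
    rng = np.random.default_rng(seed)
    h, w, _ = image.shape
    patches = []
    for _ in range(sample_size):
        top = rng.integers(0, h - patch_size + 1)
        left = rng.integers(0, w - patch_size + 1)
        patches.append(image[top:top + patch_size, left:left + patch_size])
    return np.stack(patches)

# usage sketch: 25 random 256x256 patches from a blank 1280x1280 RGB image
demo = extract_patches(np.zeros((1280, 1280, 3), dtype=np.uint8))
print(demo.shape)  # (25, 256, 256, 3)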
Example No. 5
def main():
    voc_size = None
    max_len = 39
    path_pfx = ''
    dm = DataManager()
    dm.add_data('train', sys.argv[1])
    #dm.add_data('semi', os.path.join(path_pfx, 'training_nolabel.txt'), False)
    #dm.add_data('test', os.path.join(path_pfx, 'testing_data.txt'), False, True)
    dm.preprocessing()

    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    #dm.load_embedding_matrix(os.path.join(path_pfx, 'word2vec.wv.vectors.npy'))
    dm.to_sequence(max_len, use_pretrain=True)
    #dm.to_bow()

    print(max_len)

    #emb_mat =  dm.get_embedding_matrix()
    emb_mat = None

    train(dm, voc_size=voc_size, max_len=max_len, emb_mat=emb_mat)
Example No. 6
def main(argv):
    filename = argv[1]
    output_path = argv[2]
    output_path = output_path.replace('\r', '').replace('\n', '')  # strip stray line-ending characters
    dm = DataManager()
    dm.add_data('test_data', filename, False)
    dm.load_tokenizer('./model/token_25k.pk')
    dm.to_sequence(40)

    model = load_model('./model/00017-0.82720.h5')
    model.summary()

    val_proba = model.predict(dm.data['test_data'])
    val_classes = [1 if value > 0.5 else 0 for value in val_proba]

    out = pd.DataFrame(val_classes, columns=['label'])
    out.to_csv(output_path, index_label='id')
Example No. 7
def argument_parser(L):
    token = L[1]
    dm = DataManager()
    dm.add_data('data/data.csv')
    X = dm.get_data('data')
    Y = dm.get_data('label')
    data = X[0]
    label = Y[0]
    logpath = os.path.join('log')
    if not os.path.exists(logpath):
        os.makedirs(logpath)
    if token == 'LinR':
        MSE, MAE = train(data, label, token)
        with open('log/LinR.csv', 'w') as f:
            f.write('MSE,MAE\n')
            f.write('{},{}\n'.format(MSE, MAE))
    else:
        bin_size = int(L[2])
        acc, pre, rec, f_score = train(data, label, token, bin_size=bin_size)
        with open('log/' + token + '-bins-' + str(bin_size) + '.csv', 'w') as f:
            f.write('accuracy,precision,recall,f-score\n')
            f.write('{},{},{},{}\n'.format(acc, pre, rec, f_score))
Example No. 8
def get_similar(self):
    return dm.get_model('model')
Example No. 9
def get_ref_files(self):
    return dm.get_csv('indices', index=0), dm.get_csv('titles')
fold = Config.train.fold
batch_size = Config.train.batch_size
epochs = Config.train.epochs

lr_max = Config.train.learning_rate.max
lr_min = Config.train.learning_rate.min
lr_decay_epochs = Config.train.learning_rate.decay_epochs
lr_warmup_epochs = Config.train.learning_rate.warmup_epochs
lr_power = Config.train.learning_rate.power

units = Config.model.units
dropout = Config.model.dropout
activation = Config.model.activation

train_data, valid_data = DataManager.get_train_data(split=True,
                                                    test_size=test_size,
                                                    random_state=random_state,
                                                    add_image_size_info=True)

lr_steps_per_epoch = math.ceil(len(train_data) / Config.train.batch_size)

train_dataset = get_dataset(
    dataframe=train_data,
    input_path=input_path,
    batch_size=batch_size,
    training=True,
    augment='heavy',
    tta=1,
    input_size=input_shape,
    objective=objective,
    buffer_size=8192,
    cache=False,
Example No. 11
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

#####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'test':
        dm.add_data('test_data', test_path, False)
    else:
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.action == 'token':
        dm.tokenize()

    else:
        # read exist tokenizer
        dm.load_tokenizer(args.token)
    '''else:
        # create tokenizer on new data
        dm.tokenize()'''

    dm.save_tokenizer(args.token)

    # convert to sequences
    if args.action != 'token':
        dm.to_sequence(args.max_length)

    # initial model
    if args.action != 'token':
        print('initial model...')
        model = simpleRNN(args)
        print(model.summary())
        if args.load_model is not None:
            if args.action == 'train':
                print('Warning: loading an existing model to continue training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=11,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

# testing
    elif args.action == 'test':
        X = dm.get_data('test_data')[0]
        predict = model.predict(X)
        result = [['id', 'label']]
        for i in range(len(predict)):
            a = [i]
            if predict[i][0] > 0.5:
                a.append(1)
            else:
                a.append(0)
            #a.append(predict[i][0])  #test
            #a.append(predict[i])
            result.append(a)
        with open(args.result_path, 'w', newline='') as f:
            csv.writer(f).writerows(result)
        # finalize once the expected output format is confirmed


# semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=11,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times

        #for i in range(10):
        # label the semi-data
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        #print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
        # train
        history = model.fit(semi_X,
                            semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=20,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        if os.path.exists(save_path):
            print('load model from %s' % save_path)
            model.load_weights(save_path)
        else:
            raise ValueError("Can't find the file %s" % save_path)
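
# A hedged sketch of the pseudo-labelling step that dm.get_semi_data(...) is used for in
# the semi-supervised branch above (the DataManager internals are not shown, so this is
# an assumption): keep only the semi-supervised samples the model is confident about and
# use the rounded prediction as their label, matching the binary_crossentropy setting.
import numpy as np

def select_pseudo_labels(semi_X, semi_pred, threshold=0.1):
    probs = np.asarray(semi_pred).reshape(-1)
    # confident means the predicted probability is close to 0 or close to 1
    confident = (probs <= threshold) | (probs >= 1.0 - threshold)
    pseudo_Y = np.round(probs[confident]).astype('int32')
    return semi_X[confident], pseudo_Y

# usage sketch: semi_X, semi_Y = select_pseudo_labels(semi_all_X, semi_pred, args.threshold)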
Example No. 12
def main():
    dm = DataManager()
    dm.add_data('train_data', train_path, True)
    dm.add_data('semi_data', semi_path, False)

    print('Get Tokenizer...')
    dm.load_tokenizer('./token/token.pk')

    embedding_mat = dm.to_sequence(40, action)

    print('Initial model...')
    if action == 'train':
        model = RNN(embedding_mat)
        print(model.summary())
    elif action == 'semi':
        model = load_model('./model/model1.hdf5')
        print(model.summary())

    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=30,
                                      verbose=1,
                                      mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        model.fit(X,
                  Y,
                  validation_data=(X_val, Y_val),
                  epochs=80,
                  batch_size=512,
                  callbacks=[checkpoint, earlystopping])

    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model_semi.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        for i in range(10):
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=1)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred, 0.1)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            model.fit(semi_X,
                      semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=512,
                      callbacks=[checkpoint, earlystopping])
            print('load model from ./model/model_semi.hdf5')
            model = load_model('./model/model_semi.hdf5')
Example No. 13
def main():
	# limit gpu memory usage
	train_path = argv[1]
	semi_path = argv[2]

	#K.set_session(get_session(gpu_fraction))

	#####read data#####

	dm = DataManager()
	print ('Loading data...')
	if action == 'train':
		dm.add_data('train_data', train_path, True)
		#dm.add_data('semi_data', semi_path, False)
	elif action == 'semi':
		dm.add_data('train_data', train_path, True)
		dm.add_data('semi_data', semi_path, False)
	else:
		raise Exception ('Implement your testing parser')

	# prepare tokenizer
	print ('get Tokenizer...')
	if not os.path.exists(tokenizer_save_path):
		dm.tokenize(20000)
		dm.save_tokenizer(tokenizer_save_path)
	else:
		dm.load_tokenizer(tokenizer_save_path)

	
	# Word2Vec
	print ('get Word2Vec...')
	data_dic = dm.get_data()
	tokenizer = dm.get_tokenizer()
	#vocab_size = len(tokenizer.word_index)+1
	#data_list = data_dic['train_data'][2]+data_dic['semi_data'][1]
	#data_list = data_dic['train_data']
	#w2v_model = Word2Vec(data_list, size=256, min_count=5,iter=16,workers=16)
	#w2v_model.save(word2vec_save_path)
	#w2v_model = Word2Vec.load(word2vec_save_path)
	w2v_model=pk.load(open('emb.pkl','rb'))

	# convert to sequences
	dm.to_sequence(max_length)
	#dm.to_bow()

	# initial model
	print ('initial model...')
	model = simpleRNN()    
	print (model.summary())
	labelnum = [] 

	# training
	if action == 'train':
		(X,Y),(X_val,Y_val) = dm.split_data('train_data', val_ratio)
		X = embedding_vector(X, w2v_model, tokenizer)
		X_val = embedding_vector(X_val, w2v_model, tokenizer)

		earlystopping = EarlyStopping(monitor='val_acc', patience = 15, verbose=1, mode='max')
		checkpoint = ModelCheckpoint(filepath=model_save_path,verbose=1,save_best_only=True,monitor='val_acc',mode='max' )
		history = model.fit(X, Y, validation_data=(X_val, Y_val), epochs=nb_epoch, batch_size=batch_size, callbacks=[checkpoint, earlystopping])
	# semi-supervised training
	elif action == 'semi':

		(X,Y),(X_val,Y_val) = dm.split_data('train_data', val_ratio)
		semi_all_X = dm.get_data()['semi_data'][0]
		X = embedding_vector(X, w2v_model, tokenizer)
		X_val = embedding_vector(X_val, w2v_model, tokenizer)
		semi_all_X = embedding_vector(semi_all_X,w2v_model,tokenizer)

		X = np.array(X)
		X_val = np.array(X_val)
		semi_all_X = np.array(semi_all_X)

		earlystopping = EarlyStopping(monitor='val_acc', patience = 5, verbose=1, mode='max')
		checkpoint = ModelCheckpoint(filepath=model_save_path,verbose=1,save_best_only=True,monitor='val_acc',mode='max')
		# repeat 10 times
		for i in range(10):
			# label the semi-data
			semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
			semi_X, semi_Y = getsemidata(semi_all_X,semi_pred,threshold)
			labelnum.append(semi_X.shape)
			semi_X = np.concatenate((semi_X, X),axis=0)
			semi_Y = np.concatenate((semi_Y, Y),axis=0)
			print ('-- iteration %d  semi_data size: %d' %(i+1,len(semi_X)))
			# train
			history = model.fit(semi_X, semi_Y,validation_data=(X_val, Y_val),epochs=2,batch_size=batch_size,callbacks=[checkpoint, earlystopping] )

			if os.path.exists(model_save_path):
				print ('load model from %s' % model_save_path)
				model.load_weights(model_save_path)
			else:
				raise ValueError("Can't find the file %s" % model_save_path)
	
Example No. 14
def new_process_xy(tokenpath, path2x, path2y):
    dm = DataManager()
    dm.add_data('seed', '0samples.csv')
    dm.add_data('truth', '0samples.csv')
    dm.tokenize(230000) #vocab size
    dm.save_tokenizer(tokenpath)
    dm.to_sequence(1) #max length
    dm.save_sequence(path2x)
    dm.tosave_label(path2y)
Example No. 15
import sys 
import keras
import _pickle as pk
import numpy as np
from keras.models import Model, Sequential, load_model

from util import DataManager

# argv settings
test_path = sys.argv[1]
output_path = sys.argv[2]
mode = sys.argv[3]

# load data
dm = DataManager()
dm.add_data('test_data',test_path,False)


if mode == 'private':
    # tokenizer
    dm.load_tokenizer('./token/token.pk')
    # load model
    model = load_model('./model/model1.hdf5')
elif mode == 'public':
    # tokenizer
    dm.load_tokenizer('./token/token_filter.pk')
    # load model
    model = load_model('./model/model2.hdf5')

dm.to_sequence(40,'test')
test_all_x = dm.get_data('test_data')
Example No. 16
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir,args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir,args.load_model)

    ##### read data #####
    dm = DataManager()
    print ('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'train_corpus':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception ('Implement your testing parser')


    # prepare tokenizer
    print ('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path,'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists('./model/token_25k.pk'):
        dm.save_tokenizer('./model/token_25k.pk')

    embedding_w = dm.get_vec_model('emb_1.npy',args.embedding_dim)
    dm.to_sequence(args.max_length)
        # initial model
    print ('initial model...')
    model = simpleRNN(args,embedding_w)
    model.summary()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model to continue training')
        path = os.path.join(load_path,'model.h5')
        if os.path.exists(path):
            print ('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" %path)
    elif args.action == 'test':
        print ('Warning : testing without loading any model')

        # training
    if args.action == 'train_corpus':
        (X,Y),(X_val,Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience = 3, verbose=1, mode='max')

        checkpoint = ModelCheckpoint(filepath='./model/'+'{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max' )

        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            verbose=1,
                            shuffle= True,
                            callbacks=[checkpoint, earlystopping] )
        # plot_figure(history)
            # semi-supervised training
    elif args.action == 'semi':

        earlystopping = EarlyStopping(monitor='val_acc', patience = 10, verbose=1, mode='max')


        checkpoint = ModelCheckpoint(filepath='./model/semi/'+'{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max' )

        # repeat 10 times
        (X,Y),(X_val,Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        dm.clean_data()
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data',test_path, False)
        dm.to_sequence(args.max_length)
        semi_X, semi_Y = dm.get_semi_data('test_data', semi_pred, args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        print ('-- semi_data size: %d' %(len(semi_X)))

        model = simpleRNN(args,embedding_w)
        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=40,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping] )

        plot_figure(history)
Example No. 17
import torch
from torch.utils.data import DataLoader
from util import DataManager, AutoEncoder, AEDataset
import argparse

parser = argparse.ArgumentParser(description='DLCV HW5')
#parser.add_argument('-p','--problem', dest='problem',type=int,required=True)
args = parser.parse_args()

TENSORBOARD_DIR = './runs/train'

dm = DataManager(tensorboard_dir=TENSORBOARD_DIR)

EPOCH = 50
BATCH_SIZE = 128
LABEL_DIM = 11
DROPOUT = 0.5
LEARNING_RATE = 1E-3
PRETRAIN = True
OUTPUT_PATH = './model/pretrained.pt'
OUTPUT_CHARACTER = 'data/character.txt'

train_path = ['./data/trainx.npy', './data/trainy.npy']
val_path = ['./data/valx.npy', './data/valy.npy']
val_data = dm.readfile('./dataset/val',
                       './dataset/val_id.txt',
                       save_path=val_path)
train_data = dm.readfile('./dataset/train/',
                         './dataset/train_id.txt',
                         save_path=train_path)
#dm.character.save(OUTPUT_CHARACTER)
import jieba
jieba.dt.cache_file = 'jieva.cache.new'
import numpy as np
from util import DataManager, Vocabulary

max_word_len = 14
word_dim_list = [50, 100, 150, 200, 250, 300, 350, 400]
test = np.zeros((5060, 6))

for word_dim in word_dim_list:
    print('word dim=', word_dim)
    dm = DataManager()
    voc = Vocabulary()
    dm.word_dim = word_dim
    dm.word_len = max_word_len

    voc.word2vec('data/w2v_model/w2v_model_{}'.format(word_dim))
    print("reading data...", end='')
    dm.read_test_data('data/testing_data.csv', 'test_question', 'test_option')
    print("\rreading data...finish")

    print("construct data...")
    dm.construct_data_seq2seq('test_question', voc, 'data/test_question.npy')
    dm.construct_data_seq2seq('test_option',
                              voc,
                              'data/test_option.npy',
                              multi_seq=True)
    print("construct data...finish")
    print('test_question_seq.shape: ' + str(dm.data['test_question'].shape))
    print('test_option.shape: ' + str(dm.data['test_option'].shape))
Example No. 19
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)  # needed below when saving word2vec and model.h5
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)
        print('load_path:', load_path)

#####read data#####
    dm = DataManager()
    w2v_path = os.path.join(args.save_dir, 'word2vec')
    print(w2v_path)

    if args.action == 'train':
        print('Loading data...')
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', args.test_path)

        test_data = dm.get_test_data('test_data')
        train_data = dm.get_data('train_data')
        semi_data = dm.get_data('semi_data')

        all_text = np.concatenate((train_data[0], semi_data[0], test_data),
                                  axis=0)
        print('Number of all_text:', all_text.shape[0])
        #print('Text sample:',all_text[0])

        print('Converting texts to words sequence...')
        text2word = []

        with_filter = 0
        if with_filter:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(
                        text,
                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                        lower=True,
                        split=" "))
        if not with_filter:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(text,
                                          filters='',
                                          lower=True,
                                          split=" "))

        print('Word sequence sample:', text2word[0])

        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
        else:
            print('Building word2vec model...')
            word_vec = gensim.models.Word2Vec(text2word,
                                              size=128,
                                              min_count=15)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            if not os.path.isdir(save_path):
                os.makedirs(save_path)
            if not os.path.exists(os.path.join(save_path, 'word2vec')):
                word_vec.save((os.path.join(save_path, 'word2vec')))

        print('Converting train_data to vector...')
        index_data = []
        i = 0
        for line in train_data[0]:
            index_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    #print(word ,word_vec.wv.vocab[word].index)
                    index_data[i].append(word_vec.wv.vocab[word].index)
            i += 1

        embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))

        for i in range(len(word_vec.wv.vocab)):
            embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        index_data = pad_sequences(index_data, args.max_length)
    else:
        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))

            for i in range(len(word_vec.wv.vocab)):
                embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
        else:
            print('Cannot load the w2v model; please train the w2v model first!')

    #print ('get Tokenizer...')
    #if args.load_model is not None:
    #    # read exist tokenizer
    #    dm.load_tokenizer(os.path.join(load_path,'token.pk'))
    #else:
    #    # create tokenizer on new data
    #    dm.tokenize(args.vocab_size)
    #
    #if not os.path.isdir(save_path):
    #    os.makedirs(save_path)
    #if not os.path.exists(os.path.join(save_path,'token.pk')):
    #    dm.save_tokenizer(os.path.join(save_path,'token.pk'))
#
# mat_train_data = dm.tokenizer.texts_to_matrix(train_data[0], mode='count')
# mat_test_data = dm.tokenizer.texts_to_matrix(test_data, mode='count')

# convert to sequences
#dm.to_sequence(args.max_length)

# initial model
    print('initial model...')
    #model = bow_model(args,mat_train_data)
    model = simpleRNN(args, embedding_matrix)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model to continue training')
        #path = os.path.join(load_path,'model.h5')
        if os.path.exists(load_path):
            print('load model from %s' % load_path)
            model.load_weights(load_path)
        else:
            raise ValueError("Can't find the file %s" % load_path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

# training
    if args.action == 'train':
        #(X,Y),(X_val,Y_val) = dm.split_data('train_data', args.val_ratio)
        X, X_val, Y, Y_val = train_test_split(index_data,
                                              train_data[1],
                                              test_size=0.33,
                                              random_state=42)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        print(history.history.keys())
        print('Val_acc:', history.history['val_acc'])
        print('Train_acc:', history.history['acc'])

# testing
    elif args.action == 'test':
        dm.add_test_data('test_data', args.test_path)
        test_data = dm.get_test_data('test_data')

        # Convert to vector
        index_test_data = []
        i = 0
        for line in test_data:
            index_test_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    #print(word ,word_vec.wv.vocab[word].index)
                    index_test_data[i].append(word_vec.wv.vocab[word].index)
            i += 1

        index_test_data = pad_sequences(index_test_data, args.max_length)

        if not os.path.exists(args.result_path):
            os.makedirs(args.result_path)

        csv_path = os.path.join(args.result_path, 'prediction.csv')

        print("Predicting testing data...")
        Y_pred = model.predict(index_test_data)
        Y_pred = np.round(Y_pred)
        print('Saving result csv to', csv_path)
        with open(csv_path, 'w') as f:
            f.write('id,label\n')
            for i, v in enumerate(Y_pred):
                f.write('%d,%d\n' % (i, v))

# semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')

        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(5):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=2048,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=256,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
Example No. 20
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)
        # raise Exception ('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)
    # dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model to continue training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        plot(history, args.model)
        # plot_model(model, to_file='./img/structure.png')

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')
        print('Predict testing data...')
        result = model.predict(X)
        print('Save result...')
        saveResult(result, args.result_path)
        # raise Exception ('Implement your testing function')

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
Example No. 21
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = args.save_dir
    if args.load_model is not None:
        load_path = args.save_dir

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', test_path)
    else:
        dm.add_test_data('test_data', test_path)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        print(model.summary())
        [test_x] = dm.get_data('test_data')
        classes = model.predict(test_x, batch_size=32)
        with open(args.output_path, "w", encoding='utf-8') as f:
            spamwriter = csv.writer(f, delimiter=',')
            spamwriter.writerow(['id', 'label'])
            for i in range(len(classes)):
                if classes[i][0] < 0.5:
                    result = 0
                else:
                    result = 1
                spamwriter.writerow([str(i), str(result)])

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        [test_x] = dm.get_data('test_data')
        semi_all_X = np.concatenate((semi_all_X, test_x), axis=0)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(16):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % save_path)
Example No. 22
import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
assert jieba and np
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
'''''' '''''' '''''' '''       setting option                           '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
n_batch = 1024
n_epoch = 30
max_word_len = 13
word_dim = 300
adam = keras.optimizers.Adam(clipnorm=0.0001)
adamax = keras.optimizers.Adamax(clipnorm=0.0001)
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
'''''' '''''' '''''' '''       create model                             '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
dm = DataManager()
voc = Vocabulary()
dm.word_dim = word_dim
dm.word_len = max_word_len

voc.word2vec('data/w2v_model')

print("reading data...", end='')
dm.read_train_data('data/training_data/1_train.txt', 'train1')
dm.read_train_data('data/training_data/2_train.txt', 'train2')
dm.read_train_data('data/training_data/3_train.txt', 'train3')
dm.read_train_data('data/training_data/4_train.txt', 'train4')
dm.read_train_data('data/training_data/5_train.txt', 'train5')
dm.read_test_data('data/testing_data.csv', 'test_question', 'test_option')
print("\rreading data...finish")
print(dm.data['train1'][:3])
Example No. 23
'''''' '''''' '''''' '''       setting option                           '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
parser = argparse.ArgumentParser(description='Handle input model.')
parser.add_argument('--model', dest='model', type=str, required=True)
args = parser.parse_args()
continue_file = args.model
n_batch = 4096
max_word_len = 14
word_dim = 300

adam = keras.optimizers.Adam(clipnorm=0.0001)
adamax = keras.optimizers.Adamax(clipnorm=0.0001)
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
'''''' '''''' '''''' '''       create model                             '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
dm = DataManager()
voc = Vocabulary()
dm.word_dim = word_dim
dm.word_len = max_word_len

voc.word2vec('data/w2v_model')

print("reading data...", end='')
dm.read_train_data('data/training_data/1_train.txt', 'train1')
dm.read_test_data('data/testing_data.csv', 'test_question', 'test_option')
print("\rreading data...finish")
print(dm.data['test_question'][:6])

print("construct data...")
dm.construct_data_seq2seq('train1', voc, 'data/train1.npy')
dm.construct_data_seq2seq('test_question', voc, 'data/test_question.npy')
def save_masked_image(img_path):
    img_masked = remove_penmarks(img_path)
    img_masked = Image.fromarray(img_masked)
    #img_masked.save(img_path, subsampling=0, quality=100)


def main(path, marked_images):
    paths = path + marked_images + '.jpeg'
    with multiprocessing.Pool() as pool:
        for c in tqdm.tqdm(pool.imap(save_masked_image, paths),
                           total=len(paths)):
            pass


if __name__ == '__main__':
    marked_images = np.load('input_/marked_images.npy', allow_pickle=True)
    path = DataManager.get_path() + 'train_images/'
    print(
        f"[Old] images with marks in {path} ({len(marked_images)} of them) "
        "will be overwritten by the newly generated images (with marks removed)\n"
        "Are you sure you want to continue? (y/n)")
    c = input()

    if c.lower() == 'y' or c.lower() == 'yes':
        if Config.input.tiff_format:
            print("Requires images to be .jpeg format")
            print("Script cancelled")
        else:
            main(path, marked_images)
    else:
        print("Script cancelled")
Example No. 25
from keras.callbacks import ModelCheckpoint, EarlyStopping
assert jieba and np
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
'''''' '''''' '''''' '''       setting option                           '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
n_batch = 1024
n_epoch = 100
max_word_len = 14
word_dim = 300

adam = keras.optimizers.Adam(clipnorm=0.0001)
adamax = keras.optimizers.Adamax(clipnorm=0.0001)
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
'''''' '''''' '''''' '''       create model                             '''
'''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' ''''''
dm = DataManager()
voc = Vocabulary()
dm.word_dim = word_dim
dm.word_len = max_word_len

voc.word2vec('data/w2v_model')

print("reading data...", end='')
dm.read_train_data('data/training_data/1_train.txt', 'train1')
dm.read_train_data('data/training_data/2_train.txt', 'train2')
dm.read_train_data('data/training_data/3_train.txt', 'train3')
dm.read_train_data('data/training_data/4_train.txt', 'train4')
dm.read_train_data('data/training_data/5_train.txt', 'train5')
#dm.read_test_data('data/testing_data.csv','test_question','test_option')
print("\rreading data...finish")
print(dm.data['train1'][:3])