Example #1
def main():
    """ 
        Main function of test.py
        Arguments:
            modelname: String, name of the model
            datapath: The testing file
            subtask: String, "A" or "B" or "C"
        Outputs:
            subtask + [subtask]/result/[modelname]/res.pred
    """
    modelname = args.modelname
    datapath = args.datapath
    subtask = args.subtask
    dm = DataManager(subtask)
    dm.load_tokenizer(
        os.path.join("subtask" + subtask, "models", modelname, "word2idx.pkl"),
        os.path.join("subtask" + subtask, "models", modelname, "idx2word.pkl"))
    dm.add_data("test", datapath)
    dm.to_sequence(40, 40)
    (test_Q, test_C), qidlist = dm.get_data("test")
    print("test_Q", test_Q[0:2])
    print("test_C", test_C[0:2])
    print("qidlist", qidlist[0:2])
    model = load_model(
        os.path.join("subtask" + subtask, "models", modelname, "model.h5"))
    result = model.predict([test_Q, test_C], batch_size=128, verbose=1)
    print("result", result[0:2])
    if subtask == "A":
        outputA(qidlist, result, modelname)
    elif subtask == "B":
        outputB(qidlist, result, modelname)
    elif subtask == "C":
        outputC(qidlist, result, modelname)
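
This example, like most below, reads a module-level `args` rather than taking parameters. A minimal argparse setup that would supply the three fields used above might look like the following sketch; the flag names are assumptions, since the original script's parser is not shown.

import argparse

# Hypothetical parser wiring for Example #1; the flag names are assumed.
parser = argparse.ArgumentParser(description='predict with a trained subtask model')
parser.add_argument('--modelname', required=True, help='name of the model directory')
parser.add_argument('--datapath', required=True, help='path to the testing file')
parser.add_argument('--subtask', required=True, choices=['A', 'B', 'C'])
args = parser.parse_args()
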
Example #2
def new_process_xy(tokenpath, path2x, path2y):
    dm = DataManager()
    dm.add_data('seed', '0samples.csv')
    dm.add_data('truth', '0samples.csv')
    dm.tokenize(230000)  # vocab size
    dm.save_tokenizer(tokenpath)
    dm.to_sequence(1)  # max length
    dm.save_sequence(path2x)
    dm.tosave_label(path2y)
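
Every example here depends on a project-local util.DataManager whose source is not included, and its signatures clearly vary across forks (to_sequence takes one, two, or keyword arguments in different examples). As orientation only, here is a rough sketch of the tokenizer-related half, assuming a Keras Tokenizer underneath; the method names match the simplest calls above, but the internals, including the label separator, are guesswork.

import _pickle as pk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

class DataManager:
    def __init__(self):
        self.data = {}

    def add_data(self, name, path, with_label=True):
        # one sample per line; the ' +++$+++ ' label separator is an assumption
        texts, labels = [], []
        with open(path, encoding='utf-8') as f:
            for line in f:
                if with_label:
                    label, text = line.strip().split(' +++$+++ ', 1)
                    labels.append(int(label))
                else:
                    text = line.strip()
                texts.append(text)
        self.data[name] = [texts, labels] if with_label else [texts]

    def tokenize(self, vocab_size=None):
        # fit one tokenizer over every dataset added so far
        self.tokenizer = Tokenizer(num_words=vocab_size)
        for entry in self.data.values():
            self.tokenizer.fit_on_texts(entry[0])

    def save_tokenizer(self, path):
        pk.dump(self.tokenizer, open(path, 'wb'))

    def load_tokenizer(self, path):
        self.tokenizer = pk.load(open(path, 'rb'))

    def to_sequence(self, maxlen):
        # replace raw texts with padded index sequences
        for entry in self.data.values():
            seqs = self.tokenizer.texts_to_sequences(entry[0])
            entry[0] = pad_sequences(seqs, maxlen=maxlen)
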
Example #3
def main():
    path_pfx = ''
    max_len = 37

    dm = DataManager()
    dm.add_data('test', os.path.join(sys.argv[1]), False, True)
    print(len(dm.data['test'][0]))
    dm.preprocessing()
    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    #dm.load_tokenizer(os.path.join(path_pfx, 'token.pkl'))
    dm.to_sequence(max_len, use_pretrain=True)
    result = predict(dm.data['test'][0], path_pfx)
    write(sys.argv[2], result)
    print('finished')
Example #4
def main(argv):
    filename = argv[1]
    output_path = argv[2]
    output_path = output_path.replace('\r', '').replace('\n', '')  # strip stray line endings
    dm = DataManager()
    dm.add_data('test_data', filename, False)
    dm.load_tokenizer('./model/token_25k.pk')
    dm.to_sequence(40)

    model = load_model('./model/00017-0.82720.h5')
    model.summary()

    val_proba = model.predict(dm.data['test_data'])
    val_classes = [1 if value > 0.5 else 0 for value in val_proba]

    out = pd.DataFrame(val_classes, columns=['label'])
    out.to_csv(output_path, index_label='id')
Example #5
def main():
    voc_size = None
    max_len = 39
    path_pfx = ''
    dm = DataManager()
    dm.add_data('train', sys.argv[1])
    #dm.add_data('semi', os.path.join(path_pfx, 'training_nolabel.txt'), False)
    #dm.add_data('test', os.path.join(path_pfx, 'testing_data.txt'), False, True)
    dm.preprocessing()

    dm.load_word2vec(os.path.join(path_pfx, 'model/word2vec'))
    #dm.load_embedding_matrix(os.path.join(path_pfx, 'word2vec.wv.vectors.npy'))
    dm.to_sequence(max_len, use_pretrain=True)
    #dm.to_bow()

    print(max_len)

    #emb_mat =  dm.get_embedding_matrix()
    emb_mat = None

    train(dm, voc_size=voc_size, max_len=max_len, emb_mat=emb_mat)
Example #6
def argument_parser(L):
    token = L[1]
    dm = DataManager()
    dm.add_data('data/data.csv')
    X = dm.get_data('data')
    Y = dm.get_data('label')
    data = X[0]
    label = Y[0]
    logpath = os.path.join('log')
    if not os.path.exists(logpath):
        os.makedirs(logpath)
    if token == 'LinR':
        MSE, MAE = train(data, label, token)
        with open('log/LinR.csv', 'w') as f:
            f.write('MSE,MAE\n')
            f.write('{},{}\n'.format(MSE, MAE))
    else:
        bin_size = int(L[2])
        acc, pre, rec, f_score = train(data, label, token, bin_size=bin_size)
        with open('log/' + token + '-bins-' + str(bin_size) + '.csv', 'w') as f:
            f.write('accuracy,precision,recall,f-score\n')
            f.write('{},{},{},{}\n'.format(acc, pre, rec, f_score))
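
Example #6 dispatches on the element after the program name, so it is presumably handed sys.argv directly. An invocation sketch (classifier tokens other than 'LinR' are not shown in this excerpt, so 'KNN' is a placeholder):

import sys

# hypothetical wiring: `python run.py LinR` or `python run.py KNN 8`
if __name__ == '__main__':
    argument_parser(sys.argv)
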
Example #7
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print ('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'train_corpus':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        raise Exception ('Implement your testing parser')


    # prepare tokenizer
    print ('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path,'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists('./model/token_25k.pk'):
        dm.save_tokenizer('./model/token_25k.pk')

    embedding_w = dm.get_vec_model('emb_1.npy',args.embedding_dim)
    dm.to_sequence(args.max_length)
    # initial model
    print('initial model...')
    model = simpleRNN(args,embedding_w)
    model.summary()

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path,'model.h5')
        if os.path.exists(path):
            print ('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" %path)
    elif args.action == 'test':
        print ('Warning : testing without loading any model')

    # training
    if args.action == 'train_corpus':
        (X,Y),(X_val,Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc', patience = 3, verbose=1, mode='max')

        checkpoint = ModelCheckpoint(filepath='./model/'+'{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max' )

        history = model.fit(X, Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            verbose=1,
                            shuffle= True,
                            callbacks=[checkpoint, earlystopping] )
        # plot_figure(history)
    # semi-supervised training
    elif args.action == 'semi':

        earlystopping = EarlyStopping(monitor='val_acc', patience = 10, verbose=1, mode='max')


        checkpoint = ModelCheckpoint(filepath='./model/semi/'+'{epoch:05d}-{val_acc:.5f}.h5',
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_acc',
                                     mode='max' )

        # repeat 10 times
        (X,Y),(X_val,Y_val) = dm.split_data('train_data', args.val_ratio)
        [semi_all_X] = dm.get_data('semi_data')
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        dm.clean_data()
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data',test_path, False)
        dm.to_sequence(args.max_length)
        semi_X, semi_Y = dm.get_semi_data('test_data', semi_pred, args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        print ('-- semi_data size: %d' %(len(semi_X)))

        model = simpleRNN(args,embedding_w)
        # train
        history = model.fit(semi_X, semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=40,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping] )

        plot_figure(history)
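
The semi-supervised branches in this and the following examples rely on DataManager.get_semi_data, which is not shown. The self-training selection it implies (keep only samples the current model labels confidently) could look roughly like the sketch below; the loss_function argument presumably switches between hard 0/1 labels and the raw probabilities.

import numpy as np

# Sketch of a DataManager method; internals are assumptions.
def get_semi_data(self, name, label, threshold, loss_function):
    # label: predicted probabilities for the unlabeled set, shape (N, 1)
    label = np.squeeze(label)
    # keep samples the model is confident about, in either direction
    index = (label > 1 - threshold) | (label < threshold)
    semi_X = self.data[name][0]
    semi_Y = np.greater(label, 0.5).astype(np.int32)
    if loss_function == 'binary_crossentropy':
        return semi_X[index, :], semi_Y[index]
    elif loss_function == 'mean_squared_error':
        return semi_X[index, :], label[index]
    raise Exception('Unknown loss function: %s' % loss_function)
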
Example #8
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    ##### read data #####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'test':
        dm.add_data('test_data', test_path, False)
    else:
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.action == 'token':
        dm.tokenize()
    else:
        # read existing tokenizer
        dm.load_tokenizer(args.token)
    # else:
    #     # create tokenizer on new data
    #     dm.tokenize()

    dm.save_tokenizer(args.token)

    # convert to sequences
    if args.action != 'token':
        dm.to_sequence(args.max_length)

    # initial model
    if args.action != 'token':
        print('initial model...')
        model = simpleRNN(args)
        print(model.summary())
        if args.load_model is not None:
            if args.action == 'train':
                print('Warning: loading an existing model and continuing training')
            path = os.path.join(load_path, 'model.h5')
            if os.path.exists(path):
                print('load model from %s' % path)
                model.load_weights(path)
            else:
                raise ValueError("Can't find the file %s" % path)
        elif args.action == 'test':
            print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=11,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')[0]
        predict = model.predict(X)
        result = [['id', 'label']]
        for i in range(len(predict)):
            a = [i]
            if predict[i][0] > 0.5:
                a.append(1)
            else:
                a.append(0)
            #a.append(predict[i][0])  #test
            #a.append(predict[i])
            result.append(a)
        cout = csv.writer(open(args.result_path, 'w'))
        cout.writerows(result)
        # implement after the output format is confirmed


    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=11,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times

        #for i in range(10):
        # label the semi-data
        semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
        semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                          args.threshold, args.loss_function)
        semi_X = np.concatenate((semi_X, X))
        semi_Y = np.concatenate((semi_Y, Y))
        #print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
        # train
        history = model.fit(semi_X,
                            semi_Y,
                            validation_data=(X_val, Y_val),
                            epochs=20,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        if os.path.exists(save_path):
            print('load model from %s' % save_path)
            model.load_weights(save_path)
        else:
            raise ValueError("Can't find the file %s" % path)
Example #9
def main():
	# limit gpu memory usage
	train_path = argv[1]
	semi_path = argv[2]

	#K.set_session(get_session(gpu_fraction))

	#####read data#####

	dm = DataManager()
	print ('Loading data...')
	if action == 'train':
		dm.add_data('train_data', train_path, True)
		#dm.add_data('semi_data', semi_path, False)
	elif action == 'semi':
		dm.add_data('train_data', train_path, True)
		dm.add_data('semi_data', semi_path, False)
	else:
		raise Exception ('Implement your testing parser')

	# prepare tokenizer
	print ('get Tokenizer...')
	if not os.path.exists(tokenizer_save_path):
		dm.tokenize(20000)
		dm.save_tokenizer(tokenizer_save_path)
	else:
		dm.load_tokenizer(tokenizer_save_path)

	
	# Word2Vec
	print ('get Word2Vec...')
	data_dic = dm.get_data()
	tokenizer = dm.get_tokenizer()
	#vocab_size = len(tokenizer.word_index)+1
	#data_list = data_dic['train_data'][2]+data_dic['semi_data'][1]
	#data_list = data_dic['train_data']
	#w2v_model = Word2Vec(data_list, size=256, min_count=5,iter=16,workers=16)
	#w2v_model.save(word2vec_save_path)
	#w2v_model = Word2Vec.load(word2vec_save_path)
	w2v_model = pk.load(open('emb.pkl', 'rb'))

	# convert to sequences
	dm.to_sequence(max_length)
	#dm.to_bow()

	# initial model
	print ('initial model...')
	model = simpleRNN()    
	print (model.summary())
	labelnum = [] 

	# training
	if action == 'train':
		(X,Y),(X_val,Y_val) = dm.split_data('train_data', val_ratio)
		X = embedding_vector(X, w2v_model, tokenizer)
		X_val = embedding_vector(X_val, w2v_model, tokenizer)

		earlystopping = EarlyStopping(monitor='val_acc', patience = 15, verbose=1, mode='max')
		checkpoint = ModelCheckpoint(filepath=model_save_path,verbose=1,save_best_only=True,monitor='val_acc',mode='max' )
		history = model.fit(X, Y, validation_data=(X_val, Y_val), epochs=nb_epoch, batch_size=batch_size, callbacks=[checkpoint, earlystopping])
	# semi-supervised training
	elif action == 'semi':

		(X,Y),(X_val,Y_val) = dm.split_data('train_data', val_ratio)
		semi_all_X = dm.get_data()['semi_data'][0]
		X = embedding_vector(X, w2v_model, tokenizer)
		X_val = embedding_vector(X_val, w2v_model, tokenizer)
		semi_all_X = embedding_vector(semi_all_X,w2v_model,tokenizer)

		X = np.array(X)
		X_val = np.array(X_val)
		semi_all_X = np.array(semi_all_X)

		earlystopping = EarlyStopping(monitor='val_acc', patience = 5, verbose=1, mode='max')
		checkpoint = ModelCheckpoint(filepath=model_save_path,verbose=1,save_best_only=True,monitor='val_acc',mode='max')
		# repeat 10 times
		for i in range(10):
			# label the semi-data
			semi_pred = model.predict(semi_all_X, batch_size=1024, verbose=True)
			semi_X, semi_Y = getsemidata(semi_all_X,semi_pred,threshold)
			labelnum.append(semi_X.shape)
			semi_X = np.concatenate((semi_X, X),axis=0)
			semi_Y = np.concatenate((semi_Y, Y),axis=0)
			print ('-- iteration %d  semi_data size: %d' %(i+1,len(semi_X)))
			# train
			history = model.fit(semi_X, semi_Y,validation_data=(X_val, Y_val),epochs=2,batch_size=batch_size,callbacks=[checkpoint, earlystopping] )

			if os.path.exists(model_save_path):
				print('load model from %s' % model_save_path)
				model.load_weights(model_save_path)
			else:
				raise ValueError("Can't find the file %s" % model_save_path)
	
Example #10
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)
        print('load_path:', load_path)

    ##### read data #####
    dm = DataManager()
    w2v_path = os.path.join(args.save_dir, 'word2vec')
    print(w2v_path)

    if args.action == 'train':
        print('Loading data...')
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', args.test_path)

        test_data = dm.get_test_data('test_data')
        train_data = dm.get_data('train_data')
        semi_data = dm.get_data('semi_data')

        all_text = np.concatenate((train_data[0], semi_data[0], test_data),
                                  axis=0)
        print('Number of all_text:', all_text.shape[0])
        #print('Text sample:',all_text[0])

        print('Converting texts to words sequence...')
        text2word = []

        with_filter = 0
        if with_filter:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(
                        text,
                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                        lower=True,
                        split=" "))
        if not with_filter:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(text,
                                          filters='',
                                          lower=True,
                                          split=" "))

        print('Word sequence sample:', text2word[0])

        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
        else:
            print('Building word2vec model...')
            word_vec = gensim.models.Word2Vec(text2word,
                                              size=128,
                                              min_count=15)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            if not os.path.isdir(save_path):
                os.makedirs(save_path)
            if not os.path.exists(os.path.join(save_path, 'word2vec')):
                word_vec.save((os.path.join(save_path, 'word2vec')))

        print('Converting train_data to vectors...')
        index_data = []
        i = 0
        for line in train_data[0]:
            index_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    #print(word ,word_vec.wv.vocab[word].index)
                    index_data[i].append(word_vec.wv.vocab[word].index)
            i += 1

        embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))

        for i in range(len(word_vec.wv.vocab)):
            embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        index_data = pad_sequences(index_data, args.max_length)
    else:
        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))

            for i in range(len(word_vec.wv.vocab)):
                embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
        else:
            print('Cannot load the w2v model; please train a w2v model first!')

    #print ('get Tokenizer...')
    #if args.load_model is not None:
    #    # read exist tokenizer
    #    dm.load_tokenizer(os.path.join(load_path,'token.pk'))
    #else:
    #    # create tokenizer on new data
    #    dm.tokenize(args.vocab_size)
    #
    #if not os.path.isdir(save_path):
    #    os.makedirs(save_path)
    #if not os.path.exists(os.path.join(save_path,'token.pk')):
    #    dm.save_tokenizer(os.path.join(save_path,'token.pk'))
    #
    # mat_train_data = dm.tokenizer.texts_to_matrix(train_data[0], mode='count')
    # mat_test_data = dm.tokenizer.texts_to_matrix(test_data, mode='count')

    # convert to sequences
    # dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    #model = bow_model(args,mat_train_data)
    model = simpleRNN(args, embedding_matrix)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        #path = os.path.join(load_path,'model.h5')
        if os.path.exists(load_path):
            print('load model from %s' % load_path)
            model.load_weights(load_path)
        else:
            raise ValueError("Can't find the file %s" % load_path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        #(X,Y),(X_val,Y_val) = dm.split_data('train_data', args.val_ratio)
        X, X_val, Y, Y_val = train_test_split(index_data,
                                              train_data[1],
                                              test_size=0.33,
                                              random_state=42)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        print(history.history.keys())
        print('Val_acc:', history.history['val_acc'])
        print('Train_acc:', history.history['acc'])

    # testing
    elif args.action == 'test':
        dm.add_test_data('test_data', args.test_path)
        test_data = dm.get_test_data('test_data')

        # Convert to vectors
        index_test_data = []
        i = 0
        for line in test_data:
            index_test_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    #print(word ,word_vec.wv.vocab[word].index)
                    index_test_data[i].append(word_vec.wv.vocab[word].index)
            i += 1

        index_test_data = pad_sequences(index_test_data, args.max_length)

        if not os.path.exists(args.result_path):
            os.makedirs(args.result_path)

        csv_path = os.path.join(args.result_path, 'prediction.csv')

        print("Predicting testing data...")
        Y_pred = model.predict(index_test_data)
        Y_pred = np.round(Y_pred)
        print('Saving result csv to', csv_path)
        with open(csv_path, 'w') as f:
            f.write('id,label\n')
            for i, v in enumerate(Y_pred):
                f.write('%d,%d\n' % (i, v))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')

        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(5):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=2048,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=256,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % path)
Example #11
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)
        # raise Exception ('Implement your testing parser')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)
    # dm.to_bow()

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        plot(history, args.model)
        # plot_model(model, to_file='./img/structure.png')

    # testing
    elif args.action == 'test':
        X = dm.get_data('test_data')
        print('Predict testing data...')
        result = model.predict(X)
        print('Save result...')
        saveResult(result, args.result_path)
        # raise Exception ('Implement your testing function')

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % path)
Example #12
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = args.save_dir
    if args.load_model is not None:
        load_path = args.save_dir

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', test_path)
    else:
        dm.add_test_data('test_data', test_path)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        print(model.summary())
        [test_x] = dm.get_data('test_data')
        classes = model.predict(test_x, batch_size=32)
        with open(args.output_path, "w", encoding='utf-8') as f:
            spamwriter = csv.writer(f, delimiter=',')
            spamwriter.writerow(['id', 'label'])
            for i in range(len(classes)):
                if classes[i][0] < 0.5:
                    result = 0
                else:
                    result = 1
                spamwriter.writerow([str(i), str(result)])

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        [test_x] = dm.get_data('test_data')
        semi_all_X = np.concatenate((semi_all_X, test_x), axis=0)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(16):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % path)
Example #13
def main():
    dm = DataManager()
    dm.add_data('train_data', train_path, True)
    dm.add_data('semi_data', semi_path, False)

    print('Get Tokenizer...')
    dm.load_tokenizer('./token/token.pk')

    embedding_mat = dm.to_sequence(40, action)

    print('Initial model...')
    if action == 'train':
        model = RNN(embedding_mat)
        print(model.summary())
    elif action == 'semi':
        model = load_model('./model/model1.hdf5')
        print(model.summary())

    if action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=30,
                                      verbose=1,
                                      mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        model.fit(X,
                  Y,
                  validation_data=(X_val, Y_val),
                  epochs=80,
                  batch_size=512,
                  callbacks=[checkpoint, earlystopping])

    elif action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', 0.2)
        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')
        checkpoint = ModelCheckpoint(filepath='./model/model_semi.hdf5',
                                     verbose=1,
                                     save_best_only=True,
                                     monitor='val_acc',
                                     mode='max')
        for i in range(10):
            semi_pred = model.predict(semi_all_X, batch_size=2048, verbose=1)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred, 0.1)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            model.fit(semi_X,
                      semi_Y,
                      validation_data=(X_val, Y_val),
                      epochs=2,
                      batch_size=512,
                      callbacks=[checkpoint, earlystopping])
            print('load model from ./model/model_semi.hdf5')
            model = load_model('./model/model_semi.hdf5')
Example #14
import sys 
import keras
import _pickle as pk
import numpy as np
from keras.models import Model, Sequential, load_model

from util import DataManager

# argv settings
test_path = sys.argv[1]
output_path = sys.argv[2]
mode = sys.argv[3]

# load data
dm = DataManager()
dm.add_data('test_data', test_path, False)


if mode == 'private':
  # tokenizer
  dm.load_tokenizer('./token/token.pk')
  # load model
  model = load_model('./model/model1.hdf5')
elif mode == 'public':
  # tokenizer
  dm.load_tokenizer('./token/token_filter.pk')
  # load model
  model = load_model('./model/model2.hdf5')
else:
  raise ValueError("mode must be 'private' or 'public'")

dm.to_sequence(40, 'test')
test_all_x = dm.get_data('test_data')