# Example no. 1
def train(lang='pt'):
    """Train (or resume training of) the model for *lang* on the full dataset.

    Loads tokenized data and a pretrained fastText embedding matrix, resumes
    from the latest checkpoint when one exists, and fits with a balanced
    sampler plus checkpoint/early-stopping/LR-reduction/logging callbacks.

    Args:
        lang: language code used to select data, embeddings and checkpoints.
    """
    params = PARAMS.copy()
    initial_epoch = 0
    X, Y = util.get_X_Y(data_type='keras_tokenized_tri', lang=lang, file_type="dump")
    X = np.asarray(X)
    # The pretrained embedding matrix fixes both vocabulary size and
    # embedding dimensionality for the model.
    params['embedding_matrix'] = load_embedding_matrix(name="fasttext_sg_tri_8", tokenizer='keras_tokenized_tri', lang=lang, model_type="dump")
    params["vocab_size"] = params['embedding_matrix'].shape[0]
    params["embedding_dim"] = params['embedding_matrix'].shape[1]

    # exist_ok avoids the check-then-create race of the exists()/makedirs pair.
    os.makedirs(PATH, exist_ok=True)
    os.makedirs(PATH + 'log_dir', exist_ok=True)

    #params["loss"] = util.focal_loss(gamma=5.,alpha=1588)
    lastest_model = load_lastest(lang=lang)
    if lastest_model is None:  # identity check, not '== None' (PEP 8)
        model, params = generate_model(params)
    else:
        # load_lastest returns (model, epoch_to_resume_from).
        model, initial_epoch = lastest_model

    print(model.metrics_names)

    params['sampler'] = FunctionSampler(func=balance_dataset,
                                        kw_args={'cut_off': 0.5,
                                                 'random_state': 42})

    data_generator = DataGenerator(X, Y, lang=lang, process_x=process_x, process_y=process_y,
                                   batch_size=PARAMS['batch_size'], sampler=params['sampler'])
    #data_generator.remove_reliable_0(pct=1.0)
    validation_data = data_generator.get_validation_data()
    # Build the first batch once instead of regenerating it for each print.
    first_x, first_y = data_generator[0]
    print('data_generator.x: ', first_x[0:5])
    print('data_generator.y: ', first_y[0:5])

    #params["class_weights"]= data_generator.get_classes_weights()

    reduce_lr = ReduceLROnPlateau(monitor='val_categorical_accuracy', factor=0.2, patience=3, verbose=1)
    early_stopping = EarlyStopping(monitor='val_categorical_accuracy', min_delta=0.02, patience=10, verbose=1)
    csv_logger = CSVLogger(PATH + 'traning.log', append=True)
    tensorboard_callback = TensorBoard(log_dir=PATH + 'log_dir', batch_size=params["batch_size"])
    model_checkpoint = ModelCheckpoint(filepath=PATH + 'weights-{epoch:03d}-{val_categorical_accuracy:.4f}-' + lang + '.hdf5',
                                       monitor='val_categorical_accuracy',
                                       verbose=1,
                                       mode='max')
    params["callbacks"] = [model_checkpoint, early_stopping, tensorboard_callback, csv_logger, reduce_lr]

    # NOTE(review): "class_weights" must already be present in PARAMS — the
    # line computing it above is commented out; otherwise this raises KeyError.
    # TODO confirm against PARAMS' definition.
    model.fit_generator(data_generator,
                        epochs=params["epochs"],
                        verbose=1,
                        callbacks=params["callbacks"],
                        validation_data=validation_data,
                        #workers=7,
                        #use_multiprocessing=True,
                        class_weight=params["class_weights"],
                        initial_epoch=initial_epoch)
def train(lang='pt'):
    """Train and evaluate the model with stratified k-fold cross-validation.

    For each fold: builds a fresh model, fits it on the training split with a
    balanced sampler and a cyclic learning rate, evaluates on the held-out
    split, saves the fold's weights, then tears everything down to free memory.

    NOTE(review): this redefines ``train`` and shadows the single-run variant
    defined above it in this file — consider renaming (e.g. ``train_kfold``).

    Args:
        lang: language code used to select data, embeddings and checkpoints.
    """
    params = PARAMS.copy()
    initial_epoch = 0
    X, Y = util.get_X_Y(data_type='keras_tokenized_tri', lang=lang, file_type="dump")
    X = np.asarray(X)
    # The pretrained embedding matrix fixes both vocabulary size and
    # embedding dimensionality for the model.
    params['embedding_matrix'] = load_embedding_matrix(name="fasttext_sg_tri_8", tokenizer='keras_tokenized_tri', lang=lang, model_type="dump")
    params["vocab_size"] = params['embedding_matrix'].shape[0]
    params["embedding_dim"] = params['embedding_matrix'].shape[1]

    # exist_ok avoids the check-then-create race of the exists()/makedirs pair.
    os.makedirs(PATH, exist_ok=True)
    os.makedirs(PATH + 'log_dir', exist_ok=True)

    kfold_count = 1
    # NOTE(review): StratifiedKFold.split expects 1-D class labels; assumes Y
    # holds integer labels here (one-hot would raise) — TODO confirm.
    skf = StratifiedKFold(n_splits=params['k-folds'], shuffle=True)
    for train_index, test_index in skf.split(X, Y):
        print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]

        # Fresh random_state per fold so each fold is balanced differently.
        params['sampler'] = FunctionSampler(func=balance_dataset,
                                            kw_args={'cut_off': 0.5,
                                                     'random_state': np.random.randint(0, 100000)})

        # A brand-new model per fold; no checkpoint resuming in k-fold mode.
        model, params = generate_model(params)

        print(model.metrics_names)

        data_g_train = DataGenerator(X_train, Y_train, lang=lang, process_x=process_x, process_y=process_y, sampler=params['sampler'], batch_size=PARAMS['batch_size'], separate_val=False)
        data_g_val = DataGenerator(X_test, Y_test, lang=lang, process_x=process_x, process_y=process_y, batch_size=PARAMS['batch_size'], separate_val=False)
        # Build the first batch once instead of regenerating it for each print.
        first_x, first_y = data_g_train[0]
        print('data_generator.x: ', first_x[0:5])
        print('data_generator.y: ', first_y[0:5])

        #params["class_weights"]= data_generator.get_classes_weights()

        # NOTE(review): reduce_lr, early_stopping and model_checkpoint are
        # constructed but intentionally(?) left out of the callbacks list
        # below — only TensorBoard, CSV logging and the cyclic LR are active.
        # TODO confirm this is deliberate; otherwise drop these constructions.
        reduce_lr = ReduceLROnPlateau(monitor='val_categorical_accuracy', factor=0.2, patience=3, verbose=1)
        early_stopping = EarlyStopping(monitor='val_categorical_accuracy', min_delta=0.02, patience=10, verbose=1)
        csv_logger = CSVLogger(PATH + 'traning.log', append=True)
        tensorboard_callback = TensorBoard(log_dir=PATH + 'log_dir', batch_size=params["batch_size"])
        model_checkpoint = ModelCheckpoint(filepath=PATH + 'weights-{epoch:03d}-{val_categorical_accuracy:.4f}-' + lang + '.hdf5',
                                           monitor='val_categorical_accuracy',
                                           verbose=1,
                                           mode='max')
        clr = CyclicLR(base_lr=1e-3, max_lr=2e-3,
                       step_size=300., mode='exp_range',
                       gamma=0.99994)

        params["callbacks"] = [tensorboard_callback, csv_logger, clr]

        # NOTE(review): "class_weights" must already be present in PARAMS —
        # the line computing it above is commented out. TODO confirm.
        model.fit_generator(data_g_train,
                            epochs=params["epochs"],
                            verbose=1,
                            callbacks=params["callbacks"],
                            #workers=7,
                            #use_multiprocessing=True,
                            class_weight=params["class_weights"],
                            initial_epoch=initial_epoch)

        # Batched prediction over the held-out fold; slice assignment copes
        # with a short final batch because the slice end is clipped.
        batch_val = 1000
        data_g_val.set_batch_size(batch_val)
        y_pred = np.zeros(Y_test.shape)
        y_val = np.zeros(Y_test.shape)
        for i, (x, y) in enumerate(data_g_val):
            y_pred[i*batch_val:(i+1)*batch_val] = np.argmax(model.predict(x), axis=1)
            y_val[i*batch_val:(i+1)*batch_val] = np.argmax(y, axis=1)
        result = util.evaluate(y_val, y_pred)
        print('Model '+NAME+' val score on '+lang+', k-fold-'+str(kfold_count)+': ', result)
        model.save(PATH+'weights-{epoch:03d}-kfold{fold}-{result:.4f}-{lang}.hdf5'.format(epoch=params["epochs"], result=result, lang=lang, fold=kfold_count))
        # Free fold-local objects and the TF graph before the next fold to
        # keep peak memory bounded across folds.
        del data_g_train, data_g_val
        del model
        del X_train, Y_train, X_test, Y_test
        K.clear_session()
        gc.collect()
        kfold_count += 1