Ejemplo n.º 1
0
def process_predict(best_model_name='random_grid_1'):
    """
    Predict on the test data with an already trained model.

    The test data is loaded, passed through the data preparation step,
    and handed to the trainer identified by ``best_model_name``; a sample of
    the labels and predictions is then printed.

    Args:
        best_model_name: Name of the trained model to predict with. Defaults
            to 'random_grid_1', the value that was previously hard-coded, so
            existing callers keep their behavior.
    """
    # step 1: fetch and load the raw test set (or the real data to predict on)
    myprint("Ready to load test data set ...")
    dataloader = myload.DataLoader()
    test_set = dataloader.load_test_set()

    # Known open issue: if a categorical text field in the test data has a
    # different number of categories than in the training set, the number of
    # feature columns will differ. This does not matter with large data
    # sets, but predicting on a few dozen rows or a single row can raise an
    # error (still to be solved).
    myprint("Load test data complete.")

    # step 2: clean the data
    myprint("Ready to prepare Data ... ")
    preparer = myprepare.PrepareData(test_set)
    preparer.prepare_data()
    # The preparer exposes its output under "train_*" attribute names even
    # though it was fed the test set here.
    test_prepared = preparer.train_prepared
    test_label = preparer.train_label
    myprint("Prepare data complete.")

    # step 3: predict with the chosen (best) trained model
    myprint("Ready to predict.")
    trainer = mytrain.TrainModel(best_model_name)
    # The second return value (presumably an RMSE score, judging by the
    # original variable name) is not used by this function.
    test_pred, _test_rmse_score = trainer.predict(test_prepared, test_label)
    myprint("Predict complete.")
    myprint("")

    myprint("Show or use predict result : ")
    # step 4: make the prediction result available to downstream consumers;
    # for now just print labels next to predictions, then features + both.
    print(np.c_[test_label[:20], test_pred[:20]])
    print(np.c_[test_prepared[:5], test_label[:5], test_pred[:5]])
Ejemplo n.º 2
0
 def divide_train_test_data(self):
     """
     Randomly split the complete data set into a train part and a test part.

     The data, labels and both dictionaries come from
     ``prepare_data.PrepareData().read_data()``. 20% of the samples are held
     out as test data using a fixed seed of 0. The held-out data and labels
     are written to ``self.test_data_path`` and ``self.test_labels_path``
     (datasets "testdata" and "testlabels") for later evaluation.

     Returns:
         Tuple ``(train_data, train_labels, test_data, test_labels,
         dimensions, dictionary, reverse_dictionary)`` where ``dimensions``
         is ``len(dictionary)``.
     """
     def _dump(path, dataset_name, array):
         # Open the file at `path` in "w" mode and store `array` as one dataset.
         with h5.File(path, "w") as handle:
             handle.create_dataset(dataset_name, array.shape, data=array)

     held_out_fraction = 0.2
     rng_seed = 0

     reader = prepare_data.PrepareData()
     all_samples, all_labels, dictionary, reverse_dictionary = reader.read_data()

     # Seed NumPy's global generator as well as the splitter itself.
     np.random.seed(rng_seed)
     dimensions = len(dictionary)

     split_parts = train_test_split(all_samples,
                                    all_labels,
                                    test_size=held_out_fraction,
                                    random_state=rng_seed)
     train_data, test_data, train_labels, test_labels = split_parts

     # Persist the held-out portion so it can be evaluated separately.
     _dump(self.test_data_path, "testdata", test_data)
     _dump(self.test_labels_path, "testlabels", test_labels)

     return (train_data, train_labels, test_data, test_labels,
             dimensions, dictionary, reverse_dictionary)
Ejemplo n.º 3
0
def process_training_model():
    """
    Train a series of candidate models so the best one can be picked.

    This is meant to be re-run and tuned repeatedly: it loads and prepares
    the training set, trains three default-parameter models, one hand-tuned
    random forest and one grid-searched random forest, and finally tells the
    user where the trained models were saved.
    """
    # step 1: fetch and load the training data set
    myprint("Ready to load train data set ...")
    train_set = myload.DataLoader().load_train_set()
    myprint("Load train data complete.")

    # step 2: clean the data (based on the feature analysis; to be adjusted
    # according to the resulting performance)
    myprint("Ready to prepare Data ... ")
    preparer = myprepare.PrepareData(train_set)
    preparer.prepare_data()
    features, labels = preparer.train_prepared, preparer.train_label
    myprint("Prepare data complete.")

    # step 3: tune parameters, train models and record their performance
    # scores until the best model is found
    myprint("Ready to training models and compare performance score ... ")

    # Baseline models: trained with default parameters and cross-validated.
    baseline_models = (
        ("lin_model_1", "linear"),
        ("decision_model_1", "decisiontree"),
        ("random_model_1", "randomforest"),
    )
    for model_name, model_kind in baseline_models:
        estimator = get_default_model(model_kind)
        mytrain.TrainModel(model_name, estimator, features, labels).train_model()

    # A model with custom parameters, trained and cross-validated likewise.
    custom_forest = RandomForestRegressor(n_estimators=10, max_features=4, bootstrap=False)
    mytrain.TrainModel("random_model_s1", custom_forest, features, labels).train_model()

    # GridSearchCV training (takes one to two minutes).
    search_space = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]
    grid_trainer = mytrain.GridSearchModel(
        "random_grid_1",
        get_default_model("randomforest"),
        search_space,
        features,
        labels,
    )
    # Passing True would show the score of every parameter combination.
    grid_trainer.train_model()

    closing_lines = (
        "",
        "Training and compare models complete. ",
        "All trained models are saved in folder 'trainmodels' with the same name you provided. ",
        "Please choose the best model for actual predict. Thanks ! ",
        "",
    )
    for line in closing_lines:
        myprint(line)
Ejemplo n.º 4
0
        'batch_size': batch_size,
        'units': units,
        'embedding_size': embedding_size,
        'dropout': dropout,
        'spatial_dropout': spatial_dropout,
        'recurrent_dropout': recurrent_dropout,
        'learning_rate': learning_rate
    }

    # Extract and process workflows
    connections = extract_workflow_connections.ExtractWorkflowConnections()
    workflow_paths, compatible_next_tools, standard_connections = connections.read_tabular_file(
        workflows_path)
    # Process the paths from workflows
    print("Dividing data...")
    data = prepare_data.PrepareData(maximum_path_length, test_share)
    train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred, l_tool_freq, l_tool_tr_samples = data.get_data_labels_matrices(
        workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools,
        standard_connections)
    # find the best model and start training
    predict_tool = PredictTool(num_cpus)
    # start training with weighted classes
    print("Training with weighted classes and samples ...")
    results_weighted = predict_tool.find_train_best_network(
        config, reverse_dictionary, train_data, train_labels, test_data,
        test_labels, n_epochs, class_weights, usage_pred, standard_connections,
        l_tool_freq, l_tool_tr_samples)
    utils.save_model(results_weighted, data_dictionary, compatible_next_tools,
                     trained_model_path, class_weights, standard_connections)
    end_time = time.time()
    print()