def pred_eval():
    """Compute the root mean squared error of the three models' predictions.

    The linear and decision-tree predictions are scored against the labels
    returned by ``data_preprocessing.data_preprocess()``; the random-forest
    predictions are scored against the labels returned by
    ``data_preprocessing.rfdata()``.

    Returns:
        tuple: ``(lin_rmse, tree_rmse, final_rmse)`` -- the RMSE of the
        linear, decision-tree and random-forest predictions, in that order.
    """
    logging.info("Evaluating the Housing Value of all the three Models....")
    _, housing_labels = data_preprocessing.data_preprocess()
    (
        Linear_Model_prediction,
        DT_Model_prediction,
        RF_Model_prediction,
    ) = prediction.predict()

    lin_mse = mean_squared_error(housing_labels, Linear_Model_prediction)
    lin_rmse = np.sqrt(lin_mse)

    tree_mse = mean_squared_error(housing_labels, DT_Model_prediction)
    tree_rmse = np.sqrt(tree_mse)

    # The random-forest labels come from rfdata(), not data_preprocess().
    _, y_test = data_preprocessing.rfdata()
    final_mse = mean_squared_error(y_test, RF_Model_prediction)
    final_rmse = np.sqrt(final_mse)

    return lin_rmse, tree_rmse, final_rmse
def PredictByAllModel():
    """Blend the averaged model with the neural network and write a submission.

    Both models are fitted on the arrays returned by ``load_data()``.  Their
    test-set predictions are combined with fixed weights (0.9 for the averaged
    model, 0.1 for the network), rescaled with the stored ``SalePrice``
    min/max, passed through ``np.expm1`` and written to ``submission11.csv``.
    The fitted network is also saved to ``model.h5``.

    Returns:
        None. The first ten submission rows are printed.
    """
    # Only the test frame is needed here: its index supplies the submission ids.
    _, test_df, _, _ = data_preprocess()
    x_train, y_train, x_test, oridata = load_data()

    ave_model = Ave_Model()
    ave_model.fit(x_train, y_train)
    y_predict1 = np.array(ave_model.predict(x_test))

    deep_learning_model = DeepLearningModel(x_train.shape[1])
    deeplearningModel = deep_learning_model.build_model()
    deeplearningModel.fit(x_train, y_train, epochs=300)
    deeplearningModel.save('model.h5')

    # Predict the whole test set in one call instead of one call per row.
    # NOTE(review): assumes predict() accepts a 2-D batch and returns one
    # value per row (as a Keras-style model does) -- confirm for this model.
    y_predict2 = np.array(deeplearningModel.predict(x_test)).reshape((-1,))

    # Fixed blending weights: averaged model vs. neural network.
    w1 = 0.9
    w2 = 0.1
    y_predict = y_predict1 * w1 + y_predict2 * w2

    # Rescale with the stored SalePrice range, then apply expm1.
    # NOTE(review): presumably undoes min-max scaling and log1p applied
    # during preprocessing -- confirm against data_preprocess().
    y_predict *= (oridata['SalePrice']['max'] - oridata['SalePrice']['min'])
    y_predict += oridata['SalePrice']['min']
    y_predict = np.expm1(y_predict)

    submission_df = pd.DataFrame(data={'Id': test_df.index, 'SalePrice': y_predict})
    print(submission_df.head(10))
    submission_df.to_csv('submission11.csv', index=False)
def predict():
    """Return the linear, decision-tree and random-forest predictions.

    The first two models predict on the features from
    ``data_preprocessing.data_preprocess()``; the random forest predicts on
    the features from ``data_preprocessing.rfdata()``.
    """
    prepared_features, _ = data_preprocessing.data_preprocess()
    linear_model, tree_model, forest_model = train_model.model_train()

    linear_pred = linear_model.predict(prepared_features)
    tree_pred = tree_model.predict(prepared_features)

    rf_features, _ = data_preprocessing.rfdata()
    forest_pred = forest_model.predict(rf_features)

    return linear_pred, tree_pred, forest_pred
def model_train():
    """Build the linear, decision-tree and random-forest models.

    All three builders receive the same features and labels, taken from
    ``data_preprocessing.data_preprocess()``.

    Returns:
        tuple: ``(linear, dt, rnd)`` -- the objects returned by
        ``linear_model_``, ``dtreg`` and ``rnd_forest``, in that order.
    """
    logging.info("Running the Main function for training model....")
    housing_prepared, housing_labels = data_preprocessing.data_preprocess()

    linear = linear_model_(housing_prepared, housing_labels)
    dt = dtreg(housing_prepared, housing_labels)
    rnd = rnd_forest(housing_prepared, housing_labels)

    return linear, dt, rnd
def PredictByAve_Model():
    """Fit the averaged model, print its CV score and write ``submission7.csv``.

    Predictions are rescaled with the stored ``SalePrice`` min/max and passed
    through ``np.expm1`` before being written out.
    """
    # Only the test frame is used below (its index becomes the Id column).
    _, test_df, _, _ = data_preprocess()
    x_train, y_train, x_test, oridata = load_data()

    ave_model = Ave_Model()
    cv_scores = rmse_cv(ave_model, x_train, y_train)
    print(cv_scores.mean())

    ave_model.fit(x_train, y_train)
    y_predict = np.array(ave_model.predict(x_test))

    # Rescale in place with the stored SalePrice range, then apply expm1.
    price_range = oridata['SalePrice']['max'] - oridata['SalePrice']['min']
    y_predict *= price_range
    y_predict += oridata['SalePrice']['min']
    y_predict = np.expm1(y_predict)

    submission_df = pd.DataFrame(
        data={'Id': test_df.index, 'SalePrice': y_predict}
    )
    print(submission_df.head(10))
    submission_df.to_csv('submission7.csv', index=False)
def pred_eval():
    """Return the RMSE of the linear, decision-tree and random-forest predictions.

    The first two are scored against the labels from
    ``data_preprocessing.data_preprocess()``, the last against the labels
    from ``data_preprocessing.rfdata()``.
    """
    _, labels = data_preprocessing.data_preprocess()
    linear_pred, tree_pred, forest_pred = prediction.predict()

    linear_rmse = np.sqrt(mean_squared_error(labels, linear_pred))
    decision_tree_rmse = np.sqrt(mean_squared_error(labels, tree_pred))

    _, forest_labels = data_preprocessing.rfdata()
    forest_rmse = np.sqrt(mean_squared_error(forest_labels, forest_pred))

    return linear_rmse, decision_tree_rmse, forest_rmse
def predict():
    """Return the predictions of the linear, decision-tree and random-forest models.

    The linear and decision-tree models predict on the features returned by
    ``data_preprocessing.data_preprocess()``; the random forest predicts on
    the features returned by ``data_preprocessing.rfdata()``.  The
    predictions are only returned; this function writes no file.

    Returns:
        tuple: ``(Linear_Model_prediction, DT_Model_prediction,
        RF_Model_prediction)``.
    """
    logging.info("Predicting the Median Housing Value....")
    # Labels are not needed for prediction, so they are discarded.
    housing_prepared, _ = data_preprocessing.data_preprocess()
    linear, dt, rnd = train_model.model_train()

    Linear_Model_prediction = linear.predict(housing_prepared)
    DT_Model_prediction = dt.predict(housing_prepared)

    X_test_prepared, _ = data_preprocessing.rfdata()
    RF_Model_prediction = rnd.predict(X_test_prepared)

    return Linear_Model_prediction, DT_Model_prediction, RF_Model_prediction
'parks_percent_change_from_baseline',
'transit_stations_percent_change_from_baseline',
# NOTE(review): the next name has a trailing space inside the quotes --
# confirm it matches the actual column name in the data.
'residential_percent_change_from_baseline '
]

# Column roles handed to data_preprocess via its `column_identifier` argument.
column_identifier = {
    'spatial id level 1': 'country_code',
    'temporal id level 1': 'Date',
    'temporal covariates': temporal_covariates,
    'target': target_name
}

# Every temporal covariate gets the same history length, max_history.
history_length = {key: max_history for key in temporal_covariates}

# Run preprocessing on a copy so `data` itself is not passed in.
historical_data_list = data_preprocess(
    data=data.copy(),
    forecast_horizon=forecast_horizon,
    history_length=history_length,
    column_identifier=column_identifier,
    spatial_scale_table=None,
    spatial_scale_level=1,
    temporal_scale_level=1,
    target_mode='normal',
    imputation=False,
    aggregation_mode='mean',
    augmentation=False,
    futuristic_covariates=None,
    future_data_table=None,
    save_address='',  # <------------ save address
    verbose=1)
def __init__(self):
    """Store the result of ``pre.data_preprocess().clean()`` and reset file state.

    ``file_init`` starts as ``False`` and ``file_name`` holds the
    ``'feature_outputs/'`` prefix.
    """
    self.dataset = pre.data_preprocess().clean()
    self.file_init = False
    self.file_name = 'feature_outputs/'
"cure": ("./data/cure_and_prevention-add_text.jsonl", "./data/cure_and_prevention.pkl"),
}

# REDO_DATA_FLAG = True
# When True, preprocessing is re-run even if the processed pickle already exists.
REDO_DATA_FLAG = False
# NOTE(review): REDO_FLAG and RETRAIN_FLAG are not read in the visible part of
# this script -- presumably used further down; confirm.
REDO_FLAG = True
RETRAIN_FLAG = True
# REDO_FLAG = False

# We will save all the tasks' and subtasks' results and model configs in this dictionary
all_task_results_and_model_configs = dict()
# We will save the list of question_tags AKA subtasks for each event AKA task in this dict
all_task_question_tags = dict()
# Each entry maps a task name to (raw input file, preprocessed pickle path).
for taskname, (data_in_file, processed_out_file) in task_type_to_datapath_dict.items():
    # Build the preprocessed pickle only when it is missing or a redo is forced.
    if not os.path.exists(processed_out_file) or REDO_DATA_FLAG:
        data_preprocess(data_in_file, processed_out_file)
    else:
        logging.info(f"Preprocessed data for task {taskname} already exists at {processed_out_file}")

    # Read the data statistics
    task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(processed_out_file)

    # We will store the list of subtasks for which we train the classifier
    tested_tasks = list()
    logging.info(f"Training Mutlitask BERT Entity Classifier model on {processed_out_file}")
    # output_dir = os.path.join("results", "multitask_bert_entity_classifier", taskname)
    # NOTE: After fixing the USER and URL tags
    output_dir = os.path.join("results", "multitask_bert_entity_classifier_fixed", taskname)
    make_dir_if_not_exists(output_dir)
    # Per-task output files live inside output_dir.
    results_file = os.path.join(output_dir, "results.json")
    model_config_file = os.path.join(output_dir, "model_config.json")
def model_train():
    """Return the linear, decision-tree and random-forest models.

    Each builder is called with the features and labels from
    ``data_preprocessing.data_preprocess()``.
    """
    features, labels = data_preprocessing.data_preprocess()
    builders = (linear_model_, dtreg, rnd_forest)
    linear_model, tree_model, forest_model = (
        build(features, labels) for build in builders
    )
    return linear_model, tree_model, forest_model
def ml(class_num, epochs, method, source_data, twitter_source, google_source, ig_source, judge=True, nan=True):
    """Run one classical model on the combined data and plot its confusion matrix.

    Args:
        class_num: Number of target classes. Used to label the confusion
            matrix (``'0'`` .. ``str(class_num - 1)``) and forwarded to
            ``xgboost``.
        epochs: Forwarded to ``xgboost`` as ``num``.
        method: ``"random_forest"`` or ``"decision_tree"``; any other value
            selects ``xgboost``.
        source_data: Forwarded to ``pie``.
        twitter_source: Forwarded to ``combine``.
        google_source: Forwarded to ``combine``.
        ig_source: Forwarded to ``combine``.
        judge: When truthy, the rows are split into train/test sets by
            release week and those sets are passed to the model function;
            otherwise ``0`` placeholders are passed. Also forwarded to the
            model function as ``judge``.
        nan: Forwarded to ``data_preprocess``.

    Returns:
        None. The confusion matrix is shown with ``plt.show()``.
    """
    tmp_data = pie(num=11, source_data=source_data, all_option=True)
    final_data = combine(tmp_data, twitter_source, google_source, ig_source)
    input_x, revised_y = data_preprocess(final_data, nan=nan)

    if judge:
        # '上映日期' is the release-date column. Rows whose week-of-year is a
        # multiple of 4 form the test set; every other row is training data.
        # NOTE(review): Series.dt.weekofyear was removed in pandas 2.0; on a
        # newer pandas this needs dt.isocalendar().week instead -- confirm
        # the pinned pandas version before changing it.
        release_week = pd.to_datetime(final_data['上映日期']).dt.weekofyear
        is_test = release_week % 4 == 0

        # Boolean masks replace the original row-by-row drop() loops.
        train_final_data = final_data[~is_test].reset_index(drop=True)
        test_final_data = final_data[is_test].reset_index(drop=True)
        print(len(train_final_data))
        print(len(test_final_data))

        X_train, y_train = data_preprocess(train_final_data, nan=nan)
        X_test, y_test = data_preprocess(test_final_data, nan=nan)
    else:
        # No explicit split: the model functions receive 0 placeholders.
        X_train = y_train = X_test = y_test = 0

    if method == "random_forest":
        y_test, ans_best = random_forest(input_x, revised_y, X_train=X_train, y_train=y_train,
                                         X_test=X_test, y_test=y_test, judge=judge)
    elif method == "decision_tree":
        y_test, ans_best = decision_tree(input_x, revised_y, X_train=X_train, y_train=y_train,
                                         X_test=X_test, y_test=y_test, judge=judge)
    else:
        y_test, ans_best = xgboost(input_x, revised_y, X_train=X_train, y_train=y_train,
                                   X_test=X_test, y_test=y_test, class_num=class_num,
                                   num=epochs, judge=judge)

    # Derive the labels from class_num so any class count works; previously
    # `classes` was left undefined unless class_num was exactly 2 or 4.
    classes = [str(label) for label in range(class_num)]

    np.set_printoptions(precision=2)
    plot_confusion_matrix(y_test, ans_best, classes=classes, normalize=False, title=None, cmap=plt.cm.Blues)
    plt.show()
def dl(source_data, twitter_source, google_source, ig_source, class_num, epochs, batch_size, optimizer, loss, judge=True, nan=True):
    """Prepare one-hot labels and standardised features, then call ``nn``.

    Args:
        source_data: Forwarded to ``pie``.
        twitter_source: Forwarded to ``combine``.
        google_source: Forwarded to ``combine``.
        ig_source: Forwarded to ``combine``.
        class_num: Forwarded to ``nn``.
        epochs: Forwarded to ``nn``.
        batch_size: Forwarded to ``nn``.
        optimizer: Forwarded to ``nn``.
        loss: Forwarded to ``nn``.
        judge: When truthy, rows are split into train/test sets by release
            week; otherwise a random 70/30 split with ``random_state=42`` is
            used and the last feature column is printed and then removed.
        nan: Forwarded to ``data_preprocess``.

    Returns:
        None.
    """
    tmp_data = pie(num=11, source_data=source_data, all_option=True)
    final_data = combine(tmp_data, twitter_source, google_source, ig_source)
    ohe = OneHotEncoder()

    if judge:
        # '上映日期' is the release-date column. Rows whose week-of-year is a
        # multiple of 4 form the test set; every other row is training data.
        # NOTE(review): Series.dt.weekofyear was removed in pandas 2.0; on a
        # newer pandas this needs dt.isocalendar().week instead -- confirm
        # the pinned pandas version before changing it.
        release_week = pd.to_datetime(final_data['上映日期']).dt.weekofyear
        is_test = release_week % 4 == 0

        # Boolean masks replace the original row-by-row drop() loops.
        train_final_data = final_data[~is_test].reset_index(drop=True)
        test_final_data = final_data[is_test].reset_index(drop=True)
        print(len(train_final_data))
        print(len(test_final_data))

        X_train, y_train = data_preprocess(train_final_data, nan=nan)
        X_test, y_test = data_preprocess(test_final_data, nan=nan)

        # Fit the encoder once on all labels so train and test share the same
        # one-hot columns. Re-fitting on the test labels alone (as before)
        # misaligns the columns whenever the two sets contain different classes.
        ohe.fit(np.concatenate([y_train, y_test]).reshape(-1, 1))
        y_train = ohe.transform(y_train.reshape(-1, 1)).toarray()
        y_test = ohe.transform(y_test.reshape(-1, 1)).toarray()

        input_dim = X_train.shape[1]
    else:
        input_x, revised_y = data_preprocess(final_data, nan=nan)
        revised_y = ohe.fit_transform(revised_y.reshape(-1, 1)).toarray()
        X_train, X_test, y_train, y_test = train_test_split(input_x, revised_y, test_size=0.3, random_state=42)

        # Print the sorted values of the last feature column of the test set,
        # then remove that column from both feature matrices.
        print("labels")
        check_list = sorted(X_test[:, input_x.shape[1] - 1])
        print(check_list)
        X_train = np.delete(X_train, -1, axis=1)
        X_test = np.delete(X_test, -1, axis=1)

        # NOTE(review): this still counts the column deleted just above, so it
        # is one larger than X_train.shape[1]. Kept as in the original; confirm
        # what nn() expects before changing it.
        input_dim = input_x.shape[1]

    # Standardise using statistics from the training set only.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    nn(X_train, X_test, y_train, y_test, class_num=class_num, input_dim=input_dim, epochs=epochs,
       batch_size=batch_size, optimizer=optimizer, loss=loss)
def load_data():
    """Return ``(x_train, y_train, x_test, oridatas)`` built from ``data_preprocess()``.

    The train and test frames are exposed through ``.values``, the sale
    prices are wrapped in ``np.array``, and the fourth value from
    ``data_preprocess()`` is passed through unchanged.
    """
    train_frame, test_frame, prices, oridatas = data_preprocess()
    features_train = train_frame.values
    features_test = test_frame.values
    return features_train, np.array(prices), features_test, oridatas