def main():
    """Benchmark ETL + single-GPU XGBoost training over quarterly perf files.

    Stage 1 (ETL): loads ``count_quarter_processing`` consecutive quarterly
    Fannie-Mae performance files via ``run_gpu_workflow`` and times the loop.
    Stage 2 (ML): converts the first Arrow table back to a GPU DataFrame,
    builds an XGBoost DMatrix on the ``delinquency_12`` target and trains a
    gpu_hist booster, timing the train step.

    Relies on module-level names defined elsewhere in this file:
    ``perf_data_path``, ``count_quarter_processing``, ``run_gpu_workflow``,
    ``DataFrame`` (cudf) and ``xgb`` (xgboost).
    """
    gpu_dfs = []
    perf_format_path = perf_data_path + "/Performance_%sQ%s.txt"

    # ETL stage ##############################################################
    time_ETL = time.time()
    for quarter in range(1, count_quarter_processing + 1):
        # Quarters are numbered 1..N across years: 1-4 -> year 2000 Q1-Q4,
        # 5-8 -> year 2001 Q1-Q4, ...  The previous mapping
        # (`2000 + quarter // 4`, `quarter % 4`) produced a nonexistent
        # "Q0" file every fourth quarter and never read Q4; this mapping
        # is identical for quarters 1-3 and correct from quarter 4 on.
        year = 2000 + (quarter - 1) // 4
        q = (quarter - 1) % 4 + 1
        file = perf_format_path % (str(year), str(q))
        gpu_dfs.append(
            run_gpu_workflow(year=year, quarter=q, perf_file=file))
    time_ETL_end = time.time()
    print("ETL time: ", time_ETL_end - time_ETL)
    ##########################################################################

    # Hyperparameters for the GPU booster; 'nround' is reused below as the
    # number of boosting rounds.
    dxgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': 1,
        # 'distributed_dask': True,
        'loss': 'ls',
        'objective': 'gpu:reg:linear',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    # ML stage: Arrow tables -> cudf DataFrames; train on the first quarter
    # only (the remaining frames are materialized but unused here).
    gpu_dfs = [DataFrame.from_arrow(gpu_df) for gpu_df in gpu_dfs]
    pd_df = gpu_dfs[0].to_pandas()
    y = pd_df["delinquency_12"]
    x = pd_df.drop(["delinquency_12"], axis=1)
    pd_df = xgb.DMatrix(x, y)
    bst = xgb.train(dxgb_gpu_params, pd_df,
                    num_boost_round=dxgb_gpu_params['nround'])
    time_ML_train_end = time.time()
    print("Machine learning - train: ", time_ML_train_end - time_ETL_end)
def test_datetime_to_arrow(dtype):
    """A datetime column must survive a to_arrow/from_arrow round trip.

    Builds one day of hourly timestamps, casts the column to ``dtype``,
    exports the frame to an Arrow table (without the index) and checks that
    re-importing yields an equal DataFrame.
    """
    base = cudf.datasets.timeseries(
        start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={})
    timestamp = base.reset_index()["timestamp"].reset_index(drop=True)
    gdf = DataFrame({"timestamp": timestamp.astype(dtype)})
    arrow_table = gdf.to_arrow(preserve_index=False)
    assert_eq(gdf, DataFrame.from_arrow(arrow_table))