# Column layout of the raw traffic source files (index -> field name).
txt_col_name = [
    'day',               # 0
    'time',              # 1
    'direction',         # 2
    'road_type',         # 3
    'linkid',            # 4
    'length',            # 5
    'travel_time',       # 6
    'volumn',            # 7  (spelling kept as-is; matches the rest of the project)
    'speed',             # 8
    'occupancy',         # 9
    'congestion_level'   # 10
]

# Pause (and let the operator abort) if a previous pp1 output already exists.
dmc.check_file_and_pause(dmfp.pp1_train_data_path)

verbose = 1

# Accumulate the processed rows of every source file into one flat list.
train_data = []
# 0707 batch: comma-separated files.
for file in sd.train_0707:
    data_csv = pd.read_csv(file, header=None, sep=',',
                           usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    # Columns 0-9 are passed as data columns, column 10 separately —
    # presumably the congestion_level label; verify in ppf.process_source_file.
    train_data = ppf.process_source_file(data_csv, train_data, file,
                                         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 10,
                                         verbose)
# 0715 batch: read with sep='\t' — assumes these files are tab-separated,
# unlike the 0707 batch above. TODO confirm against the raw data.
for file in sd.train_0715:
    data_csv = pd.read_csv(file, header=None, sep='\t',
                           usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# Train a decision-tree classifier on the formatted (stage-4) training matrix.
import dm_source_data as sd
import pandas as pd
import dm_preprocess_fun as ppf
import dm_csv as dmcsv
import os
import dm_filepath as dmfp
import dm_common as dmc
import pickle
import time
from sklearn import tree
from sklearn.model_selection import cross_val_score

# Pause (and let the operator abort) if the expected input is missing.
dmc.check_file_and_pause( dmfp.pp4_format_train_path )

print("Reading Training data " + dmfp.pp4_format_train_path )
traincsv = pd.read_csv( dmfp.pp4_format_train_path, sep=',' )

print("Extract data and label")
# len(DataFrame) is the row count; the original went through
# traincsv.values.__len__(), which needlessly materializes the ndarray.
item_nr = len(traincsv)
# Columns 0-47 are the feature vector; 48-53 are the six label slots.
data = traincsv.iloc[0:item_nr, 0:48]
label = traincsv.iloc[0:item_nr, 48:54]

# using decision tree
print("Using Decision Tree Module")
# Hyper-parameter variants tried during tuning, kept for reference:
#clf = tree.DecisionTreeClassifier()
#clf = tree.DecisionTreeClassifier(criterion="entropy")
#clf = tree.DecisionTreeClassifier(min_samples_split=10)
#clf = tree.DecisionTreeClassifier(min_samples_split=20)
#clf = tree.DecisionTreeClassifier(max_depth=30)
#clf = tree.DecisionTreeClassifier(min_samples_split=20, max_depth=30)
#clf = tree.DecisionTreeClassifier(min_samples_leaf=10)
# Load the six pickled per-target classifiers and prepare prediction inputs.
import dm_source_data as sd
import pandas as pd
import dm_preprocess_fun as ppf
import dm_csv as dmcsv
import os
import dm_filepath as dmfp
import dm_common as dmc
import pickle
import dm_prediction_func as funs

# Pause (and let the operator abort) if required inputs are missing.
dmc.check_file_and_pause(dmfp.pp4_format_test_without_label_path)
dmc.check_file_and_pause(dmfp.pp2_linkid_map_path)

linkid_map_csv = pd.read_csv(dmfp.pp2_linkid_map_path, sep=',')
linkid_tcid = ppf.pp2_read_tcid_csv(linkid_map_csv)

verbose = 1

files = os.listdir(dmfp.training_modules_floder_path)
# One classifier slot per prediction target (indexed by predict_id 0-5).
clfs = [None, None, None, None, None, None]
for file in files:
    # Skip macOS metadata entries such as ".DS_Store".
    if "DS_Store" in file:
        continue
    words = file.split('.')
    # [0] modulename [1] predict_id [2] suffix
    # load module
    clf_file = os.path.join(dmfp.training_modules_floder_path, file)
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only safe because these model files are produced locally by training.
    # 'with' guarantees the handle is closed even if unpickling raises.
    with open(clf_file, "rb") as fd:
        clfs[int(words[1])] = pickle.load(fd)

# read prediction data
# Mapping of the reduced training columns back to the raw source columns
# ("<train idx> <- <raw idx>").
train_col_name = [
    'day',              # 0 <- 0
    'time',             # 1 <- 1
    'direction',        # 2 <- 2
    'linkid',           # 3 <- 4
    'travel_time',      # 4 <- 6
    'volumn',           # 5 <- 7
    'speed',            # 6 <- 8
    'occupancy',        # 7 <- 9
    'congestion_level'  # 8 <- 10
]
# Raw-file column indices that survive into the training layout above.
col_idx = [0, 1, 2, 4, 6, 7, 8, 9, 10]

# Pause (and let the operator abort) if the stage-2 maps are missing.
dmc.check_file_and_pause( dmfp.pp2_linkid_map_path )
dmc.check_file_and_pause( dmfp.pp2_direction_map_path )

# Fixed: added the separating space before the path (message previously
# ran the words together with the file name).
print("reading linkid map " + dmfp.pp2_linkid_map_path )
linkid_map_csv = pd.read_csv( dmfp.pp2_linkid_map_path, sep=',')
linkid_tcid = ppf.pp2_read_tcid_csv( linkid_map_csv )
linkid_dict = ppf.pp2_read_dict_csv( linkid_map_csv )

# Fixed: typo "dirction" -> "direction" and missing separator space.
print("reading direction map " + dmfp.pp2_direction_map_path )
direction_map_csv = pd.read_csv( dmfp.pp2_direction_map_path, sep=',')
direction_tcid = ppf.pp2_read_tcid_csv( direction_map_csv )
direction_dict = ppf.pp2_read_dict_csv( direction_map_csv )

dmc.check_file_and_pause( dmfp.all_in_one_file_path_old )
half_hour_data_old_csv = pd.read_csv(dmfp.all_in_one_file_path_old, sep=",")
# Stage-3 -> stage-4 formatting configuration and input checks.
import dm_source_data as sd
import pandas as pd
import dm_preprocess_fun as ppf
import dm_csv as dmcsv
import os
import dm_filepath as dmfp
import dm_common as dmc

# Pause (and let the operator abort) if any required input is missing.
dmc.check_file_and_pause(dmfp.pp3_train_data_folder)
dmc.check_file_and_pause(dmfp.pp3_test_data_folder)
dmc.check_file_and_pause(dmfp.pp2_linkid_map_path)

linkid_map_csv = pd.read_csv(dmfp.pp2_linkid_map_path, sep=',')
linkid_dict = ppf.pp2_read_dict_csv(linkid_map_csv)

# Column layout of the stage-3 training rows (index -> field name).
train_col_name = [
    'day',              # 0
    'time',             # 1
    'direction',        # 2
    'linkid',           # 3
    'travel_time',      # 4
    'volumn',           # 5
    'speed',            # 6
    'occupancy',        # 7
    'congestion_level'  # 8
]
# Column indices used as features when emitting the formatted matrix.
format_data_cols = [0, 1, 2, 3, 4, 5, 6, 7]
# Presumably the number of consecutive time slots packed per sample — TODO confirm
# against the formatting code that consumes these constants.
format_data_nr = 6
# Label source column (congestion_level) and number of label slots per sample.
format_label_col = 8
format_label_nr = 6
# Post-process the prediction result files in sorted (time) order.
import dm_source_data as sd
import pandas as pd
import dm_preprocess_fun as ppf
import dm_csv as dmcsv
import os
import dm_filepath as dmfp
import dm_common as dmc
import pickle
import dm_prediction_func as funs

# Pause (and let the operator abort) if the results folder is missing.
dmc.check_file_and_pause(dmfp.prediction_result_floder_path)

res_files = os.listdir(dmfp.prediction_result_floder_path)
res_files = sorted(res_files)
for file in res_files:
    # Skip hidden files such as ".DS_Store".
    if file[0] == '.':
        continue
    # File name is dot-separated with the start time in the third field —
    # presumably "<name>.<id>.<start_time>.<suffix>"; TODO confirm against
    # the code that writes these files.
    start_time = int(file.split('.')[2])
    csv = pd.read_csv(os.path.join(dmfp.prediction_result_floder_path, file), sep=",")
    # Result file column layout:
    # [0] linkid
    # [1] linkid's tag
    # [2] predicted value 1
    # [3] predicted value 2
    # [4] predicted value 3
    # [5] predicted value 4
    # [6] predicted value 5
    # [7] predicted value 6
# Statistics stage: verify inputs and load the stage-2 id maps.
import dm_source_data as sd
import pandas as pd
import dm_preprocess_fun as ppf
import dm_csv as dmcsv
import os
import dm_filepath as dmfp
import dm_common as dmc

# Pause (and let the operator abort) if any required input is missing.
dmc.check_file_and_pause(dmfp.sta_unknown_percentage_path)
dmc.check_file_and_pause(dmfp.pp2_train_data_path)
dmc.check_file_and_pause(dmfp.pp2_test_data_path)
dmc.check_file_and_pause(dmfp.pp2_direction_map_path)
dmc.check_file_and_pause(dmfp.pp2_linkid_map_path)

# Column layout of the stage-2 rows (index -> field name).
train_col_name = [
    'day',              # 0
    'time',             # 1
    'direction',        # 2
    'linkid',           # 3
    'travel_time',      # 4
    'volumn',           # 5
    'speed',            # 6
    'occupancy',        # 7
    'congestion_level'  # 8
]
# Indices of the columns looked up in the id maps below.
direction_col = 2
linkid_col = 3

direction_map_csv = pd.read_csv(dmfp.pp2_direction_map_path, sep=',')
direction_dict = ppf.pp2_read_dict_csv(direction_map_csv)