def preprocessing_raw_csv(PATH=".//tcdata//hy_round2_train_20200225//",
                          local_file_name="train.pkl"):
    """Load and preprocess all training csv data."""
    if PATH is None:
        raise ValueError("Invalid PATH!")
    file_names = sorted(os.listdir(PATH), key=lambda s: int(s.split(".")[0]))

    # Load all trajectory data.
    traj_data = []
    for name in file_names:
        traj_data.append(pd.read_csv(PATH + name, encoding="utf-8"))

    # Preprocess each trajectory.
    print("\n@Multi-processing RAW CSV started:")
    print("-----------------------------")
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(tqdm(p.imap(preprocessing_traj, traj_data),
                        total=len(traj_data)))
    print("-----------------------------")
    print("@Multi-processing RAW CSV ended, to the local file: {}.\n".format(
        local_file_name))

    traj_data = [item[0] for item in tmp]
    change_record = [item[1] for item in tmp]
    change_record = pd.DataFrame(change_record,
                                 columns=["speed_change", "coord_change"])

    # Save the processed data to the local path in *.pkl format.
    file_processor = LoadSave(PATH)
    file_processor.save_data(path=".//tcdata_tmp//{}".format(local_file_name),
                             data=traj_data)
    return change_record
def traj_data_train_test_split(train_ratio=0.85):
    """Split the training data into a training set and a testing set.

    This is used for the online docker evaluation.
    """
    PATH = ".//tcdata//hy_round2_train_20200225_local//"
    file_names = sorted(os.listdir(PATH), key=lambda s: int(s.split(".")[0]))

    print("\n@Read all raw traj data started at: {}".format(datetime.now()))
    print("-----------------------------")
    with mp.Pool(processes=mp.cpu_count()) as p:
        traj_data_total = list(tqdm(p.imap(read_traj, file_names),
                                    total=len(file_names)))
    print("-----------------------------")
    print("@End at: {}".format(datetime.now()))

    # Map the Chinese labels to numeric labels.
    str_to_label = {"刺网": 2, "围网": 1, "拖网": 0}
    target = [traj["type"].unique()[0] for traj in traj_data_total]
    target = np.array([str_to_label[i] for i in target])

    train_index, test_index = traj_data_train_test_index_generation(
        train_ratio=train_ratio, n_samples=len(traj_data_total),
        target=target, method="stratified")

    traj_data_train = [traj_data_total[i] for i in train_index]
    traj_data_train_fnames = [file_names[i] for i in train_index]
    traj_data_test = [traj_data_total[i] for i in test_index]
    traj_data_test_fnames = [file_names[i] for i in test_index]

    train_target_dist = [target[i] for i in train_index]
    test_target_dist = [target[i] for i in test_index]
    print("@Total target distributions: {}".format(
        np.bincount(target) / len(target)))
    print("@Train distributions: {}".format(
        np.bincount(train_target_dist) / len(traj_data_train)))
    print("@Test distributions: {}".format(
        np.bincount(test_target_dist) / len(traj_data_test)))

    TEST_TARGET_PATH = ".//tcdata_tmp//"
    boat_id = [int(file_names[i].split(".")[0]) for i in test_index]
    df = pd.DataFrame({"boat_id": boat_id, "target": test_target_dist})

    file_processor = LoadSave()
    file_processor.save_data(data=df,
                             path=TEST_TARGET_PATH + "test_target.pkl")

    TRAIN_DATA_PATH = ".//tcdata//hy_round2_train_20200225//"
    file_names = os.listdir(TRAIN_DATA_PATH)
    if len(file_names) != 0:
        raise ValueError("The dir is not empty! Please remove all files.")
    for df, name in zip(traj_data_train, traj_data_train_fnames):
        df.to_csv(TRAIN_DATA_PATH + name, index=False, encoding="utf-8")

    TEST_PATH = ".//tcdata//hy_round2_testA_20200225//"
    file_names = os.listdir(TEST_PATH)
    if len(file_names) != 0:
        raise ValueError("The dir is not empty! Please remove all files.")
    for df, name in zip(traj_data_test, traj_data_test_fnames):
        df.to_csv(TEST_PATH + name, index=False, encoding="utf-8")
    return traj_data_train, traj_data_test
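# traj_data_train_test_index_generation is not defined in this file. A minimal
# sketch of a stratified split with the same call signature is given below;
# the helper name, its "method" argument, and the random_state are assumptions
# based only on how the function is used above.
from sklearn.model_selection import StratifiedShuffleSplit


def traj_data_train_test_index_generation_sketch(train_ratio=0.85,
                                                 n_samples=None,
                                                 target=None,
                                                 method="stratified"):
    """Return (train_index, test_index) with the class ratios preserved."""
    if method != "stratified":
        raise NotImplementedError("Only the stratified split is sketched.")
    splitter = StratifiedShuffleSplit(n_splits=1, train_size=train_ratio,
                                      random_state=2020)
    train_index, test_index = next(
        splitter.split(np.zeros((n_samples, 1)), target))
    return train_index.tolist(), test_index.tolist()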
def traj_data_signal_embedding():
    """Load the signal embedding vectors."""
    file_processor = LoadSave()
    train_embedding = file_processor.load_data(
        path=".//tcdata_tmp//train_signal_embedding.pkl")
    test_embedding = file_processor.load_data(
        path=".//tcdata_tmp//test_signal_embedding.pkl")
    return pd.concat([train_embedding, test_embedding], axis=0,
                     ignore_index=True)
def find_save_unique_visit_count_table(traj_data_df=None,
                                       bin_to_coord_df=None):
    """Find and save the unique boat visit count of each bin."""
    unique_boat_count_df = traj_data_df.groupby(
        ["no_bin"])["boat_id"].nunique().reset_index()
    unique_boat_count_df.rename({"boat_id": "visit_boat_count"},
                                axis=1, inplace=True)

    unique_boat_count_df_save = pd.merge(bin_to_coord_df, unique_boat_count_df,
                                         on="no_bin", how="left")
    file_processor = LoadSave()
    file_processor.save_data(
        data=unique_boat_count_df_save,
        path=".//tcdata_tmp//bin_unique_boat_count_frequency.pkl")
    return unique_boat_count_df
def find_save_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the visit frequency of each bin."""
    visit_count_df = traj_data_df.groupby(["no_bin"]).count().reset_index()
    visit_count_df = visit_count_df[["no_bin", "x"]]
    visit_count_df.rename({"x": "visit_count"}, axis=1, inplace=True)

    visit_count_df_save = pd.merge(bin_to_coord_df, visit_count_df,
                                   on="no_bin", how="left")
    file_processor = LoadSave()
    file_processor.save_data(
        data=visit_count_df_save,
        path=".//tcdata_tmp//bin_visit_count_frequency.pkl")
    return visit_count_df
def preprocessing_mnist(n_data_list=None):
    """Preprocess the MNIST csv data and save subsets of different sizes."""
    img_data = pd.read_csv("..//demo_dataset//mnist//train.csv", nrows=None)
    img_cols = [name for name in img_data.columns if "pixel" in name]
    img_data = img_data[img_cols].values / 255
    img_data_list = img_data.tolist()

    # Save the preprocessed data.
    file_name = [".//data//mnist_{}.pkl".format(i) for i in n_data_list]
    file_processor = LoadSave()
    for ind, item in enumerate(n_data_list):
        tmp_img_data = img_data_list[:item]
        tmp_file_name = file_name[ind]
        file_processor.save_data(path=tmp_file_name, data=tmp_img_data)
def find_save_mean_stay_time_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the mean stay time of each bin."""
    mean_stay_time_df = traj_data_df.groupby(
        ["no_bin", "boat_id"])["time_array"].sum().reset_index()
    mean_stay_time_df.rename({"time_array": "total_stay_time"},
                             axis=1, inplace=True)
    mean_stay_time_df = mean_stay_time_df.groupby(
        ["no_bin"])["total_stay_time"].mean().reset_index()
    mean_stay_time_df.rename({"total_stay_time": "mean_stay_time"},
                             axis=1, inplace=True)

    mean_stay_time_df_save = pd.merge(bin_to_coord_df, mean_stay_time_df,
                                      on="no_bin", how="left")
    file_processor = LoadSave()
    file_processor.save_data(data=mean_stay_time_df_save,
                             path=".//tcdata_tmp//bin_mean_stay_time.pkl")
    return mean_stay_time_df
def save_ais_traj_to_csv(ais=None, round_to_print=50000,
                         local_file_name="ais.pkl"):
    """Split the AIS records into per-vessel trajectories and save them."""
    ais_id_list = ais["ais_id"].astype(int).values.tolist()

    # Split the DataFrame into one slice per ais_id.
    ais_traj_list = []
    head, tail = 0, 0
    print("\n@Split AIS and save the traj in *.csv format:")
    print("---------------------------------------")
    while tail <= (len(ais_id_list) - 1):
        if tail % round_to_print == 0:
            print("--Now tail is on {}, completed {:.2f}%.".format(
                tail, (tail + 1) / len(ais_id_list) * 100))
            print("--time is {}.\n".format(datetime.now()))
        if ais_id_list[head] == ais_id_list[tail]:
            tail += 1
        elif ais_id_list[head] != ais_id_list[tail]:
            ais_traj_list.append(ais.iloc[head:tail])
            head = tail
    ais_traj_list.append(ais.iloc[head:])
    print("---------------------------------------")

    # Coordinate transferring.
    # (Serial debug version, kept for reference:)
    # tmp = []
    # for i in range(50):
    #     tmp.append(preprocessing_traj(ais_traj_list[i]))
    print("\n@AIS list index resetting:")
    print("---------------------------------------")
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(tqdm(p.imap(preprocessing_traj, ais_traj_list),
                        total=len(ais_traj_list)))
    print("---------------------------------------")
    print("@Save to the local file: {}.\n".format(local_file_name))

    traj_data = [item[0] for item in tmp if len(item[0]) > 1]
    change_record = [item[1] for item in tmp]
    change_record = pd.DataFrame(change_record,
                                 columns=["speed_change", "coord_change"])

    # Save the processed data to the local path in *.pkl format.
    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//{}".format(local_file_name),
                             data=traj_data)
    return traj_data, change_record
def preparing_traj_data_corpus(bin_size=600):
    """Prepare the training corpus for the traj2vec model."""
    # Load all the data.
    train_data = load_data("train_semantic.pkl")
    test_data = load_data("test_semantic.pkl")
    ais_data = load_data("ais_semantic.pkl")
    train_concat = load_data("train_semantic_concat.pkl")
    test_concat = load_data("test_semantic_concat.pkl")

    # Compute the bounding box and the number of bins.
    x_min = min(train_concat["x"].min(), test_concat["x"].min())
    x_max = max(train_concat["x"].max(), test_concat["x"].max())
    y_min = min(train_concat["y"].min(), test_concat["y"].min())
    y_max = max(train_concat["y"].max(), test_concat["y"].max())
    col_bins = int((x_max - x_min) / bin_size)
    row_bins = int((y_max - y_min) / bin_size)

    # Cut each trajectory into bins (multi-processing).
    traj_total = train_data + test_data + ais_data
    partial_work = partial(traj_to_bin, col_bins=col_bins, row_bins=row_bins,
                           x_min=x_min, x_max=x_max, y_min=y_min, y_max=y_max)
    with mp.Pool(processes=mp.cpu_count()) as p:
        res = list(tqdm(p.imap(partial_work, traj_total),
                        total=len(traj_total)))

    unique_words = [traj["no_bin"].nunique() for traj in res]
    print("\n@Cutting results basic stat:")
    print("-----------------------------")
    print("@Mean uniques: {:.5f}, max: {}, median: {:.5f}, std: {:.5f}".format(
        np.mean(unique_words), np.max(unique_words),
        np.median(unique_words), np.std(unique_words)))
    print("-----------------------------\n")

    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//traj_data_corpus.pkl",
                             data=res)
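# traj_to_bin is defined elsewhere in the project. A minimal sketch of what it
# is assumed to do (map each (x, y) point onto a flat grid-cell token stored in
# a "no_bin" column) is shown below; the exact binning convention and the
# string token format are assumptions.
def traj_to_bin_sketch(traj, col_bins, row_bins, x_min, x_max, y_min, y_max):
    """Attach a grid-cell token ("no_bin") to every point of one trajectory."""
    traj = traj.copy()
    # Normalize to the bounding box, then compute integer row/column indices.
    col_ind = np.clip(
        ((traj["x"] - x_min) / (x_max - x_min) * col_bins).astype(int),
        0, col_bins - 1)
    row_ind = np.clip(
        ((traj["y"] - y_min) / (y_max - y_min) * row_bins).astype(int),
        0, row_bins - 1)
    # Flatten the (row, col) index into a single token per grid cell.
    traj["no_bin"] = (row_ind * col_bins + col_ind).astype(str)
    return traj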
def preprocessing_HAR(n_data_list=None):
    """Preprocess the human activity recognition dataset and save subsets."""
    file_processor = LoadSave()
    har_dataset, har_dataset_label = file_processor.load_data(
        path="..//demo_dataset//human_activity_recognition//human_activity_recognition.pkl")
    har_dataset_label = np.array(har_dataset_label)

    # Shuffle the dataset.
    ind = np.random.choice(np.arange(0, len(har_dataset_label)),
                           size=len(har_dataset_label), replace=False)
    har_dataset = har_dataset[ind]
    har_dataset_label = har_dataset_label[ind]

    for ind in range(len(n_data_list)):
        if n_data_list[ind] is None:
            n_data_list[ind] = len(har_dataset)

    file_name = [".//data//human_activity_recognition_{}.pkl".format(i)
                 for i in n_data_list]
    file_processor = LoadSave()
    for ind, item in enumerate(n_data_list):
        tmp_data = har_dataset[:item]
        tmp_data_label = har_dataset_label[:item]
        tmp_file_name = file_name[ind]
        file_processor.save_data(path=tmp_file_name,
                                 data=[tmp_data, tmp_data_label])
def preprocessing_turnout(n_data_list=None):
    """Preprocess the turnout current signals and save subsets."""
    file_processor = LoadSave()
    signal_data_list = []

    def liststr_to_listnumeric(list_str):
        return list(map(float, list_str.split(",")))

    # Experiment fault data.
    signal_fault_data = pd.read_csv(
        "..//demo_dataset//turnout//fault_data.csv",
        nrows=None).query("error_code != 0").reset_index(drop=True)
    signal_fault_data["Phase_A"] = signal_fault_data["Phase_A"].apply(
        liststr_to_listnumeric)
    signal_fault_data["Phase_B"] = signal_fault_data["Phase_B"].apply(
        liststr_to_listnumeric)
    signal_fault_data["Phase_C"] = signal_fault_data["Phase_C"].apply(
        liststr_to_listnumeric)

    for i in range(len(signal_fault_data)):
        signal = [signal_fault_data["Phase_A"].iloc[i],
                  signal_fault_data["Phase_B"].iloc[i],
                  signal_fault_data["Phase_C"].iloc[i]]
        signal_data_list.append(signal)

    # Operation fault data.
    signal_data = file_processor.load_data(
        path="..//demo_dataset//turnout//chengdu5_raw_table.pkl")
    signal_anomaly_scores = file_processor.load_data(
        path="..//demo_dataset//turnout//chengdu5_anomaly_scores.pkl")
    signal_data = pd.merge(signal_data, signal_anomaly_scores,
                           on=["device_id", "record_id"], how="left")
    signal_data = signal_data.sort_values(
        by="if_score", ascending=False).reset_index(drop=True)

    for i in range(len(signal_data)):
        signal = [signal_data["phase_a"].iloc[i],
                  signal_data["phase_b"].iloc[i],
                  signal_data["phase_c"].iloc[i]]
        signal_data_list.append(signal)

    # Save the preprocessed data.
    for ind in range(len(n_data_list)):
        if n_data_list[ind] is None:
            n_data_list[ind] = len(signal_data_list)
    file_name = [".//data//fault_turnout_current_{}.pkl".format(i)
                 for i in n_data_list]
    for ind, item in enumerate(n_data_list):
        tmp_signal_data = signal_data_list[:item]
        tmp_file_name = file_name[ind]
        file_processor.save_data(path=tmp_file_name, data=tmp_signal_data)
def traj_data_labeling_semantics():
    """Label each trajectory point with the semantics of nearby stop grids."""
    # Step 1: Load all possible stop grids.
    traj_data_list, train_nums, test_nums = load_concat_train_test_ais()
    pois = poi_classification()

    # Step 2: Find all candidate stop points and label the semantic points.
    nn = NearestNeighbors(n_neighbors=1, radius=400)
    clf = nn.fit(pois[["x", "y"]].values)
    traj_data_semantic = label_traj_data_semantics(
        traj_data_list, clf, pois.drop(["x", "y"], axis=1))

    # Splitting the training and testing data.
    train_data = traj_data_semantic[:train_nums]
    test_data = traj_data_semantic[train_nums:(train_nums + test_nums)]
    ais_data = traj_data_semantic[(train_nums + test_nums):]

    # Save all data and concat the training and testing data.
    print("\n@Semantic labeling results:")
    print("-----------------------------")
    print("#training: {}, #testing: {}, #AIS: {}.".format(
        len(train_data), len(test_data), len(ais_data)))
    print("-----------------------------\n")

    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//train_semantic.pkl",
                             data=train_data)
    file_processor.save_data(path=".//tcdata_tmp//test_semantic.pkl",
                             data=test_data)
    file_processor.save_data(path=".//tcdata_tmp//ais_semantic.pkl",
                             data=ais_data)

    # Step 3: Concat the lists of traj data.
    concat_list_data(data_list=train_data,
                     local_file_name="train_semantic_concat.pkl")
    concat_list_data(data_list=test_data,
                     local_file_name="test_semantic_concat.pkl")
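# label_traj_data_semantics is a project helper that is not shown in this file.
# A minimal sketch under the assumption that it attaches the label of the
# nearest POI to each point, but only when that POI lies within the 400 m
# radius used above; the helper name, the single-label-column POI metadata,
# and the "poi_label" output column are all assumptions.
def label_traj_data_semantics_sketch(traj_data_list, clf, poi_meta,
                                     radius=400):
    """Attach the nearest-POI label to every point within the given radius."""
    label_col = poi_meta.columns[0]
    labeled = []
    for traj in traj_data_list:
        traj = traj.copy()
        # Distance and index of the closest POI for every trajectory point.
        dist, ind = clf.kneighbors(traj[["x", "y"]].values, n_neighbors=1)
        traj["poi_label"] = poi_meta[label_col].values[ind[:, 0]]
        # Points farther than the radius keep no semantic label.
        traj.loc[dist[:, 0] > radius, "poi_label"] = np.nan
        labeled.append(traj)
    return labeled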
def traj_data_cbow_embedding_generating(embedding_size=70, iters=70,
                                        min_count=3, window_size=25,
                                        num_runs=1):
    """Train CBOW embeddings on the geo-bin corpus and save them."""
    traj_corpus = load_data("traj_data_corpus.pkl")
    train_nums = len(os.listdir(".//tcdata//hy_round2_train_20200225//"))
    test_nums = len(os.listdir(".//tcdata//hy_round2_testA_20200225//"))

    df_list, model_list = traj_cbow_embedding(traj_corpus,
                                              embedding_size=embedding_size,
                                              iters=iters,
                                              min_count=min_count,
                                              window_size=window_size,
                                              seed=9012,
                                              num_runs=num_runs,
                                              word_feat="no_bin")

    train_embedding_df_list = [df.iloc[:train_nums].reset_index(drop=True)
                               for df in df_list]
    test_embedding_df_list = [
        df.iloc[train_nums:(train_nums + test_nums)].reset_index(drop=True)
        for df in df_list]

    file_processor = LoadSave()
    file_processor.save_data(
        path=".//tcdata_tmp//train_embedding_cbow_list.pkl",
        data=train_embedding_df_list)
    file_processor.save_data(
        path=".//tcdata_tmp//test_embedding_cbow_list.pkl",
        data=test_embedding_df_list)
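# traj_cbow_embedding is defined in another module of this project. A minimal
# sketch of what it is assumed to do (train a gensim CBOW Word2Vec model on the
# word column and average the word vectors per trajectory) is given below.
# The gensim 4.x API is used; the embedding column names, the attached
# "boat_id" column, and the return format (one DataFrame per run plus the
# trained models) are assumptions inferred from how the function is consumed.
from gensim.models import Word2Vec


def traj_cbow_embedding_sketch(traj_corpus, embedding_size=70, iters=70,
                               min_count=3, window_size=25, seed=9012,
                               num_runs=1, word_feat="no_bin"):
    sentences = [traj[word_feat].astype(str).tolist() for traj in traj_corpus]
    df_list, model_list = [], []
    for run in range(num_runs):
        # sg=0 selects CBOW; workers=1 keeps the run reproducible with seed.
        model = Word2Vec(sentences=sentences, vector_size=embedding_size,
                         window=window_size, min_count=min_count,
                         sg=0, seed=seed + run, epochs=iters, workers=1)
        # One embedding per trajectory: mean of its in-vocabulary word vectors.
        embedding = []
        for sent in sentences:
            vecs = [model.wv[w] for w in sent if w in model.wv]
            embedding.append(np.mean(vecs, axis=0) if vecs
                             else np.zeros(embedding_size))
        df = pd.DataFrame(embedding,
                          columns=["{}_embedding_{}".format(word_feat, i)
                                   for i in range(embedding_size)])
        df["boat_id"] = [traj["boat_id"].unique()[0] for traj in traj_corpus]
        df_list.append(df)
        model_list.append(model)
    return df_list, model_list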
tmp_df.rename({"DOTTING_TIME": "DOTTING_TIME_SHIFT",
               "QUEUE_ID": "QUEUE_ID_SHIFT",
               feat_name: target_feat_name},
              axis=1, inplace=True)
target_df = pd.concat([target_df, tmp_df], axis=1)

# Exclude invalid target values (different queue or too large a time gap).
target_df.loc[target_df["QUEUE_ID"] != target_df["QUEUE_ID_SHIFT"],
              target_feat_name] = np.nan
target_df.loc[abs(target_df["DOTTING_TIME"]
                  - target_df["DOTTING_TIME_SHIFT"]) > tol_time_diff,
              target_feat_name] = np.nan

# Drop the temporary columns.
target_df.drop(["DOTTING_TIME_SHIFT", "QUEUE_ID_SHIFT"],
               axis=1, inplace=True)

# Save the feature engineering results.
# ----------------------------
file_processor = LoadSave()
total_results = [feat_df, target_df]
file_processor.save_data(
    path=".//cached_data//{}.pkl".format("nn_dense_feat"),
    data=total_results)
def load_data(name=None):
    """Load data from .//tcdata_tmp//."""
    file_processor = LoadSave()
    data = file_processor.load_data(path=".//tcdata_tmp//" + name)
    return data
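# LoadSave is a small project utility used throughout this file but not defined
# here. A minimal pickle-based sketch with the save_data/load_data interface
# assumed above is shown below; the optional constructor path is an assumption.
import pickle


class LoadSaveSketch:
    def __init__(self, path=None):
        self._path = path

    def save_data(self, path=None, data=None):
        """Pickle `data` to `path` (falls back to the constructor path)."""
        with open(path or self._path, "wb") as f:
            pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_data(self, path=None):
        """Unpickle and return the object stored at `path`."""
        with open(path or self._path, "rb") as f:
            return pickle.load(f)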
##################################################
train_feature = total_features.iloc[:train_nums].reset_index(
    drop=True).copy()
test_feature = total_features.iloc[train_nums:].reset_index(
    drop=True).copy()
train_feature["target"] = labels

print("\n-- Train samples: {}, testA samples: {}.".format(
    len(train_feature), len(test_feature)))
print("-- Train cols: {}, test cols: {}.".format(train_feature.shape[1],
                                                 test_feature.shape[1]))
print("-- Unique train cols: {}, unique testA cols: {}.\n".format(
    len(np.unique(train_feature.columns)),
    len(np.unique(test_feature.columns))))

file_processor = LoadSave()
file_processor.save_data(path=".//tcdata_tmp//train_feature_xgb.pkl",
                         data=train_feature)
file_processor.save_data(path=".//tcdata_tmp//train_target.pkl",
                         data=train_feature[["boat_id", "target"]])
file_processor.save_data(path=".//tcdata_tmp//test_feature_xgb.pkl",
                         data=test_feature)
gc.collect()

for embedding_id in [0, 1, 2]:
    xgb_res_list = xgb_clf_embedding_list_train(folds=5,
                                                id_list=[embedding_id],
                                                embedding_enable=True)
    df = training_res_to_log(training_res=xgb_res_list[0][0],
                             comment="xgb_{}".format(embedding_id))
def traj_data_poi_mining(visit_count_minimum=200, visit_boat_minimum=3,
                         mean_stay_minutes=120, bin_size=800):
    """Mine POIs from the trajectories and label the fishing ground."""
    # Step 1: Cut every trajectory into grid bins (traj2bin).
    traj_data_list, train_nums, test_nums = load_concat_train_test_ais()

    print("\n@Step 1: traj2bin:")
    print("-----------------------------")
    col_bins = int((14226964.881853 - 12031967.16239096) / bin_size)
    row_bins = int((4689471.1780792 - 1623579.449434373) / bin_size)

    partial_work = partial(traj_to_bin, col_bins=col_bins, row_bins=row_bins)
    with mp.Pool(processes=mp.cpu_count()) as p:
        res = list(tqdm(p.imap(partial_work, traj_data_list),
                        total=len(traj_data_list)))
    print("-----------------------------")

    traj_data_df = [traj[["x", "y", "no_bin", "lon", "lat",
                          "boat_id", "time_array"]] for traj in res]
    traj_data_df = pd.concat(traj_data_df, axis=0, ignore_index=True)
    bin_to_coord_df = traj_data_df.groupby(
        ["no_bin"]).median().reset_index().drop(["boat_id"], axis=1)

    # Step 2: Build the per-bin tables used for finding POIs.
    visit_count_df = find_save_visit_count_table(
        traj_data_df, bin_to_coord_df)
    unique_boat_count_df = find_save_unique_visit_count_table(
        traj_data_df, bin_to_coord_df)
    mean_stay_time_df = find_save_mean_stay_time_table(
        traj_data_df, bin_to_coord_df)

    # Step 3: Filter the candidate POI bins and cluster them with DBSCAN.
    candidate_pois = visit_count_df.query(
        "visit_count >= {}".format(visit_count_minimum)).reset_index(drop=True)
    candidate_pois = pd.merge(candidate_pois, unique_boat_count_df,
                              on="no_bin", how="left")
    candidate_pois = candidate_pois.query(
        "visit_boat_count >= {}".format(
            visit_boat_minimum)).reset_index(drop=True)
    candidate_pois = pd.merge(candidate_pois, mean_stay_time_df,
                              on="no_bin", how="left")
    candidate_pois = candidate_pois.query(
        "mean_stay_time >= {}".format(
            mean_stay_minutes)).reset_index(drop=True)
    candidate_pois = pd.merge(candidate_pois, bin_to_coord_df,
                              on="no_bin", how="left")
    candidate_pois.drop(["time_array"], axis=1, inplace=True)

    clf = DBSCAN(eps=1500, min_samples=200, n_jobs=-1, algorithm="kd_tree")
    candidate_pois["label"] = clf.fit_predict(
        candidate_pois[["x", "y"]].values,
        sample_weight=candidate_pois["visit_count"].values)
    pois = candidate_pois[candidate_pois["label"] != -1]
    pois.to_csv(".//tcdata_tmp//pois.csv", index=False)

    # Step 4: Label the fishing ground.
    fishing_ground = load_fishing_ground()
    fishing_ground_polygons = fishing_ground["arr"].values.tolist()

    print("\n********************")
    print("@AIS preprocessing start at: {}".format(datetime.now()))
    print("********************")
    partial_work = partial(find_fishing_ground,
                           poly_vert_list=fishing_ground_polygons)
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(tqdm(p.imap(partial_work, traj_data_list),
                        total=len(traj_data_list)))
    print("\n********************")
    print("@AIS preprocessing ended at: {}".format(datetime.now()))
    print("********************")
    traj_data_semantic = tmp

    # Splitting the training and testing data.
    train_data = traj_data_semantic[:train_nums]
    test_data = traj_data_semantic[train_nums:(train_nums + test_nums)]
    ais_data = traj_data_semantic[(train_nums + test_nums):]

    # Save all data and concat the training and testing data.
    print("\n@Semantic labeling results:")
    print("-----------------------------")
    print("#training: {}, #testing A: {}, #AIS: {}.".format(
        len(train_data), len(test_data), len(ais_data)))
    print("-----------------------------\n")

    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//train_semantic_tmp.pkl",
                             data=train_data)
    file_processor.save_data(path=".//tcdata_tmp//test_semantic_tmp.pkl",
                             data=test_data)
    file_processor.save_data(path=".//tcdata_tmp//ais_semantic_tmp.pkl",
                             data=ais_data)
    file_processor.save_data(path=".//tcdata_tmp//pois.pkl", data=pois)
    return candidate_pois, pois
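# find_fishing_ground is another project helper not listed here. A minimal
# point-in-polygon sketch with the same call pattern is shown below, using
# matplotlib.path.Path. It assumes the polygons are given in the same projected
# x/y coordinates as the trajectory columns, and the output flag column name
# "is_fishing_ground" is an assumption.
from matplotlib.path import Path


def find_fishing_ground_sketch(traj, poly_vert_list=None):
    """Flag every point that falls inside any fishing-ground polygon."""
    traj = traj.copy()
    points = traj[["x", "y"]].values
    inside = np.zeros(len(traj), dtype=bool)
    for poly_verts in poly_vert_list:
        inside |= Path(poly_verts).contains_points(points)
    traj["is_fishing_ground"] = inside.astype(int)
    return traj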
def load_data(path_name=None):
    """Load a *.pkl file from path_name, e.g. .//data//mnist.pkl."""
    file_processor = LoadSave()
    return file_processor.load_data(path=path_name)
def load_fishing_ground():
    """Load the fishing ground polygons."""
    file_processor = LoadSave()
    data = file_processor.load_data(".//tcdata//fishing_ground.pkl")
    return data
def embedding_signal_sequence(speed_embedding=True, dir_embedding=True,
                              speed_dir_embedding=True,
                              speed_filter_stops=False,
                              dir_filter_stops=True):
    """Train the signal (speed/direction) embeddings."""
    train_data, test_data = load_train(), load_test()
    traj_data_all = train_data + test_data
    train_nums = len(train_data)

    boat_id = [traj["boat_id"].unique()[0] for traj in traj_data_all]
    total_embedding = pd.DataFrame(boat_id, columns=["boat_id"])

    # Step 1: Construct the words.
    traj_data_corpus = []
    for traj in traj_data_all:
        traj["speed_str"] = traj["speed"].apply(lambda x: str(int(x * 100)))
        traj["direction_str"] = traj["direction"].apply(str)
        if speed_filter_stops:
            traj.loc[traj["is_stop"] != -1, "speed_str"] = "0"
        if dir_filter_stops:
            traj.loc[traj["is_stop"] != -1, "direction_str"] = "0"
        traj["speed_dir_str"] = traj["speed_str"] + "_" + traj["direction_str"]
        traj_data_corpus.append(
            traj[["boat_id", "speed_str", "direction_str", "speed_dir_str"]])

    # Alternative word construction based on the heading angle computed from
    # the coordinates (kept for reference, currently disabled):
    # traj_data_corpus = []
    # for traj in traj_data_all:
    #     lon_val, lat_val = traj["lon"].values, traj["lat"].values
    #     angle = get_angle_from_coordinate(lat_val[1:], lon_val[1:],
    #                                       lat_val[:-1], lon_val[:-1]).tolist()
    #     angle = [angle[0]] + angle
    #     traj["speed_str"] = traj["speed"].apply(lambda x: str(int(x * 100)))
    #     traj["direction"] = angle
    #     traj["direction_str"] = traj["direction"].apply(str)
    #     if speed_filter_stops:
    #         traj.loc[traj["is_stop"] != -1, "speed_str"] = "0"
    #     if dir_filter_stops:
    #         traj.loc[traj["is_stop"] != -1, "direction_str"] = "0"
    #     traj["speed_dir_str"] = traj["speed_str"] + "_" + traj["direction_str"]
    #     traj_data_corpus.append(traj[["boat_id", "speed_str",
    #                                   "direction_str", "speed_dir_str"]])

    # Step 2: Train the speed embedding.
    if speed_embedding:
        print("\n@Round 2 speed embedding:")
        print("-----------------------------")
        df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                                  embedding_size=10,
                                                  iters=40, min_count=3,
                                                  window_size=25, seed=9102,
                                                  num_runs=1,
                                                  word_feat="speed_str")
        speed_embedding = df_list[0].reset_index(drop=True)
        total_embedding = pd.merge(total_embedding, speed_embedding,
                                   on="boat_id", how="left")
        print("-----------------------------\n")

    # Step 3: Train the direction embedding.
    if dir_embedding:
        print("\n@Round 2 direction embedding:")
        print("-----------------------------")
        df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                                  embedding_size=8,
                                                  iters=40, min_count=3,
                                                  window_size=25, seed=9102,
                                                  num_runs=1,
                                                  word_feat="direction_str")
        dir_embedding = df_list[0].reset_index(drop=True)
        total_embedding = pd.merge(total_embedding, dir_embedding,
                                   on="boat_id", how="left")
        print("-----------------------------\n")

    # Step 4: Train the speed-direction embedding.
    if speed_dir_embedding:
        print("\n@Round 2 speed_dir embedding:")
        print("-----------------------------")
        df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                                  embedding_size=12,
                                                  iters=70, min_count=3,
                                                  window_size=25, seed=9102,
                                                  num_runs=1,
                                                  word_feat="speed_dir_str")
        speed_dir_embedding = df_list[0].reset_index(drop=True)
        total_embedding = pd.merge(total_embedding, speed_dir_embedding,
                                   on="boat_id", how="left")
        print("-----------------------------")

    # Step 5: Save the embedding vectors.
    train_embedding = total_embedding.iloc[:train_nums].reset_index(drop=True)
    test_embedding = total_embedding.iloc[train_nums:].reset_index(drop=True)

    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//train_signal_embedding.pkl",
                             data=train_embedding)
    file_processor.save_data(path=".//tcdata_tmp//test_signal_embedding.pkl",
                             data=test_embedding)
def load_pkl(file_name=None):
    """Load a *.pkl file from the path .//cached_data//."""
    file_processor = LoadSave()
    return file_processor.load_data(
        path=".//cached_data//{}".format(file_name))
"""Load the original *.csv data.""" total_name = path_name + file_name csv_data = pd.read_csv(total_name, nrows=nrows) return csv_data if __name__ == "__main__": train_df = load_csv(file_name="train.csv", path_name=".//data//", nrows=None) test_df = load_csv(path_name=".//data//", file_name="evaluation_public.csv", nrows=None) total_df = pd.concat([train_df, test_df], axis=0) # Encoding category variables # -------------------------------- cat_list = ["STATUS", "QUEUE_TYPE", "PLATFORM", "RESOURCE_TYPE"] for name in cat_list: total_df[name] = total_df[name].astype("category").cat.codes total_df.sort_values(by=["QUEUE_ID", "DOTTING_TIME"], ascending=True, inplace=True) total_df.reset_index(drop=True, inplace=True) # Save data to local path # -------------------------------- file_processor = LoadSave() file_processor.save_data(path=".//cached_data//total_df.pkl", data=total_df)
def stat_feature_engineering_xgb():
    """Build the statistical features for the xgb classifier."""
    train_data = load_data("train_semantic.pkl")
    test_data_a = load_data("test_semantic.pkl")
    train_nums = len(train_data)
    total_data = train_data + test_data_a

    boat_id = [traj["boat_id"].unique()[0] for traj in total_data]
    labels = [traj["type"].unique()[0] for traj in train_data]
    total_features = pd.DataFrame(None)
    total_features["boat_id"] = boat_id

    # Step 1: coordinate stat features.
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(tqdm(p.imap(traj_coord_stat, total_data),
                        total=len(total_data)))
    coord_features = pd.concat(tmp, axis=0, ignore_index=True)
    coord_features["boat_id"] = boat_id
    total_features = pd.merge(total_features, coord_features,
                              on="boat_id", how="left")

    # Step 2: speed stat features.
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(tqdm(p.imap(traj_speed_stat, total_data),
                        total=len(total_data)))
    speed_features = pd.concat(tmp, axis=0, ignore_index=True)
    speed_features["boat_id"] = boat_id
    total_features = pd.merge(total_features, speed_features,
                              on="boat_id", how="left")

    # Step 3: expert features.
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(tqdm(p.imap(traj_expert, total_data),
                        total=len(total_data)))
    expert_features = pd.concat(tmp, axis=0, ignore_index=True)
    expert_features["boat_id"] = boat_id
    total_features = pd.merge(total_features, expert_features,
                              on="boat_id", how="left")

    # Step 4: direction and speed embedding vectors.
    dir_embedding = traj_data_direction_embedding(total_data,
                                                  embedding_size=8,
                                                  iters=70, window_size=20,
                                                  min_count=3)
    total_features = pd.merge(total_features, dir_embedding,
                              on="boat_id", how="left")

    speed_embedding = traj_data_speed_embedding(total_data,
                                                embedding_size=10,
                                                iters=70, window_size=20,
                                                min_count=3)
    total_features = pd.merge(total_features, speed_embedding,
                              on="boat_id", how="left")

    # Optional tf-idf features (currently disabled):
    # speed_tfidf = traj_data_speed_tfidf(total_data, max_features=40)
    # total_features = pd.merge(total_features, speed_tfidf, on="boat_id",
    #                           how="left")
    # bin_tfidf = traj_data_bin_tfidf(total_data, max_features=70)
    # total_features = pd.merge(total_features, bin_tfidf, on="boat_id",
    #                           how="left")

    ##################################################
    train_feature = total_features.iloc[:train_nums].reset_index(
        drop=True).copy()
    test_feature = total_features.iloc[train_nums:].reset_index(
        drop=True).copy()
    train_feature["target"] = labels

    print("\n--Train samples: {}, testA samples: {}.".format(
        len(train_feature), len(test_feature)))
    print("--Train cols: {}, test cols: {}.".format(train_feature.shape[1],
                                                    test_feature.shape[1]))
    print("--Unique train cols: {}, unique testA cols: {}.\n".format(
        len(np.unique(train_feature.columns)),
        len(np.unique(test_feature.columns))))

    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//train_feature_xgb.pkl",
                             data=train_feature)
    file_processor.save_data(path=".//tcdata_tmp//train_target.pkl",
                             data=train_feature[["boat_id", "target"]])
    file_processor.save_data(path=".//tcdata_tmp//test_feature_xgb.pkl",
                             data=test_feature)
    gc.collect()
def load_data(name=None):
    """Load *.pkl data from .//tcdata_tmp//."""
    assert name is not None, "Invalid file name!"
    file_processor = LoadSave()
    return file_processor.load_data(path=".//tcdata_tmp//{}".format(name))
search_res, acc_list = {}, []
print("\n[INFO] DATASET NAME: {}".format(name))
for ts_ind in tqdm(selected_ts_ind):
    ts_query = data_norm[ts_ind]
    search_res[ts_ind] = search_top_n_similar_ts(ts_query, data_norm,
                                                 n=KEEP_TOP_N,
                                                 use_lb_kim=USE_LB_KIM)
    if CHECK_1NN_ACC:
        one_nn_label = data_label[
            search_res[ts_ind]["top_n_searching_res"][0][1]]
        one_nn_dist = search_res[ts_ind]["top_n_searching_res"][0][0]
        true_label = data_label[ts_ind]
        acc_list.append(one_nn_label == true_label)

# STEP 3: Save the SEARCH_TOP_K results in experiment_res.
experiment_total_res[name] = search_res

if CHECK_1NN_ACC:
    print("\n[INFO] Mean 1-NN accuracy: {:.5f}.".format(np.mean(acc_list)))

if SAVE_EXPERIMENT_RESULTS:
    file_processor = LoadSave()
    new_file_path = ".//data_tmp//{}_baseline_top_{}.pkl".format(
        TARGET_DATASET_NAME, KEEP_TOP_N)
    print("\n")
    file_processor.save_data(path=new_file_path, data=experiment_total_res)
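# search_top_n_similar_ts (with its optional LB_Kim pruning) is defined in a
# separate module. A minimal sketch of the simplified LB_Kim lower bound that
# such a DTW search typically uses is given below; the function name and the
# squared-distance convention are assumptions, not the project's exact code.
def lb_kim_sketch(ts_query, ts_candidate):
    """Cheap DTW lower bound from the first and last points of both series."""
    # DTW always aligns first with first and last with last, so this sum can
    # never exceed the full DTW cost (for series of length >= 2).
    return ((ts_query[0] - ts_candidate[0]) ** 2
            + (ts_query[-1] - ts_candidate[-1]) ** 2)


# Candidates whose lb_kim_sketch value already exceeds the best DTW distance
# found so far can be skipped without computing the full DTW alignment.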