Example #1
def preprocessing_raw_csv(PATH=".//tcdata//hy_round2_train_20200225//",
                          local_file_name="train.pkl"):
    """Loading and processing all train csv data."""
    if PATH is None:
        raise ValueError("Invalid PATH !")
    file_names = sorted(os.listdir(PATH), key=lambda s: int(s.split(".")[0]))

    # Loading all trajectory data.
    traj_data = []
    for name in file_names:
        traj_data.append(pd.read_csv(PATH + name, encoding="utf-8"))

    # Processing each trajectory data.
    print("\n@Multi-processing RAW CSV started:")
    print("-----------------------------")
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(tqdm(p.imap(preprocessing_traj, traj_data),
                        total=len(traj_data)))
    print("-----------------------------")
    print("@Multi-processing RAW CSV ended, to the local file: {}.\n".format(
        local_file_name))
    traj_data = [item[0] for item in tmp]
    change_record = [item[1] for item in tmp]
    change_record = pd.DataFrame(change_record,
                                 columns=["speed_change", "coord_change"])

    # Save processed data to the local path in *.pkl format
    file_processor = LoadSave(PATH)
    file_processor.save_data(path=".//tcdata_tmp//{}".format(local_file_name),
                             data=traj_data)
    return change_record
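
These snippets all depend on a LoadSave helper that is never shown. A minimal sketch of a compatible class, assuming it is a thin pickle wrapper with an optional default path (an assumption, not the original implementation):

import os
import pickle


class LoadSave:
    """Sketch of the pickle-based load/save helper (assumed interface)."""

    def __init__(self, path=None):
        self.path = path

    def save_data(self, path=None, data=None):
        path = path or self.path
        # Create the target directory on demand before dumping
        if os.path.dirname(path):
            os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as f:
            pickle.dump(data, f)

    def load_data(self, path=None):
        path = path or self.path
        with open(path, "rb") as f:
            return pickle.load(f)
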
def traj_data_train_test_split(train_ratio=0.85):
    """Split the training data into training dataset and testing dataset, This
       is for the online docker docker evaluation testing.
    """
    PATH = ".//tcdata//hy_round2_train_20200225_local//"
    file_names = sorted(os.listdir(PATH), key=lambda s: int(s.split(".")[0]))

    print("\n@Read all raw traj data started at: {}".format(datetime.now()))
    print("-----------------------------")
    with mp.Pool(processes=mp.cpu_count()) as p:
        traj_data_total = list(tqdm(p.imap(read_traj, file_names),
                                    total=len(file_names)))
    print("-----------------------------")
    print("@End at: {}".format(datetime.now()))

    # Map the Chinese labels to numeric values
    str_to_label = {"刺网": 2, "围网": 1, "拖网": 0}
    target = [traj["type"].unique()[0] for traj in traj_data_total]
    target = np.array([str_to_label[i] for i in target])

    train_index, test_index = traj_data_train_test_index_generation(
        train_ratio=train_ratio, n_samples=len(traj_data_total),
        target=target, method="stratified")
    traj_data_train = [traj_data_total[i] for i in train_index]
    traj_data_train_fnames = [file_names[i] for i in train_index]
    traj_data_test = [traj_data_total[i] for i in test_index]
    traj_data_test_fnames = [file_names[i] for i in test_index]

    train_target_dist = [target[i] for i in train_index]
    test_target_dist = [target[i] for i in test_index]
    print("@Total target distributions: {}".format(
        np.bincount(target)/len(target)))
    print("@Train distributions: {}".format(
        np.bincount(train_target_dist)/len(traj_data_train)))
    print("@Test distributions: {}".format(
        np.bincount(test_target_dist)/len(traj_data_test)))

    TEST_TARGET_PATH = ".//tcdata_tmp//"
    boat_id = [int(file_names[i].split(".")[0]) for i in test_index]
    df = pd.DataFrame({"boat_id": boat_id, "target": test_target_dist})
    file_processor = LoadSave()
    file_processor.save_data(data=df, path=TEST_TARGET_PATH+"test_target.pkl")

    TRAIN_DATA_PATH = ".//tcdata//hy_round2_train_20200225//"
    file_names = os.listdir(TRAIN_DATA_PATH)
    if len(file_names) != 0:
        raise ValueError("The dir is not empty ! Please remove all file ~~")
    for df, name in zip(traj_data_train, traj_data_train_fnames):
        df.to_csv(TRAIN_DATA_PATH + name, index=False, encoding="utf-8")

    TEST_PATH = ".//tcdata//hy_round2_testA_20200225//"
    file_names = os.listdir(TEST_PATH)
    if len(file_names) != 0:
        raise ValueError("The dir is not empty ! Please remove all files ~~")
    for df, name in zip(traj_data_test, traj_data_test_fnames):
        df.to_csv(TEST_PATH + name, index=False, encoding="utf-8")

    return traj_data_train, traj_data_test
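
traj_data_train_test_index_generation is not shown either. A minimal sketch under the assumption that the "stratified" branch wraps scikit-learn's StratifiedShuffleSplit (random_state is a hypothetical choice):

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit


def traj_data_train_test_index_generation(train_ratio=0.85, n_samples=None,
                                          target=None, method="stratified"):
    """Sketch: generate stratified train/test indices; only the
    "stratified" branch used above is implemented."""
    if method != "stratified":
        raise NotImplementedError("Only the stratified split is sketched.")
    splitter = StratifiedShuffleSplit(n_splits=1, train_size=train_ratio,
                                      random_state=2020)
    # X is a dummy array; the splitter only needs n_samples and the labels
    train_index, test_index = next(splitter.split(np.zeros(n_samples), target))
    return train_index, test_index
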
def traj_data_signal_embedding():
    """Loading the embedding vectors."""
    file_processor = LoadSave()
    train_embedding = file_processor.load_data(
        path=".//tcdata_tmp//train_signal_embedding.pkl")
    test_embedding = file_processor.load_data(
        path=".//tcdata_tmp//test_signal_embedding.pkl")
    return pd.concat([train_embedding, test_embedding],
                     axis=0,
                     ignore_index=True)
def find_save_unique_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the unique boat visit count of each bin."""
    unique_boat_count_df = traj_data_df.groupby(["no_bin"])["boat_id"].nunique().reset_index()
    unique_boat_count_df.rename({"boat_id":"visit_boat_count"}, axis=1, inplace=True)

    unique_boat_count_df_save = pd.merge(bin_to_coord_df, unique_boat_count_df,
                                         on="no_bin", how="left")
    file_processor = LoadSave()
    file_processor.save_data(data=unique_boat_count_df_save,
                             path=".//tcdata_tmp//bin_unique_boat_count_frequency.pkl")
    return unique_boat_count_df
def find_save_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the visit frequency of each bin."""
    visit_count_df = traj_data_df.groupby(["no_bin"]).count().reset_index()
    visit_count_df = visit_count_df[["no_bin", "x"]]
    visit_count_df.rename({"x":"visit_count"}, axis=1, inplace=True)

    visit_count_df_save = pd.merge(bin_to_coord_df, visit_count_df, on="no_bin", how="left")
    file_processor = LoadSave()
    file_processor.save_data(data=visit_count_df_save,
                             path=".//tcdata_tmp//bin_visit_count_frequency.pkl")
    return visit_count_df
Example #6
def preprocessing_mnist(n_data_list=None):
    img_data = pd.read_csv("..//demo_dataset//mnist//train.csv", nrows=None)
    img_cols = [name for name in img_data.columns if "pixel" in name]
    img_data = img_data[img_cols].values / 255
    img_data_list = img_data.tolist()

    # Save the preprocessed data
    file_name = [".//data//mnist_{}.pkl".format(i) for i in n_data_list]
    file_processor = LoadSave()
    for ind, item in enumerate(n_data_list):
        tmp_img_data = img_data_list[:item]
        tmp_file_name = file_name[ind]
        file_processor.save_data(path=tmp_file_name, data=tmp_img_data)
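
A hypothetical call, caching truncated copies of the dataset at two sizes:

# Hypothetical usage: cache the first 1000 and 5000 MNIST images as pickles.
preprocessing_mnist(n_data_list=[1000, 5000])
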
def find_save_mean_stay_time_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the mean stay time of each bin."""
    mean_stay_time_df = traj_data_df.groupby(
        ["no_bin", "boat_id"])["time_array"].sum().reset_index()
    mean_stay_time_df.rename({"time_array":"total_stay_time"}, axis=1, inplace=True)
    mean_stay_time_df = mean_stay_time_df.groupby(
        ["no_bin"])["total_stay_time"].mean().reset_index()
    mean_stay_time_df.rename(
        {"total_stay_time":"mean_stay_time"}, axis=1, inplace=True)

    mean_stay_time_df_save = pd.merge(bin_to_coord_df, mean_stay_time_df,
                                      on="no_bin", how="left")
    file_processor = LoadSave()
    file_processor.save_data(data=mean_stay_time_df_save,
                             path=".//tcdata_tmp//bin_mean_stay_time.pkl")
    return mean_stay_time_df
def save_ais_traj_to_csv(ais=None,
                         round_to_print=50000,
                         local_file_name="ais.pkl"):
    """Save the trajectory according to the ais record with the csv format."""
    ais_id_list = ais["ais_id"].astype(int).values.tolist()

    # Split the DataFrame
    ais_traj_list = []
    head, tail = 0, 0
    print("\n@Split AIS and save the traj in *.csv format:")
    print("---------------------------------------")
    while tail < len(ais_id_list):
        if tail % round_to_print == 0:
            print("--Now tail is on {}, completed {:.2f}%.".format(
                tail, (tail + 1) / len(ais_id_list) * 100))
            print("--time is {}.\n".format(datetime.now()))
        if ais_id_list[head] == ais_id_list[tail]:
            tail += 1
        else:
            ais_traj_list.append(ais.iloc[head:tail])
            head = tail
    ais_traj_list.append(ais.iloc[head:])
    print("---------------------------------------")

    # Coordinate transformation for each trajectory
    print("\n@AIS trajectory preprocessing:")
    print("---------------------------------------")
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(
            tqdm(p.imap(preprocessing_traj, ais_traj_list),
                 total=len(ais_traj_list)))
    print("---------------------------------------")
    print("@Save to the local file: {}.\n".format(local_file_name))
    traj_data = [item[0] for item in tmp if len(item[0]) > 1]
    change_record = [item[1] for item in tmp]
    change_record = pd.DataFrame(change_record,
                                 columns=["speed_change", "coord_change"])

    # Save processed data to the local path in *.pkl format
    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//{}".format(local_file_name),
                             data=traj_data)
    return traj_data, change_record
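
The head/tail scan above emits one DataFrame per contiguous run of ais_id. For reference, a sketch of an equivalent split via groupby, under the same assumption that rows sharing an ais_id are contiguous:

def split_ais_by_id(ais):
    """Sketch: one trajectory per ais_id; sort=False keeps row order."""
    return [traj for _, traj in ais.groupby("ais_id", sort=False)]
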
def preparing_traj_data_corpus(bin_size=600):
    """Preparing the training corpus for the traj2vec model."""
    # Loading all the data
    train_data = load_data("train_semantic.pkl")
    test_data = load_data("test_semantic.pkl")
    ais_data = load_data("ais_semantic.pkl")

    train_concat = load_data("train_semantic_concat.pkl")
    test_concat = load_data("test_semantic_concat.pkl")

    # Compute the coordinate boundaries of the grid
    x_min = min(train_concat["x"].min(), test_concat["x"].min())
    x_max = max(train_concat["x"].max(), test_concat["x"].max())
    y_min = min(train_concat["y"].min(), test_concat["y"].min())
    y_max = max(train_concat["y"].max(), test_concat["y"].max())

    col_bins = int((x_max - x_min) / bin_size)
    row_bins = int((y_max - y_min) / bin_size)

    # Cut each trajectory into grid bins
    traj_total = train_data + test_data + ais_data

    # Multi-processing for loop.
    partial_work = partial(traj_to_bin,
                           col_bins=col_bins,
                           row_bins=row_bins,
                           x_min=x_min,
                           x_max=x_max,
                           y_min=y_min,
                           y_max=y_max)
    with mp.Pool(processes=mp.cpu_count()) as p:
        res = list(
            tqdm(p.imap(partial_work, traj_total), total=len(traj_total)))

    unique_words = [traj["no_bin"].nunique() for traj in res]
    print("\n@Cutting results basic stat:")
    print("-----------------------------")
    print("@Mean uniques: {:.5f}, max: {}, median: {:.5f}, std: {:.5f}".format(
        np.mean(unique_words), np.max(unique_words), np.median(unique_words),
        np.std(unique_words)))
    print("-----------------------------\n")
    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//traj_data_corpus.pkl",
                             data=res)
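
traj_to_bin is not shown. A minimal sketch, assuming it maps each (x, y) point to a rectangular grid cell and encodes the cell index as a string "word" in the no_bin column; the default bounds mirror the constants used in traj_data_poi_mining below, and the exact encoding in the original may differ:

import numpy as np


def traj_to_bin(traj, col_bins=None, row_bins=None,
                x_min=12031967.16239096, x_max=14226964.881853,
                y_min=1623579.449434373, y_max=4689471.1780792):
    """Sketch: grid a trajectory and store the cell id in no_bin."""
    traj = traj.copy()
    col = np.clip(((traj["x"] - x_min) / (x_max - x_min)
                   * col_bins).astype(int), 0, col_bins - 1)
    row = np.clip(((traj["y"] - y_min) / (y_max - y_min)
                   * row_bins).astype(int), 0, row_bins - 1)
    traj["no_bin"] = (row * col_bins + col).astype(str)
    return traj
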
Example #10
def preprocessing_HAR(n_data_list=None):
    file_processor = LoadSave()
    har_dataset, har_dataset_label = file_processor.load_data(
        path=
        "..//demo_dataset//human_activity_recognition//human_activity_recognition.pkl"
    )
    har_dataset_label = np.array(har_dataset_label)

    # Shuffle the dataset
    ind = np.random.choice(np.arange(0, len(har_dataset_label)),
                           size=len(har_dataset_label),
                           replace=False)
    har_dataset = har_dataset[ind]
    har_dataset_label = har_dataset_label[ind]

    for ind in range(len(n_data_list)):
        if n_data_list[ind] is None:
            n_data_list[ind] = len(har_dataset)

    file_name = [
        ".//data//human_activity_recognition_{}.pkl".format(i)
        for i in n_data_list
    ]
    file_processor = LoadSave()
    for ind, item in enumerate(n_data_list):
        tmp_data = har_dataset[:item]
        tmp_data_label = har_dataset_label[:item]

        tmp_file_name = file_name[ind]
        file_processor.save_data(path=tmp_file_name,
                                 data=[tmp_data, tmp_data_label])
Example #11
def preprocessing_turnout(n_data_list=None):
    file_processor = LoadSave()
    signal_data_list = []

    # Experiment fault data
    def liststr_to_listnumeric(list_str):
        return list(map(float, list_str.split(",")))

    signal_fault_data = pd.read_csv(
        "..//demo_dataset//turnout//fault_data.csv",
        nrows=None).query("error_code != 0").reset_index(drop=True)
    signal_fault_data["Phase_A"] = signal_fault_data["Phase_A"].apply(
        liststr_to_listnumeric)
    signal_fault_data["Phase_B"] = signal_fault_data["Phase_B"].apply(
        liststr_to_listnumeric)
    signal_fault_data["Phase_C"] = signal_fault_data["Phase_C"].apply(
        liststr_to_listnumeric)

    for i in range(len(signal_fault_data)):
        signal = [
            signal_fault_data["Phase_A"].iloc[i],
            signal_fault_data["Phase_B"].iloc[i],
            signal_fault_data["Phase_C"].iloc[i]
        ]
        signal_data_list.append(signal)

    # Operation fault data
    signal_data = file_processor.load_data(
        path="..//demo_dataset//turnout//chengdu5_raw_table.pkl")
    signal_anomaly_scores = file_processor.load_data(
        path="..//demo_dataset//turnout//chengdu5_anomaly_scores.pkl")
    signal_data = pd.merge(signal_data,
                           signal_anomaly_scores,
                           on=["device_id", "record_id"],
                           how="left")
    signal_data = signal_data.sort_values(
        by="if_score", ascending=False).reset_index(drop=True)

    for i in range(len(signal_data)):
        signal = [
            signal_data["phase_a"].iloc[i], signal_data["phase_b"].iloc[i],
            signal_data["phase_c"].iloc[i]
        ]
        signal_data_list.append(signal)

    # Save the preprocessed data
    for ind in range(len(n_data_list)):
        if n_data_list[ind] is None:
            n_data_list[ind] = len(signal_data_list)

    file_name = [
        ".//data//fault_turnout_current_{}.pkl".format(i) for i in n_data_list
    ]
    for ind, item in enumerate(n_data_list):
        tmp_signal_data = signal_data_list[:item]
        tmp_file_name = file_name[ind]
        file_processor.save_data(path=tmp_file_name, data=tmp_signal_data)
def traj_data_labeling_semantics():
    '''
    Step 1: Load all possible stop grids.
    '''
    traj_data_list, train_nums, test_nums = load_concat_train_test_ais()
    pois = poi_classification()

    '''
    Step 2: Find all candidate stop points.
    '''
    # Label all semantic points
    nn = NearestNeighbors(n_neighbors=1, radius=400)
    clf = nn.fit(pois[["x", "y"]].values)
    traj_data_semantic = label_traj_data_semantics(
        traj_data_list, clf, pois.drop(["x", "y"], axis=1))

    # Splitting the training and testing data
    train_data = traj_data_semantic[:train_nums]
    test_data = traj_data_semantic[train_nums:(train_nums+test_nums)]
    ais_data = traj_data_semantic[(train_nums+test_nums):]

    # Save all data and concat the training and testing data
    print("\n@Semantic labeling results:")
    print("-----------------------------")
    print("#training: {}, #testing: {}, #AIS: {}.".format(
          len(train_data), len(test_data), len(ais_data)))
    print("-----------------------------\n")

    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//train_semantic.pkl",
                             data=train_data)
    file_processor.save_data(path=".//tcdata_tmp//test_semantic.pkl",
                             data=test_data)
    file_processor.save_data(path=".//tcdata_tmp//ais_semantic.pkl",
                             data=ais_data)

    '''
    Step 3: Concat a list of traj data.
    '''
    concat_list_data(data_list=train_data,
                     local_file_name="train_semantic_concat.pkl")
    concat_list_data(data_list=test_data,
                     local_file_name="test_semantic_concat.pkl")
def traj_data_cbow_embedding_generating(embedding_size=70,
                                        iters=70,
                                        min_count=3,
                                        window_size=25,
                                        num_runs=1):
    traj_corpus = load_data("traj_data_corpus.pkl")
    train_nums = len(
        sorted(os.listdir(".//tcdata//hy_round2_train_20200225//"),
               key=lambda s: int(s.split(".")[0])))
    test_nums = len(
        sorted(os.listdir(".//tcdata//hy_round2_testA_20200225//"),
               key=lambda s: int(s.split(".")[0])))
    df_list, model_list = traj_cbow_embedding(traj_corpus,
                                              embedding_size=embedding_size,
                                              iters=iters,
                                              min_count=min_count,
                                              window_size=window_size,
                                              seed=9012,
                                              num_runs=num_runs,
                                              word_feat="no_bin")

    train_embedding_df_list = [
        df.iloc[:train_nums].reset_index(drop=True) for df in df_list
    ]
    test_embedding_df_list = [
        df.iloc[train_nums:(train_nums + test_nums)].reset_index(drop=True)
        for df in df_list
    ]

    file_processor = LoadSave()
    file_processor.save_data(
        path=".//tcdata_tmp//train_embedding_cbow_list.pkl",
        data=train_embedding_df_list)
    file_processor.save_data(
        path=".//tcdata_tmp//test_embedding_cbow_list.pkl",
        data=test_embedding_df_list)
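
traj_cbow_embedding is the core helper in this step and is also not shown. A minimal sketch, assuming it trains a gensim CBOW Word2Vec model on the word sequences and mean-pools the word vectors of each trajectory (gensim >= 4.0 argument names; workers=1 keeps the fixed seed deterministic):

import numpy as np
import pandas as pd
from gensim.models import Word2Vec


def traj_cbow_embedding(traj_corpus, embedding_size=70, iters=70, min_count=3,
                        window_size=25, seed=9012, num_runs=1,
                        word_feat="no_bin"):
    """Sketch: CBOW word2vec over trajectory "sentences", mean-pooled
    into one vector per trajectory (assumed interface)."""
    sentences = [traj[word_feat].astype(str).tolist() for traj in traj_corpus]
    df_list, model_list = [], []
    for run in range(num_runs):
        model = Word2Vec(sentences=sentences, vector_size=embedding_size,
                         window=window_size, min_count=min_count, sg=0,
                         seed=seed + run, epochs=iters, workers=1)
        # Mean-pool the vectors of each trajectory's surviving words
        embedding = np.zeros((len(sentences), embedding_size))
        for i, sent in enumerate(sentences):
            word_vecs = [model.wv[w] for w in sent if w in model.wv]
            if word_vecs:
                embedding[i] = np.mean(word_vecs, axis=0)
        df = pd.DataFrame(embedding, columns=[
            "embedding_{}".format(i) for i in range(embedding_size)])
        if "boat_id" in traj_corpus[0].columns:
            df["boat_id"] = [traj["boat_id"].iloc[0] for traj in traj_corpus]
        df_list.append(df)
        model_list.append(model)
    return df_list, model_list
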
Example #14
            tmp_df.rename(
                {
                    "DOTTING_TIME": "DOTTING_TIME_SHIFT",
                    "QUEUE_ID": "QUEUE_ID_SHIFT",
                    feat_name: target_feat_name
                },
                axis=1,
                inplace=True)
            target_df = pd.concat([target_df, tmp_df], axis=1)

            # Exclude invalid target values via a boolean mask (avoids
            # pandas chained-assignment issues)
            invalid_mask = (
                (target_df["QUEUE_ID"] != target_df["QUEUE_ID_SHIFT"])
                | (abs(target_df["DOTTING_TIME"]
                       - target_df["DOTTING_TIME_SHIFT"]) > tol_time_diff))
            target_df.loc[invalid_mask, target_feat_name] = np.nan

            # Drop tmp columns
            target_df.drop(["DOTTING_TIME_SHIFT", "QUEUE_ID_SHIFT"],
                           axis=1,
                           inplace=True)

    # Save feat engineering results
    # ----------------------------
    file_processor = LoadSave()
    total_results = [feat_df, target_df]
    file_processor.save_data(
        path=".//cached_data//{}.pkl".format("nn_dense_feat"),
        data=total_results)
def load_data(name=None):
    """Load data from .//tcdata_tmp//"""
    file_processor = LoadSave()
    data = file_processor.load_data(path=".//tcdata_tmp//" + name)
    return data
def traj_data_poi_mining(visit_count_minimum=200, visit_boat_minimum=3,
                         mean_stay_minutes=120, bin_size=800):
    '''
    Step 1: Find all possible stop grids.
    '''
    traj_data_list, train_nums, test_nums = load_concat_train_test_ais()

    print("\n@Step 1: traj2bin:")
    print("-----------------------------")
    col_bins = int((14226964.881853 - 12031967.16239096) / bin_size)
    row_bins = int((4689471.1780792 - 1623579.449434373) / bin_size)
    partial_work = partial(traj_to_bin, col_bins=col_bins, row_bins=row_bins)
    with mp.Pool(processes=mp.cpu_count()) as p:
        res = list(tqdm(p.imap(partial_work, traj_data_list),
                        total=len(traj_data_list)))
    print("-----------------------------")

    traj_data_df = [traj[["x", "y", "no_bin", "lon",
                          "lat", "boat_id", "time_array"]] for traj in res]
    traj_data_df = pd.concat(traj_data_df, axis=0, ignore_index=True)
    bin_to_coord_df = traj_data_df.groupby(
        ["no_bin"]).median().reset_index().drop(["boat_id"], axis=1)

    # DataFrame tmp for finding POIs
    visit_count_df = find_save_visit_count_table(
        traj_data_df, bin_to_coord_df)
    unique_boat_count_df = find_save_unique_visit_count_table(
        traj_data_df, bin_to_coord_df)
    mean_stay_time_df = find_save_mean_stay_time_table(
        traj_data_df, bin_to_coord_df)

    candidate_pois = visit_count_df.query(
        "visit_count >= {}".format(visit_count_minimum)).reset_index(drop=True)

    candidate_pois = pd.merge(
        candidate_pois, unique_boat_count_df, on="no_bin", how="left")
    candidate_pois = candidate_pois.query(
        "visit_boat_count >= {}".format(visit_boat_minimum)).reset_index(drop=True)

    candidate_pois = pd.merge(
        candidate_pois, mean_stay_time_df, on="no_bin", how="left")
    candidate_pois = candidate_pois.query(
        "mean_stay_time >= {}".format(mean_stay_minutes)).reset_index(drop=True)

    candidate_pois = pd.merge(
        candidate_pois, bin_to_coord_df, on="no_bin", how="left")
    candidate_pois.drop(["time_array"], axis=1, inplace=True)

    clf = DBSCAN(eps=1500, min_samples=200, n_jobs=-1, algorithm="kd_tree")
    candidate_pois["label"] = clf.fit_predict(candidate_pois[["x", "y"]].values,
        sample_weight=candidate_pois["visit_count"].values)
    pois = candidate_pois[candidate_pois["label"] != -1]
    pois.to_csv(".//tcdata_tmp//pois.csv", index=False)

    # Labeling fishing ground
    fishing_ground = load_fishing_ground()
    fishing_ground_polygons = fishing_ground["arr"].values.tolist()
    print("\n********************")
    print("@AIS preprocessing start at: {}".format(datetime.now()))
    print("********************")
    partial_work = partial(find_fishing_ground, poly_vert_list=fishing_ground_polygons)
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(tqdm(p.imap(partial_work, traj_data_list),
                        total=len(traj_data_list)))
    print("\n********************")
    print("@AIS preprocessing ended at: {}".format(datetime.now()))
    print("********************")
    traj_data_semantic = tmp

    # Splitting the training and testing data
    train_data = traj_data_semantic[:train_nums]
    test_data = traj_data_semantic[train_nums:(train_nums+test_nums)]
    ais_data = traj_data_semantic[(train_nums+test_nums):]

    # Save all data and concat the training and testing data
    print("\n@Semantic labeling results:")
    print("-----------------------------")
    print("#training: {}, #testing A: {}, #AIS: {}.".format(
          len(train_data), len(test_data), len(ais_data)))
    print("-----------------------------\n")

    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//train_semantic_tmp.pkl",
                             data=train_data)
    file_processor.save_data(path=".//tcdata_tmp//test_semantic_tmp.pkl",
                             data=test_data)
    file_processor.save_data(path=".//tcdata_tmp//ais_semantic_tmp.pkl",
                             data=ais_data)
    file_processor.save_data(path=".//tcdata_tmp//pois.pkl",
                             data=pois)

    return candidate_pois, pois
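
One detail worth noting: DBSCAN's sample_weight lets a heavily visited bin reach the min_samples threshold without needing 200 distinct neighboring bins. A toy illustration:

import numpy as np
from sklearn.cluster import DBSCAN

# A point whose weight reaches min_samples is a core sample on its own;
# the far-away second point stays noise (-1).
points = np.array([[0.0, 0.0], [5000.0, 5000.0]])
clf = DBSCAN(eps=1500, min_samples=200)
print(clf.fit_predict(points, sample_weight=[250, 10]))  # [0, -1]
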
Example #18
def load_data(path_name=None):
    """Loading *.pkl from path_name, path_name is like: .//data//mnist.pkl"""
    file_processor = LoadSave()
    return file_processor.load_data(path=path_name)
def load_fishing_ground():
    file_processor = LoadSave()
    data = file_processor.load_data(".//tcdata//fishing_ground.pkl")
    return data
def embedding_signal_sequence(speed_embedding=True,
                              dir_embedding=True,
                              speed_dir_embedding=True,
                              speed_filter_stops=False,
                              dir_filter_stops=True):
    """Training the signal embedding."""
    train_data, test_data = load_train(), load_test()

    traj_data_all = train_data + test_data
    train_nums = len(train_data)
    boat_id = [traj["boat_id"].unique()[0] for traj in traj_data_all]
    total_embedding = pd.DataFrame(boat_id, columns=["boat_id"])

    # Step 1: Construct the words
    traj_data_corpus = []
    for traj in traj_data_all:
        traj["speed_str"] = traj["speed"].apply(lambda x: str(int(x * 100)))
        traj["direction_str"] = traj["direction"].apply(str)
        if speed_filter_stops:
            traj.loc[traj["is_stop"] != -1, "speed_str"] = "0"
        if dir_filter_stops:
            traj.loc[traj["is_stop"] != -1, "direction_str"] = "0"

        traj["speed_dir_str"] = traj["speed_str"] + "_" + traj["direction_str"]
        traj_data_corpus.append(
            traj[["boat_id", "speed_str", "direction_str", "speed_dir_str"]])


#    traj_data_corpus = []
#    for traj in traj_data_all:
#        lon_val, lat_val = traj["lon"].values, traj["lat"].values
#        angle = get_angle_from_coordinate(lat_val[1:], lon_val[1:],
#                                          lat_val[:-1], lon_val[:-1]).tolist()
#        angle = [angle[0]] + angle
#
#        traj["speed_str"] = traj["speed"].apply(lambda x: str(int(x*100)))
#        traj["direction"] = angle
#        traj["direction_str"] = traj["direction"].apply(str)
#        if speed_filter_stops:
#            traj["speed_str"][traj["is_stop"] != -1] = "0"
#        if dir_filter_stops:
#            traj["direction_str"][traj["is_stop"] != -1] = "0"
#
#        traj["speed_dir_str"] = traj["speed_str"] + "_" + traj["direction_str"]
#        traj_data_corpus.append(traj[["boat_id", "speed_str",
#                                      "direction_str", "speed_dir_str"]])

    # Step 2: Training the speed embedding
    if speed_embedding:
        print("\n@Round 2 speed embedding:")
        print("-----------------------------")
        df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                                  embedding_size=10,
                                                  iters=40,
                                                  min_count=3,
                                                  window_size=25,
                                                  seed=9102,
                                                  num_runs=1,
                                                  word_feat="speed_str")
        speed_embedding = df_list[0].reset_index(drop=True)
        total_embedding = pd.merge(total_embedding,
                                   speed_embedding,
                                   on="boat_id",
                                   how="left")
        print("-----------------------------\n")

    # Step 3: Training the direction embedding
    if dir_embedding:
        print("\n@Round 2 direction embedding:")
        print("-----------------------------")
        df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                                  embedding_size=8,
                                                  iters=40,
                                                  min_count=3,
                                                  window_size=25,
                                                  seed=9102,
                                                  num_runs=1,
                                                  word_feat="direction_str")
        dir_embedding = df_list[0].reset_index(drop=True)
        total_embedding = pd.merge(total_embedding,
                                   dir_embedding,
                                   on="boat_id",
                                   how="left")
        print("-----------------------------\n")

    # Step 4: Training the speed-direction embedding
    if speed_dir_embedding:
        print("\n@Round 2 speed_dir embedding:")
        print("-----------------------------")
        df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                                  embedding_size=12,
                                                  iters=70,
                                                  min_count=3,
                                                  window_size=25,
                                                  seed=9102,
                                                  num_runs=1,
                                                  word_feat="speed_dir_str")
        speed_dir_embedding = df_list[0].reset_index(drop=True)
        total_embedding = pd.merge(total_embedding,
                                   speed_dir_embedding,
                                   on="boat_id",
                                   how="left")
        print("-----------------------------")

    # Step 5: Saving the embedding vectors
    train_embedding = total_embedding.iloc[:train_nums].reset_index(drop=True)
    test_embedding = total_embedding.iloc[train_nums:].reset_index(drop=True)

    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//train_signal_embedding.pkl",
                             data=train_embedding)
    file_processor.save_data(path=".//tcdata_tmp//test_signal_embedding.pkl",
                             data=test_embedding)
def load_pkl(file_name=None):
    """Loading *.pkl from the path .//cached_data//"""
    file_processor = LoadSave()
    return file_processor.load_data(
        path=".//cached_data//{}".format(file_name))
    """Load the original *.csv data."""
    total_name = path_name + file_name
    csv_data = pd.read_csv(total_name, nrows=nrows)
    return csv_data


if __name__ == "__main__":
    train_df = load_csv(file_name="train.csv",
                        path_name=".//data//",
                        nrows=None)
    test_df = load_csv(path_name=".//data//",
                       file_name="evaluation_public.csv",
                       nrows=None)
    total_df = pd.concat([train_df, test_df], axis=0)

    # Encoding category variables
    # --------------------------------
    cat_list = ["STATUS", "QUEUE_TYPE", "PLATFORM", "RESOURCE_TYPE"]
    for name in cat_list:
        total_df[name] = total_df[name].astype("category").cat.codes
    total_df.sort_values(by=["QUEUE_ID", "DOTTING_TIME"],
                         ascending=True,
                         inplace=True)
    total_df.reset_index(drop=True, inplace=True)

    # Save data to local path
    # --------------------------------
    file_processor = LoadSave()
    file_processor.save_data(path=".//cached_data//total_df.pkl",
                             data=total_df)
def stat_feature_engineering_xgb():
    train_data = load_data("train_semantic.pkl")
    test_data_a = load_data("test_semantic.pkl")
    train_nums = len(train_data)

    total_data = train_data + test_data_a
    boat_id = [traj["boat_id"].unique()[0] for traj in total_data]
    labels = [traj["type"].unique()[0] for traj in train_data]
    total_features = pd.DataFrame(None)
    total_features["boat_id"] = boat_id

    # Step 1: coordinate stat features.
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(
            tqdm(p.imap(traj_coord_stat, total_data), total=len(total_data)))
    coord_features = pd.concat(tmp, axis=0, ignore_index=True)
    coord_features["boat_id"] = boat_id
    total_features = pd.merge(total_features,
                              coord_features,
                              on="boat_id",
                              how="left")

    # Step 2: speed stat features.
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(
            tqdm(p.imap(traj_speed_stat, total_data), total=len(total_data)))
    speed_features = pd.concat(tmp, axis=0, ignore_index=True)
    speed_features["boat_id"] = boat_id
    total_features = pd.merge(total_features,
                              speed_features,
                              on="boat_id",
                              how="left")

    # Step 4: expert features.
    with mp.Pool(processes=mp.cpu_count()) as p:
        tmp = list(tqdm(p.imap(traj_expert, total_data),
                        total=len(total_data)))
    expert_features = pd.concat(tmp, axis=0, ignore_index=True)
    expert_features["boat_id"] = boat_id
    total_features = pd.merge(total_features,
                              expert_features,
                              on="boat_id",
                              how="left")

    # Step 5: Concat the speed_dir embedding vector
    dir_embedding = traj_data_direction_embedding(total_data,
                                                  embedding_size=8,
                                                  iters=70,
                                                  window_size=20,
                                                  min_count=3)
    total_features = pd.merge(total_features,
                              dir_embedding,
                              on="boat_id",
                              how="left")

    speed_embedding = traj_data_speed_embedding(total_data,
                                                embedding_size=10,
                                                iters=70,
                                                window_size=20,
                                                min_count=3)
    total_features = pd.merge(total_features,
                              speed_embedding,
                              on="boat_id",
                              how="left")

    #    # Step 7: speed tfidf
    #    speed_tfidf = traj_data_speed_tfidf(total_data, max_features=40)
    #    total_features = pd.merge(total_features, speed_tfidf, on="boat_id",
    #                              how="left")

    #    # Step 8: GEO tfidf
    #    bin_tfidf = traj_data_bin_tfidf(total_data, max_features=70)
    #    total_features = pd.merge(total_features, bin_tfidf, on="boat_id",
    #                              how="left")

    ##################################################
    train_feature = total_features.iloc[:train_nums].reset_index(
        drop=True).copy()
    test_feature = total_features.iloc[train_nums:].reset_index(
        drop=True).copy()
    train_feature["target"] = labels

    print("\n--Train samples: {}, testA samples: {}.".format(
        len(train_feature), len(test_feature)))
    print("--Train cols: {}, test cols: {}.".format(train_feature.shape[1],
                                                    test_feature.shape[1]))
    print("--Unique train cols: {}, unique testA cols: {}.\n".format(
        len(np.unique(train_feature.columns)),
        len(np.unique(test_feature.columns))))
    file_processor = LoadSave()
    file_processor.save_data(path=".//tcdata_tmp//train_feature_xgb.pkl",
                             data=train_feature)
    file_processor.save_data(path=".//tcdata_tmp//train_target.pkl",
                             data=train_feature[["boat_id", "target"]])
    file_processor.save_data(path=".//tcdata_tmp//test_feature_xgb.pkl",
                             data=test_feature)
    gc.collect()
Example #24
def load_data(name=None):
    """Loading *.pkl data from .//tcdata_tmp//"""
    assert name is not None, "Invalid file name!"
    file_processor = LoadSave()
    return file_processor.load_data(path=".//tcdata_tmp//{}".format(name))
Example #25
        search_res, acc_list = {}, []
        print("\n[INFO] DATASET NAME: {}".format(name))
        for ts_ind in tqdm(selected_ts_ind):
            ts_query = data_norm[ts_ind]
            search_res[ts_ind] = search_top_n_similar_ts(ts_query,
                                                         data_norm,
                                                         n=KEEP_TOP_N,
                                                         use_lb_kim=USE_LB_KIM)

            if CHECK_1NN_ACC:
                one_nn_label = data_label[search_res[ts_ind]
                                          ["top_n_searching_res"][0][1]]
                one_nn_dist = search_res[ts_ind]["top_n_searching_res"][0][0]

                true_label = data_label[ts_ind]
                acc_list.append(one_nn_label == true_label)

        # STEP 3: Save the SEARCH_TOP_K results in experiment_res
        experiment_total_res[name] = search_res
        if CHECK_1NN_ACC:
            print("\n[INFO] Mean 1-NN accuracy: {:.5f}.".format(
                np.mean(acc_list)))

    if SAVE_EXPERIMENT_RESULTS:
        file_processor = LoadSave()
        new_file_path = ".//data_tmp//{}_baseline_top_{}.pkl".format(
            TARGET_DATASET_NAME, KEEP_TOP_N)

        print("\n")
        file_processor.save_data(path=new_file_path, data=experiment_total_res)
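
search_top_n_similar_ts is not shown. A simplified stand-in using plain Euclidean distance; the original presumably uses DTW with the LB_Kim lower bound (the use_lb_kim flag) to prune candidates, and excludes the query's own 0-distance self-match before scoring 1-NN accuracy:

import numpy as np


def search_top_n_similar_ts(ts_query, data_norm, n=10, use_lb_kim=False):
    """Simplified stand-in: brute-force scan returning the n nearest
    series as (distance, index) pairs, matching the access pattern
    search_res[ts_ind]["top_n_searching_res"][k] above."""
    dist_ind = sorted(
        (float(np.linalg.norm(np.asarray(ts_query) - np.asarray(ts))), ind)
        for ind, ts in enumerate(data_norm))
    return {"top_n_searching_res": dist_ind[:n]}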