def segmentation(self, folder_dataset, folder_segments, min_points=10):
    """
    Splits the raw GeoLife trajectories into labelled segments and
    saves every segment under the chosen folder.

    Only users that provide a "labels.txt" file (i.e. transportation
    mode information) are processed.

    Parameters
    ----------
    folder_dataset : str
        absolute path where the dataset is
    folder_segments : str
        absolute path where to save the segments
    min_points : int
        threshold forwarded to ``_get_segments``; segments with fewer
        points than this are ignored

    Returns
    -------
    no value
    """
    # one "labels.txt" path per user that has transportation labels
    label_users = get_files(folder_dataset, "labels.txt", True)
    total_users = len(label_users)

    for position, label_path in enumerate(label_users, start=1):
        user_folder = dirname(label_path)
        print("{:02d} of {} users -- processing user {}".format(
            position, total_users, user_folder))

        # label information for this user
        df_label = self._read_label(label_path)

        # every .plt trajectory file found for this user
        user_name = basename(user_folder)
        plt_files = get_files(user_folder, ".plt", True)

        for plt_path in plt_files:
            df_user = self._read_trajectory(plt_path)
            segments = self._get_segments(df_user, df_label, min_points)

            # each item is a (transportation mode, segment) pair
            for transport, trajectory in segments:
                self._save_segments(transport, trajectory,
                                    folder_segments, user_name)
def _get_data(self, transportation):
    """
    Reads the motion feature files and organizes them in a single
    dataset: X for features and y for labels.

    Parameters
    ----------
    transportation : list of str
        list of transportation mode names used for classification

    Returns
    -------
    X : pandas dataframe
        motion features of every transportation mode stacked row-wise,
        in the order given by `transportation`; each mode contributes
        the rows selected by ``[:self.n_samples]``
    y : pandas dataframe
        class labels, one row per row of X
    """
    motion = MotionDataset()

    # Collect the per-mode frames and concatenate once at the end:
    # calling pd.concat inside the loop re-copies everything that was
    # accumulated so far on every iteration.
    feature_frames = []
    label_frames = []

    for transport in transportation:
        file_name = transport + "*" + ".csv"
        path_transport = get_files(self.folder_features, file_name, True)

        feature_df = motion.build_dataset(self.motion_features,
                                          path_transport)
        feature_df = feature_df[:self.n_samples]
        feature_frames.append(feature_df)

        # one label per kept feature row
        label_frames.append(pd.DataFrame([transport] * len(feature_df)))

    if feature_frames:
        X = pd.concat(feature_frames, axis=0, ignore_index=True)
        y = pd.concat(label_frames, axis=0, ignore_index=True)
    else:
        # pd.concat raises on an empty list; keep returning empty frames
        X = pd.DataFrame()
        y = pd.DataFrame()

    print("#### Motion features size: {}".format(len(X.columns)))
    print("size", len(X), len(y))
    return X, y
def get_features(self, transportation, folder_segments, folder_features,
                 motion_features):
    """
    Gathers the segment files of the requested transportation modes,
    extracts the features of each one and saves the result to the
    chosen folder.

    Parameters
    ----------
    transportation : list of str
        transportation modes which we want to extract features
    folder_segments : str
        absolute path where the segments to extract features are
    folder_features : str
        the folder where to save the features
    motion_features : list of str
        forwarded to ``GeoLifeFeaturesExtraction.get_features``
        (presumably the motion features to compute -- confirm there)

    Returns
    -------
    no value
    """
    # paths of all segment files, grouped in the order of `transportation`
    segment_files = []
    for transport in transportation:
        segment_files.extend(
            get_files(folder_segments, transport + "*.csv", True))

    total = len(segment_files)
    print("Processing {} segments...".format(total))

    for position, segment in enumerate(segment_files, start=1):
        print("{} out of {} - {}".format(position, total, segment))

        df_features = GeoLifeFeaturesExtraction.get_features(
            segment, motion_features)

        # the segment path is handed over together with the features
        self._save_features(df_features, folder_features, segment)
def _save_segments(self, transport_name, segment, folder_segments,
                   user_name):
    """
    Saves one segment extracted from a trajectory as a CSV file at
    ``<folder_segments>/<user_name>/<transport_name>/<transport_name>_NNN.csv``.

    Parameters
    ----------
    transport_name : str
        the transportation used in the trajectory
    segment : pandas dataframe
        the segment to save
    folder_segments : str
        absolute path where to save the segments
    user_name : str
        the user's name

    Returns
    -------
    no value
    """
    user_dir = join(folder_segments, user_name)

    # NNN is one more than the number of entries get_files reports for
    # this transportation mode under the user's folder
    # (presumably the segments saved so far -- verify against get_files)
    already_saved = get_files(user_dir, transport_name, False)
    sequence = len(already_saved) + 1

    # trailing "" makes join() end the directory path with a separator
    target_dir = join(user_dir, transport_name, "")
    create_folder(target_dir)

    save_in = join(target_dir,
                   "{}_{:03d}.csv".format(transport_name, sequence))
    segment.to_csv(save_in, sep=",", header=True, index=None)
def _get_data(self, transportation, parameter):
    """
    Reads the OP transformation files together with the motion
    features and organizes them in a single dataset: X for features
    and y for labels.

    Parameters
    ----------
    transportation : list of str
        list of transportation mode names used for classification
    parameter : list of int
        the two OP parameters, unpacked as (D, tau)

    Returns
    -------
    X : pandas dataframe
        dataframe of features
    y : pandas dataframe
        class labels
    """
    D, tau = parameter

    # common tail of the OP file names, e.g. '_D3_t1.csv'
    op_values = "_D" + str(D) + "_t" + str(tau) + ".csv"
    # ex: 'distance_D3_t1.csv'
    features_name = [feature + op_values
                     for feature in self.motion_features]

    X = pd.DataFrame()
    y = pd.DataFrame()

    print("### OP features: {}".format(self.op_features))

    for transport in transportation:
        # ex: 'db/GeoLife/op_features/op_bus_distance_D3_t1.csv'
        # NOTE(review): the folder is prepended by plain string
        # concatenation, so presumably self.folder_op ends with a path
        # separator -- verify against the caller
        prefix = "op_" + transport + "_"
        op_files = [self.folder_op + prefix + name
                    for name in features_name]

        # OP columns of every motion feature, placed side by side
        op_block = pd.DataFrame()
        for op_path in op_files:
            op_csv = pd.read_csv(op_path, usecols=self.op_features)
            op_csv = op_csv[self.op_features]  # to assure column order
            op_block = pd.concat([op_block, op_csv],
                                 axis=1, ignore_index=True)

        # NOTE(review): dropna() can leave gaps in the row index and the
        # axis=1 concat below aligns rows on that index -- confirm this
        # is the intended pairing with the motion features
        op_block = op_block.dropna()

        # motion features of the same transportation mode
        motion = MotionDataset()
        file_name = transport + "*" + ".csv"
        path_transport = get_files(self.folder_features, file_name, True)
        feature_df = motion.build_dataset(self.motion_features,
                                          path_transport)

        per_transport = pd.concat([op_block, feature_df],
                                  axis=1, ignore_index=True)
        per_transport = per_transport[:self.n_samples]

        # features
        X = pd.concat([X, per_transport], axis=0, ignore_index=True)

        # labels: one per kept row
        op_class = pd.DataFrame([transport] * len(per_transport))
        y = pd.concat([y, op_class], axis=0, ignore_index=True)

    n = len(X.columns)
    print("#### OP features size: {}".format(n))
    print("size", len(X), len(y))
    return X, y
def get_transformation(self, parameters, motion_features, op_features,
                       transportation, folder_features, folder_op):
    """
    Calculates the OP transformation of each motion feature and saves
    one CSV per (D, tau, motion feature, transportation mode).

    A feature is stored as a column in the feature files, while the OP
    functions work on rows, so each feature is transposed before the
    transformation.

    Parameters
    ----------
    parameters : list of lists (int, int)
        OP parameters (D, tau) we want to extract
    motion_features : list of str
        list of motion features we want to transform
    op_features : list of str
        which features to extract from OP and OPTN transformation
    transportation : list of str
        list of transportation modes which will be transformed
    folder_features : str
        absolute path where the motion features that will be
        transformed are
    folder_op : str
        the folder handed to ``_save_op_transformation`` as destination

    Returns
    -------
    no value
    """
    for D, tau in parameters:
        print('\n for D = {} and tau = {} \n'.format(D, tau))

        for motion_feature in motion_features:
            for transport in transportation:
                user_files = get_files(folder_features,
                                       transport + "*.csv", True)

                # dict of lists: one list of values per OP feature
                collected = {name: [] for name in op_features}

                print("Motion Feature: {} and transportation: {}".format(
                    motion_feature, transport))

                for csv_path in user_files:
                    column = pd.read_csv(csv_path, sep=",", header=0,
                                         usecols=[motion_feature]).dropna()

                    # The file holds a column but the OP function takes a
                    # row, hence the transpose. The call answers with a
                    # list such as [(2.31, 0.23, 0.045)]; [0] unwraps the
                    # single tuple so it can be paired with op_features.
                    feature_list = self._op_multithread(
                        column.T, D, tau, op_features)[0]

                    for name, value in zip(op_features, feature_list):
                        collected[name].append(value)

                df_op = pd.DataFrame.from_dict(collected)

                # ex: 'op_bus_distance_D3_t1.csv'
                file_name = ("op_" + transport + "_" + motion_feature
                             + "_D" + str(D) + "_t" + str(tau) + ".csv")
                self._save_op_transformation(df_op, folder_op, file_name)