def encode(self, uid, trajectories, negative_sample=None):
    """Encode one user's trajectories into model-ready traces.

    Each location is mapped to a dense integer id (allocated on first sight
    through ``self.location2id`` / ``self.loc_id``) and each record's time is
    turned into an integer offset from a per-trajectory base time. The last
    record of every trajectory is used as the prediction target.

    Args:
        uid: ignored; the user is renumbered with the running counter
            ``self.uid``, which is then incremented.
        trajectories: iterable of trajectories. Each trajectory is a sequence
            of records from which index 2 is read as the time and index 4 as
            the location key (the location must be a key of
            ``self.geo_coord``).
        negative_sample: unused by this encoder.

    Returns:
        list: one entry per trajectory, each of the form
        ``[loc_ids, time_codes, target_loc_id, target_time_code, uid, dist]``
        where ``loc_ids`` / ``time_codes`` exclude the target record and
        ``dist`` is ``euclidean_dist`` between the target coordinates and the
        coordinates of every preceding record.

    Side effects:
        Updates ``self.uid``, ``self.location2id``, ``self.loc_id`` and
        ``self.tim_max`` (largest time code seen so far).
    """
    # re-encode the uid directly with the running counter
    uid = self.uid
    self.uid += 1

    encoded_trajectories = []
    for traj in trajectories:
        loc_ids = []
        time_codes = []
        latitudes = []
        longitudes = []
        raw_locations = []

        # base for the time offsets, derived from the first record's time
        # (presumably midnight of that day -- confirm against cal_basetime)
        first_time = parse_time(traj[0][2])
        base_time = cal_basetime(first_time, True)

        for point in traj:
            loc = point[4]
            now_time = parse_time(point[2])
            if loc not in self.location2id:
                self.location2id[loc] = self.loc_id
                self.loc_id += 1
            raw_locations.append(loc)
            loc_ids.append(self.location2id[loc])
            latitudes.append(self.geo_coord[loc][0])
            longitudes.append(self.geo_coord[loc][1])
            time_code = int(cal_timeoff(now_time, base_time))
            if time_code > self.tim_max:
                self.tim_max = time_code
            time_codes.append(time_code)

        # trajectory is encoded; split it into history and target
        target = loc_ids[-1]
        target_tim = time_codes[-1]
        history_loc = loc_ids[:-1]
        history_tim = time_codes[:-1]
        history_len = len(history_loc)

        # broadcast the target's coordinates over the history length
        target_lati = np.array([self.geo_coord[raw_locations[-1]][0]] * history_len)
        target_longi = np.array([self.geo_coord[raw_locations[-1]][1]] * history_len)
        current_dis = euclidean_dist(target_lati - latitudes[:-1],
                                     target_longi - longitudes[:-1])

        encoded_trajectories.append(
            [history_loc, history_tim, target, target_tim, uid, current_dis])
    return encoded_trajectories
def _cal_mat1(self, current_tim):
    """Build the pairwise temporal relation matrix of one trajectory.

    Args:
        current_tim: sequence of time values accepted by ``cal_timeoff``.

    Returns:
        numpy.ndarray: ``(self.max_len, self.max_len)`` matrix whose entry
        ``[i][j]`` is ``abs(cal_timeoff(current_tim[i], current_tim[j]))`` for
        the positions covered by ``current_tim``; all other entries stay 0.

    Side effects:
        Raises ``self.ex[3]`` to the largest offset seen, if it is exceeded.
    """
    relation = np.zeros((self.max_len, self.max_len))
    for row_pos, row_time in enumerate(current_tim):
        for col_pos, col_time in enumerate(current_tim):
            gap = abs(cal_timeoff(row_time, col_time))
            relation[row_pos][col_pos] = gap
            # track the running maximum offset
            if gap > self.ex[3]:
                self.ex[3] = gap
    return relation
def cutter_filter(self):
    """Cut every user's check-ins into sessions and filter out sparse data.

    Reads ``<dataset>.dyna`` from ``self.data_path``, drops locations with
    fewer than ``config['min_checkins']`` records, then splits each user's
    remaining records into sessions according to ``config['cut_method']``:

    * ``'time_interval'``: a record joins the current session while its
      ``cal_timeoff`` from the previous record lies in ``[0, window_size)``
      and the session holds fewer than ``max_session_len`` records.
    * ``'same_date'``: a record joins the current session while it falls on
      the same calendar date as the previous record and the session holds
      fewer than ``max_session_len`` records.
    * anything else: fixed-length windows of ``window_size`` records.

    Sessions shorter than ``min_session_len`` are dropped (a full fixed-length
    window is always kept), and users left with fewer than ``min_sessions``
    sessions are dropped.

    Records are taken in file order via ``to_numpy()``; position 2 of each
    record is read as its time (assumes ``time`` is the third column of the
    .dyna file -- TODO confirm against the dataset schema).

    Returns:
        dict: ``{str(uid): [[checkin_record, ...], ...], ...}``

    Raises:
        ValueError: in fixed-length mode when ``max_session_len`` differs
            from ``window_size``.
    """
    # load data according to config
    traj = pd.read_csv(os.path.join(
        self.data_path, '{}.dyna'.format(self.config['dataset'])))
    # filter inactive poi
    group_location = traj.groupby('location').count()
    filter_location = group_location[group_location['time'] >= self.config['min_checkins']]
    location_index = filter_location.index.tolist()
    traj = traj[traj['location'].isin(location_index)]
    user_set = pd.unique(traj['entity_id'])
    res = {}
    min_session_len = self.config['min_session_len']
    max_session_len = self.config['max_session_len']
    min_sessions = self.config['min_sessions']
    window_size = self.config['window_size']
    cut_method = self.config['cut_method']
    if cut_method == 'time_interval':
        # cut whenever the gap to the previous record leaves the time window
        for uid in tqdm(user_set, desc="cut and filter trajectory"):
            usr_traj = traj[traj['entity_id'] == uid].to_numpy()
            sessions = []  # all sessions of this user
            session = []  # the session currently being built
            prev_time = None
            for index, row in enumerate(usr_traj):
                now_time = parse_time(row[2])
                if index > 0:
                    time_off = cal_timeoff(now_time, prev_time)
                    in_window = 0 <= time_off < window_size
                    if not (in_window and len(session) < max_session_len):
                        # close the current session and start a new one
                        if len(session) >= min_session_len:
                            sessions.append(session)
                        session = []
                session.append(row.tolist())
                prev_time = now_time
            if len(session) >= min_session_len:
                sessions.append(session)
            if len(sessions) >= min_sessions:
                res[str(uid)] = sessions
    elif cut_method == 'same_date':
        # group the check-ins of one calendar day into one session
        for uid in tqdm(user_set, desc="cut and filter trajectory"):
            usr_traj = traj[traj['entity_id'] == uid].to_numpy()
            sessions = []  # all sessions of this user
            session = []  # the session currently being built
            prev_date = None
            for index, row in enumerate(usr_traj):
                now_time = parse_time(row[2])
                # Compare the full calendar date: the day of month alone
                # would merge e.g. Jan 5 and Feb 5 into the same session.
                now_date = (now_time.year, now_time.month, now_time.day)
                if index > 0:
                    same_day = prev_date == now_date
                    if not (same_day and len(session) < max_session_len):
                        if len(session) >= min_session_len:
                            sessions.append(session)
                        session = []
                session.append(row.tolist())
                prev_date = now_date
            if len(session) >= min_session_len:
                sessions.append(session)
            if len(sessions) >= min_sessions:
                res[str(uid)] = sessions
    else:
        # cut by fix window_len used by STAN
        if max_session_len != window_size:
            raise ValueError('the fixed length window is not equal to max_session_len')
        for uid in tqdm(user_set, desc="cut and filter trajectory"):
            usr_traj = traj[traj['entity_id'] == uid].to_numpy()
            sessions = []  # all sessions of this user
            session = []  # the session currently being built
            for row in usr_traj:
                if len(session) >= window_size:
                    # a full window is always kept
                    sessions.append(session)
                    session = []
                session.append(row.tolist())
            if len(session) >= min_session_len:
                sessions.append(session)
            if len(sessions) >= min_sessions:
                res[str(uid)] = sessions
    return res
def _load_dyna(self):
    """Load ``<dataset>.dyna`` and group its records into trajectories.

    Reads ``self.data_path``, ``self.config`` and ``self.need_cut``, logs the
    loaded shape through ``self._logger`` and stores the column-name ->
    column-position mapping of the file in ``self.dyna_feature_column``.

    If ``self.need_cut`` is true, each user's records are sorted by ``time``
    and cut into sessions according to ``config['cut_method']``:

    * ``'time_interval'``: a record joins the current session while its
      ``cal_timeoff`` from the previous record lies in ``[0, window_size)``
      and the session holds fewer than ``max_session_len`` records.
    * ``'same_date'``: a record joins the current session while it falls on
      the same calendar date as the previous record and the session holds
      fewer than ``max_session_len`` records.
    * anything else: fixed-length windows of ``window_size`` records.

    Every record is stamped with a ``traj_id`` (the running number of the
    kept session it belongs to) before being converted to a list. Sessions
    shorter than ``min_session_len`` are dropped (a full fixed-length window
    is always kept) and users with fewer than ``min_sessions`` sessions are
    dropped. Result keys are ``str(uid)``.

    If ``self.need_cut`` is false, the file must already contain a
    ``traj_id`` column; records are grouped by ``(entity_id, traj_id)`` in
    order of first appearance and sorted by ``time`` inside each group.
    Result keys are the raw ``entity_id`` values.

    Returns:
        dict: ``{uid: [[dyna_record, ...], ...], ...}``

    Raises:
        ValueError: in fixed-length mode when ``max_session_len`` differs
            from ``window_size``.
    """
    # load data according to config
    dyna_file = pd.read_csv(os.path.join(
        self.data_path, '{}.dyna'.format(self.config['dataset'])))
    self._logger.info("Loaded file " + self.config['dataset'] + '.dyna, shape=' + str(dyna_file.shape))
    self.dyna_feature_column = {col: i for i, col in enumerate(dyna_file)}
    res = dict()
    if self.need_cut:
        user_set = pd.unique(dyna_file['entity_id'])
        min_session_len = self.config['min_session_len']
        max_session_len = self.config['max_session_len']
        min_sessions = self.config['min_sessions']
        window_size = self.config['window_size']
        cut_method = self.config['cut_method']
        if cut_method == 'time_interval':
            # cut whenever the gap to the previous record leaves the window
            for uid in tqdm(user_set, desc="cut and filter trajectory"):
                usr_traj = dyna_file[dyna_file['entity_id'] == uid]
                usr_traj = usr_traj.sort_values(by='time')
                # positional index, so that index 0 is the first record
                usr_traj = usr_traj.reset_index(drop=True)
                sessions = []  # all sessions of this user
                traj_id = 0
                session = []  # the session currently being built
                prev_time = None
                for index, row in usr_traj.iterrows():
                    now_time = parse_time(row['time'])
                    if index > 0:
                        time_off = cal_timeoff(now_time, prev_time)
                        in_window = 0 <= time_off < window_size
                        if not (in_window and len(session) < max_session_len):
                            # close the current session and start a new one
                            if len(session) >= min_session_len:
                                sessions.append(session)
                                traj_id += 1
                            session = []
                    # Stamp only after the session decision so the first
                    # record of a new session carries the new traj_id.
                    row['traj_id'] = traj_id
                    session.append(row.tolist())
                    prev_time = now_time
                if len(session) >= min_session_len:
                    sessions.append(session)
                if len(sessions) >= min_sessions:
                    res[str(uid)] = sessions
        elif cut_method == 'same_date':
            # group the records of one calendar day into one session
            for uid in tqdm(user_set, desc="cut and filter trajectory"):
                usr_traj = dyna_file[dyna_file['entity_id'] == uid]
                usr_traj = usr_traj.sort_values(by='time')
                usr_traj = usr_traj.reset_index(drop=True)
                sessions = []  # all sessions of this user
                traj_id = 0
                session = []  # the session currently being built
                prev_date = None
                for index, row in usr_traj.iterrows():
                    now_time = parse_time(row['time'])
                    # Compare the full calendar date: the day of month alone
                    # would merge e.g. Jan 5 and Feb 5 into the same session.
                    now_date = (now_time.year, now_time.month, now_time.day)
                    if index > 0:
                        same_day = prev_date == now_date
                        if not (same_day and len(session) < max_session_len):
                            if len(session) >= min_session_len:
                                sessions.append(session)
                                traj_id += 1
                            session = []
                    row['traj_id'] = traj_id
                    session.append(row.tolist())
                    prev_date = now_date
                if len(session) >= min_session_len:
                    sessions.append(session)
                if len(sessions) >= min_sessions:
                    res[str(uid)] = sessions
        else:
            # cut by fix window_len used by STAN
            if max_session_len != window_size:
                raise ValueError('the fixed length window is not equal to max_session_len')
            for uid in tqdm(user_set, desc="cut and filter trajectory"):
                usr_traj = dyna_file[dyna_file['entity_id'] == uid]
                usr_traj = usr_traj.sort_values(by='time')
                usr_traj = usr_traj.reset_index(drop=True)
                sessions = []  # all sessions of this user
                traj_id = 0
                session = []  # the session currently being built
                for index, row in usr_traj.iterrows():
                    if len(session) >= window_size:
                        # a full window is always kept
                        sessions.append(session)
                        traj_id += 1
                        session = []
                    row['traj_id'] = traj_id
                    session.append(row.tolist())
                if len(session) >= min_session_len:
                    sessions.append(session)
                if len(sessions) >= min_sessions:
                    res[str(uid)] = sessions
    else:
        # the file is already cut: group by (entity_id, traj_id)
        id_set = set()
        for dyna in dyna_file.itertuples():
            entity_id = getattr(dyna, "entity_id")
            traj_id = getattr(dyna, "traj_id")
            if (entity_id, traj_id) in id_set:
                continue
            id_set.add((entity_id, traj_id))

            if entity_id not in res:
                res[entity_id] = []
            rows = dyna_file[(dyna_file['entity_id'] == entity_id) & (dyna_file['traj_id'] == traj_id)]
            rows = rows.sort_values(by='time')
            traj = [row.tolist() for _, row in rows.iterrows()]
            res[entity_id].append(traj)
    return res
def cutter_filter(self):
    """Cut every user's check-ins into sessions and filter out sparse data.

    Reads ``<dataset>.dyna`` from ``self.data_path``, keeps only locations
    with more than ``config['min_checkins']`` records, then splits each
    user's records (in file order) according to ``config['window_type']``:

    * ``'time_window'``: a record joins the current session while its
      ``cal_timeoff`` from the session's base time lies in
      ``[0, window_size)``; otherwise a new session starts and the base time
      is recomputed from that record with ``cal_basetime``.
    * anything else: fixed-length windows of ``window_size`` records.

    Sessions shorter than ``min_session_len`` are dropped (a full
    fixed-length window is always kept). Users with at least
    ``min_sessions`` sessions are stored, and element 4 of each of their
    records is counted in ``self.counter``.

    Returns:
        dict: ``{str(uid): [[checkin_record, ...], ...], ...}``
    """
    # load data according to config
    dyna_path = os.path.join(self.data_path, '{}.dyna'.format(self.config['dataset']))
    traj = pd.read_csv(dyna_path)
    # filter inactive poi
    records_per_location = traj.groupby('location').count()
    active_locations = records_per_location[
        records_per_location['time'] > self.config['min_checkins']]
    traj = traj[traj['location'].isin(active_locations.index.tolist())]
    user_set = pd.unique(traj['entity_id'])
    res = {}
    min_session_len = self.config['min_session_len']
    min_sessions = self.config['min_sessions']
    window_size = self.config['window_size']
    window_type = self.config['window_type']

    def finish_user(uid, sessions, session):
        # flush the trailing session, then keep the user if active enough
        if len(session) >= min_session_len:
            sessions.append(session)
        if len(sessions) < min_sessions:
            return
        # update counter
        for kept in sessions:
            for record in kept:
                self.counter.update({record[4]: 1})
        res[str(uid)] = sessions

    if window_type == 'time_window':
        # cut by time window
        base_zero = window_size > 12
        for uid in tqdm(user_set, desc="cut and filter trajectory"):
            usr_traj = traj[traj['entity_id'] == uid]
            sessions = []  # all sessions of this user
            session = []  # the session currently being built
            # local time is used here
            start_time = parse_time(usr_traj.iloc[0]['time'])
            base_time = cal_basetime(start_time, base_zero)
            # NOTE(review): `index` is the DataFrame label (the frame is not
            # re-indexed after filtering), so `index == 0` matches only the
            # record labelled 0, not each user's first record -- confirm
            # whether that is intended.
            for index, row in usr_traj.iterrows():
                if index == 0:
                    assert start_time.hour - base_time.hour < window_size
                    session.append(row.tolist())
                    continue
                now_time = parse_time(row['time'])
                time_off = cal_timeoff(now_time, base_time)
                if 0 <= time_off < window_size:
                    session.append(row.tolist())
                    continue
                # outside the window: close the session and re-base
                if len(session) >= min_session_len:
                    sessions.append(session)
                session = []
                start_time = now_time
                base_time = cal_basetime(start_time, base_zero)
                session.append(row.tolist())
            finish_user(uid, sessions, session)
    else:
        # cut by trajectory length
        for uid in tqdm(user_set, desc="cut and filter trajectory"):
            usr_traj = traj[traj['entity_id'] == uid]
            sessions = []  # all sessions of this user
            session = []  # the session currently being built
            for _, row in usr_traj.iterrows():
                if len(session) >= window_size:
                    # a full window is always kept
                    sessions.append(session)
                    session = []
                session.append(row.tolist())
            finish_user(uid, sessions, session)
    return res