def encode(self, uid, trajectories):
    """Standard encoder that uses the same method as DeepMove.

    Recode poi id. Encode timestamp with its hour offset from the day's base
    time. In addition, the distance from each input point to the last point of
    the current trajectory is appended as a feature.

    Args:
        uid ([type]): same as AbstractTrajectoryEncoder
        trajectories ([type]): same as AbstractTrajectoryEncoder
            trajectory1 = [
                (location ID, timestamp, timezone_offset_in_minutes),
                (location ID, timestamp, timezone_offset_in_minutes),
                .....
            ]
    """
    # re-encode uid directly
    uid = self.uid
    self.uid += 1
    encoded_trajectories = []
    history_loc = []
    history_tim = []
    for index, traj in enumerate(trajectories):
        current_loc = []
        current_tim = []
        current_longi = []
        current_lati = []
        current_points = []
        start_time = parse_time(traj[0][1], traj[0][2])
        # use midnight of the start day as the base for computing time_off
        base_time = cal_basetime(start_time, True)
        for point in traj:
            loc = point[0]
            now_time = parse_time(point[1], point[2])
            if loc not in self.location2id:
                self.location2id[loc] = self.loc_id
                self.loc_id += 1
            current_points.append(loc)
            current_loc.append(self.location2id[loc])
            current_lati.append(self.geo_coord[loc][0])
            current_longi.append(self.geo_coord[loc][1])
            time_code = int(cal_timeoff(now_time, base_time))
            if time_code > self.tim_max:
                self.tim_max = time_code
            current_tim.append(time_code)
        # the current trajectory is encoded; now build the model input
        trace = []
        target = current_loc[-1]
        target_tim = current_tim[-1]
        current_loc = current_loc[:-1]
        current_tim = current_tim[:-1]
        # geo_coord is keyed by the raw location id (see the loop above),
        # so look up the last point with its raw id
        lati = self.geo_coord[current_points[-1]][0]
        lati = np.array([lati for i in range(len(current_loc))])
        longi = self.geo_coord[current_points[-1]][1]
        longi = np.array([longi for i in range(len(current_loc))])
        current_dis = euclidean_dist(lati - current_lati[:-1],
                                     longi - current_longi[:-1])
        trace.append(history_loc)
        trace.append(history_tim)
        trace.append(current_loc)
        trace.append(current_tim)
        trace.append(target)
        trace.append(target_tim)
        trace.append(uid)
        trace.append(current_dis)
        encoded_trajectories.append(trace)
        if self.history_type == 'splice':
            history_loc += current_loc
            history_tim += current_tim
        else:
            history_loc.append(current_loc)
            history_tim.append(current_tim)
    return encoded_trajectories
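# The encoders in this file rely on a few time/geometry helpers (parse_time,
# cal_basetime, cal_timeoff, euclidean_dist) imported from the project's utils.
# A minimal sketch of plausible implementations is given below for reference;
# the real utils module may differ in signatures and edge cases. Assumption:
# timestamps follow the ISO 'YYYY-MM-DDTHH:MM:SSZ' format used by the .dyna files.
from datetime import datetime, timedelta

import numpy as np


def parse_time(time_str, timezone_offset_in_minutes=0):
    """Parse an ISO-8601 UTC timestamp and shift it to local time (sketch)."""
    utc_time = datetime.strptime(time_str, '%Y-%m-%dT%H:%M:%SZ')
    return utc_time + timedelta(minutes=timezone_offset_in_minutes)


def cal_basetime(start_time, base_zero):
    """Return the base time of the day containing start_time (sketch).

    With base_zero=True the base is midnight; otherwise the day is split in
    half and the base is midnight or noon, whichever precedes start_time.
    """
    if base_zero or start_time.hour < 12:
        return start_time.replace(hour=0, minute=0, second=0, microsecond=0)
    return start_time.replace(hour=12, minute=0, second=0, microsecond=0)


def cal_timeoff(now_time, base_time):
    """Hours elapsed between base_time and now_time (sketch)."""
    return (now_time - base_time).total_seconds() / 3600


def euclidean_dist(delta_lat, delta_lon):
    """Element-wise Euclidean distance of coordinate deltas (sketch)."""
    return np.sqrt(np.square(delta_lat) + np.square(delta_lon))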
def encode(self, uid, trajectories):
    """Standard encoder that uses the same method as DeepMove.

    Recode poi id. Encode timestamp with its hour offset from the day's base time.

    Args:
        uid ([type]): same as AbstractTrajectoryEncoder
        trajectories ([type]): same as AbstractTrajectoryEncoder
            trajectory1 = [
                (location ID, timestamp, timezone_offset_in_minutes),
                (location ID, timestamp, timezone_offset_in_minutes),
                .....
            ]
    """
    # re-encode uid directly
    uid = self.uid
    self.uid += 1
    encoded_trajectories = []
    history_loc = []
    history_tim = []
    for index, traj in enumerate(trajectories):
        current_loc = []
        current_tim = []
        start_time = parse_time(traj[0][1], traj[0][2])
        # use midnight of the start day as the base for computing time_off
        base_time = cal_basetime(start_time, True)
        for point in traj:
            loc = point[0]
            now_time = parse_time(point[1], point[2])
            if loc not in self.location2id:
                self.location2id[loc] = self.loc_id
                self.loc_id += 1
            current_loc.append(self.location2id[loc])
            time_code = int(cal_timeoff(now_time, base_time))
            if time_code > self.tim_max:
                self.tim_max = time_code
            current_tim.append(time_code)
        # the current trajectory is encoded; now build the model input
        if index == 0:
            # history features are required, so the first trajectory
            # cannot form a model input on its own
            if self.history_type == 'splice':
                history_loc += current_loc
                history_tim += current_tim
            else:
                history_loc.append(current_loc)
                history_tim.append(current_tim)
            continue
        trace = []
        target = current_loc[-1]
        target_tim = current_tim[-1]
        current_loc = current_loc[:-1]
        current_tim = current_tim[:-1]
        trace.append(history_loc)
        trace.append(history_tim)
        trace.append(current_loc)
        trace.append(current_tim)
        trace.append(target)
        trace.append(target_tim)
        trace.append(uid)
        encoded_trajectories.append(trace)
        if self.history_type == 'splice':
            history_loc += current_loc
            history_tim += current_tim
        else:
            history_loc.append(current_loc)
            history_tim.append(current_tim)
    return encoded_trajectories
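# Hedged sketch of how a downstream batch builder might consume one trace
# produced by the DeepMove-style encode() above (the function name and dict
# keys below are illustrative, not part of the original code). Each trace is
# [history_loc, history_tim, current_loc, current_tim, target, target_tim, uid];
# the distance-aware variant further up appends current_dis as an eighth element.
def unpack_trace(trace):
    """Unpack one 7-element trace returned by the standard encoder (sketch)."""
    history_loc, history_tim, current_loc, current_tim, target, target_tim, uid = trace
    return {
        'history_loc': history_loc,
        'history_tim': history_tim,
        'current_loc': current_loc,
        'current_tim': current_tim,
        'target': target,
        'target_tim': target_tim,
        'uid': uid,
    }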
def cutter_filter(self):
    """Storage format of the cut trajectories: (dict)
        {
            uid: [
                [
                    (location ID, timestamp, timezone_offset_in_minutes),
                    (location ID, timestamp, timezone_offset_in_minutes),
                    ...
                ],
                [
                    (location ID, timestamp, timezone_offset_in_minutes),
                    (location ID, timestamp, timezone_offset_in_minutes),
                    ...
                ],
                ...
            ],
            ...
        }
    """
    # load data according to config
    traj = pd.read_csv(
        os.path.join(self.data_path,
                     '{}.dyna'.format(self.config['dataset'])))
    user_set = pd.unique(traj['entity_id'])
    res = {}
    min_session_len = self.config['min_session_len']
    min_sessions = self.config['min_sessions']
    window_size = self.config['window_size']
    window_type = self.config['window_type']
    if window_type == 'time_window':
        # cut trajectories by time window
        base_zero = window_size > 12
        for uid in user_set:
            usr_traj = traj[traj['entity_id'] == uid]
            sessions = []  # all sessions of this user
            session = []  # a single session (sub-trajectory)
            # use local time here
            start_time = parse_time(
                usr_traj.iloc[0]['time'],
                int(usr_traj.iloc[0]['timezone_offset_in_minutes']))
            base_time = cal_basetime(start_time, base_zero)
            for index, row in usr_traj.iterrows():
                if index == 0:
                    assert start_time.hour - base_time.hour < window_size
                    session.append((row['location'], row['time'],
                                    row['timezone_offset_in_minutes']))
                else:
                    now_time = parse_time(
                        row['time'],
                        int(row['timezone_offset_in_minutes']))
                    time_off = cal_timeoff(now_time, base_time)
                    if time_off < window_size and time_off >= 0:
                        session.append((row['location'], row['time'],
                                        row['timezone_offset_in_minutes']))
                    else:
                        if len(session) >= min_session_len:
                            sessions.append(session)
                        session = []
                        start_time = now_time
                        base_time = cal_basetime(start_time, base_zero)
                        session.append((row['location'], row['time'],
                                        row['timezone_offset_in_minutes']))
            if len(session) >= min_session_len:
                sessions.append(session)
            if len(sessions) >= min_sessions:
                res[uid] = sessions
    else:
        # cut trajectories by length
        for uid in user_set:
            usr_traj = traj[traj['entity_id'] == uid]
            sessions = []  # all sessions of this user
            session = []  # a single session (sub-trajectory)
            for index, row in usr_traj.iterrows():
                if len(session) < window_size:
                    session.append((row['location'], row['time'],
                                    row['timezone_offset_in_minutes']))
                else:
                    sessions.append(session)
                    session = []
                    session.append((row['location'], row['time'],
                                    row['timezone_offset_in_minutes']))
            if len(session) >= min_session_len:
                sessions.append(session)
            if len(sessions) >= min_sessions:
                res[uid] = sessions
    return res
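# Illustrative sketch of the time-window branch above, written as a small
# standalone function so the splitting rule is easy to follow. It reuses the
# sketched helpers (parse_time, cal_basetime, cal_timeoff) from earlier in this
# file and is not the dataset class itself; default parameter values are
# assumptions for illustration.
def split_by_time_window(points, window_size=24, min_session_len=2):
    """points: list of (location, iso_time, tz_offset_minutes) tuples (sketch)."""
    sessions, session = [], []
    base_time = None
    for loc, t, off in points:
        now_time = parse_time(t, off)
        if base_time is None:
            # base time of the first point of the current session
            base_time = cal_basetime(now_time, window_size > 12)
        time_off = cal_timeoff(now_time, base_time)
        if 0 <= time_off < window_size:
            session.append((loc, t, off))
        else:
            # close the current session and start a new one from this point
            if len(session) >= min_session_len:
                sessions.append(session)
            base_time = cal_basetime(now_time, window_size > 12)
            session = [(loc, t, off)]
    if len(session) >= min_session_len:
        sessions.append(session)
    return sessions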
def cutter_filter(self):
    """Storage format of the cut trajectories: (dict)

    Semantic information is also needed, so the useful words of every
    point are attached to it:
        {
            uid: [
                [
                    [loc, tim, [useful word list]],
                    [loc, tim, [useful word list]],
                    ...
                ],
                [
                    [loc, tim, [useful word list]],
                    [loc, tim, [useful word list]],
                    ...
                ],
                ...
            ],
            ...
        }
    """
    # load data according to config
    traj = pd.read_csv(
        os.path.join(self.data_path,
                     '{}.dyna'.format(self.config['dataset'])))
    poi = pd.read_csv(
        os.path.join(self.data_path,
                     '{}.geo'.format(self.config['dataset'])))
    # collect the corpus words that actually appear in the trajectory dataset
    useful_vec = {}
    text_vec = self.load_wordvec()  # load the word-vector corpus
    user_set = pd.unique(traj['entity_id'])
    res = {}
    min_session_len = self.config['min_session_len']
    min_sessions = self.config['min_sessions']
    time_window_size = 24  # the SERM paper uses its own time encoding
    base_zero = time_window_size > 12
    useful_uid = 0  # some users are filtered out, so uids are re-numbered
    useful_loc = {}  # the same holds for locations
    loc_id = 0
    for uid in user_set:
        usr_traj = traj[traj['entity_id'] == uid]
        sessions = []  # all sessions of this user
        session = []  # a single session (sub-trajectory)
        # use local time here
        start_time = parse_time(
            usr_traj.iloc[0]['time'],
            int(usr_traj.iloc[0]['timezone_offset_in_minutes']))
        base_time = cal_basetime(start_time, base_zero)
        for index, row in usr_traj.iterrows():
            if index == 0:
                assert start_time.hour - base_time.hour < time_window_size
                # handle the semantic information of the first point
                useful_words_list = []
                if self.config['dataset'] in [
                        'foursquare_tky', 'foursquare_nyk'
                ]:
                    # TODO: this hard-coding of dataset names is not ideal
                    words = poi.iloc[
                        row['location']]['venue_category_name'].split(' ')
                    for w in words:
                        w = w.lower()
                        if (w in text_vec) and (w not in useful_vec):
                            useful_vec[w] = text_vec[w]
                        if w in useful_vec:
                            useful_words_list.append(w)
                time_code = start_time.hour - base_time.hour
                if start_time.weekday() == 5 or start_time.weekday() == 6:
                    time_code += 24
                session.append(
                    [row['location'], time_code, useful_words_list])
            else:
                now_time = parse_time(
                    row['time'], int(row['timezone_offset_in_minutes']))
                time_off = cal_timeoff(now_time, base_time)
                # handle the semantic information of this point
                useful_words_list = []
                if self.config['dataset'] in [
                        'foursquare_tky', 'foursquare_nyk'
                ]:
                    # TODO: this hard-coding of dataset names is not ideal
                    words = poi.iloc[
                        row['location']]['venue_category_name'].split(' ')
                    for w in words:
                        w = w.lower()
                        if (w in text_vec) and (w not in useful_vec):
                            useful_vec[w] = text_vec[w]
                        if w in useful_vec:
                            useful_words_list.append(w)
                if time_off < time_window_size and time_off >= 0:
                    # SERM-specific time encoding: weekend hours are shifted by 24
                    time_code = int(time_off)
                    if now_time.weekday() in [5, 6]:
                        time_code += 24
                    assert int(time_off) < time_window_size
                    session.append(
                        [row['location'], time_code, useful_words_list])
                else:
                    if len(session) >= min_session_len:
                        sessions.append(session)
                    session = []
                    start_time = now_time
                    base_time = cal_basetime(start_time, base_zero)
                    time_code = start_time.hour - base_time.hour
                    if start_time.weekday() in [5, 6]:
                        time_code += 24
                    session.append(
                        [row['location'], time_code, useful_words_list])
        if len(session) >= min_session_len:
            sessions.append(session)
        if len(sessions) >= min_sessions:
            # only now is it certain that every loc in sessions will be used
            for i in range(len(sessions)):
                for j in range(len(sessions[i])):
                    loc = sessions[i][j][0]
                    if loc not in useful_loc:
                        useful_loc[loc] = loc_id
                        loc_id += 1
                    sessions[i][j][0] = useful_loc[loc]
            res[useful_uid] = sessions
            useful_uid += 1
    # loc_size and uid_size count only the locations and users
    # that survive the filtering above
    loc_size = loc_id
    uid_size = useful_uid
    # build word_index and word_vec from useful_vec
    word_index = {}
    word_vec = []
    text_size = len(useful_vec)
    for i, w in enumerate(useful_vec.keys()):
        word_index[w] = i
        word_vec.append(useful_vec[w])
    print('loc_size: {}, uid_size: {}, text_size: {}'.format(
        loc_size, uid_size, text_size))
    return {
        'loc_size': loc_size,
        'tim_size': 48,
        'uid_size': uid_size,
        'text_size': text_size,
        'word_vec': word_vec,
        'word_index': word_index,
        'data': res
    }
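# The SERM-style cutter_filter above calls self.load_wordvec() to obtain a
# {word: vector} mapping for the POI category names. A minimal sketch is given
# below, assuming a GloVe-style plain-text embedding file (one token followed
# by its float components per line); the file name, path, and exact format used
# by the real dataset class are assumptions for illustration.
import numpy as np


def load_wordvec(wordvec_path='glove.6B.50d.txt'):
    """Load a plain-text word-embedding file into a dict (sketch)."""
    text_vec = {}
    with open(wordvec_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            # first token is the word, the remaining tokens are the embedding
            text_vec[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return text_vec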
def cutter_filter(self):
    """Storage format of the cut trajectories: (dict)
        {
            uid: [
                [
                    [loc, tim],
                    [loc, tim],
                    ...
                ],
                [
                    [loc, tim],
                    [loc, tim],
                    ...
                ],
                ...
            ],
            ...
        }
    """
    # load data according to config
    traj = pd.read_csv(
        os.path.join(self.data_path,
                     '{}.dyna'.format(self.config['dataset'])))
    user_set = pd.unique(traj['entity_id'])
    res = {}
    min_session_len = self.config['min_session_len']
    min_sessions = self.config['min_sessions']
    time_window_size = self.config['time_window_size']
    base_zero = time_window_size > 12
    for uid in user_set:
        usr_traj = traj[traj['entity_id'] == uid]
        sessions = []  # all sessions of this user
        session = []  # a single session (sub-trajectory)
        # use local time here
        start_time = parse_time(
            usr_traj.iloc[0]['time'],
            int(usr_traj.iloc[0]['timezone_offset_in_minutes']))
        base_time = cal_basetime(start_time, base_zero)
        for index, row in usr_traj.iterrows():
            if index == 0:
                assert start_time.hour - base_time.hour < time_window_size
                # time is encoded as an offset in [0, time_window_size)
                session.append(
                    [row['location'], start_time.hour - base_time.hour])
            else:
                now_time = parse_time(
                    row['time'], int(row['timezone_offset_in_minutes']))
                time_off = cal_timeoff(now_time, base_time)
                if time_off < time_window_size and time_off >= 0:
                    assert int(time_off) < time_window_size
                    session.append([row['location'], int(time_off)])
                else:
                    if len(session) >= min_session_len:
                        sessions.append(session)
                    session = []
                    start_time = now_time
                    base_time = cal_basetime(start_time, base_zero)
                    session.append([
                        row['location'],
                        start_time.hour - base_time.hour
                    ])
        if len(session) >= min_session_len:
            sessions.append(session)
        if len(sessions) >= min_sessions:
            res[str(uid)] = sessions
    # loc_size and uid_size here may be larger than the number of locations
    # and users actually kept, because some of them are filtered out above
    poi = pd.read_csv(
        os.path.join(self.data_path,
                     '{}.geo'.format(self.config['dataset'])))
    loc_size = poi.shape[0]
    uid_size = len(user_set)
    print('loc_size: {}, uid_size: {}'.format(loc_size, uid_size))
    return {
        'loc_size': loc_size,
        'tim_size': time_window_size,
        'uid_size': uid_size,
        'data': res
    }
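# Hedged usage sketch for the cutter_filter above: the returned dict maps
# str(uid) to a list of sessions, each session being a list of [loc, tim]
# pairs. The small helper below (illustrative, not part of the original code)
# shows how that structure can be traversed, e.g. to count kept check-ins.
def count_checkins(encoded):
    """Count [loc, tim] records across the cut sessions (sketch)."""
    total = 0
    for sessions in encoded['data'].values():
        for session in sessions:
            total += len(session)
    return total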