def _create_dilated_rnn_input(self, current_loc):
    """Pick, for every visit, the geographically closest earlier visit.

    For each position ``p >= 1`` of ``current_loc`` the distance from that
    POI to every earlier POI is computed with ``geodistance`` and the index
    of the closest one is stored at ``result[p]``. Earlier visits are
    scanned from ``p - 1`` down to ``0`` and ``np.argmin`` keeps the first
    minimum, so ties go to the most recent earlier visit. ``result[0]`` is
    always ``0``.

    The input list is left untouched.

    Args:
        current_loc: list of encoded location ids (keys of
            ``self.id2location``).

    Returns:
        list[int] with the same length as ``current_loc``.
    """
    coordinates = {}

    def lat_lon(poi):
        # Every lookup compares the whole ``geo_id`` column, so resolve each
        # distinct POI only once per call.
        if poi not in coordinates:
            row = self.poi_profile.loc[
                self.poi_profile['geo_id'] == self.id2location[poi]].iloc[0]
            lon, lat = parse_coordinate(row['coordinates'])
            coordinates[poi] = (lat, lon)
        return coordinates[poi]

    sequence_length = len(current_loc)
    dilated_index = [0] * sequence_length
    for position in range(1, sequence_length):
        lat_cur, lon_cur = lat_lon(current_loc[position])
        distances = []
        # Most recent earlier visit first, so argmin's first-minimum rule
        # prefers it on ties.
        for earlier in range(position - 1, -1, -1):
            lat, lon = lat_lon(current_loc[earlier])
            distances.append(geodistance(lat_cur, lon_cur, lat, lon))
        offset = np.argmin(distances).item()
        # ``offset`` counts backwards from ``position - 1``.
        dilated_index[position] = position - 1 - offset
    return dilated_index
def _cal_poi_matrix(self):
    """Build the pairwise POI distance matrix from the dataset's .geo file.

    Reads ``./raw_data/<dataset>/<dataset>.geo``, computes ``haversine``
    between every ordered pair of encoded locations ``1 .. self.loc_id - 1``
    and raises ``self.ex[0]`` to the largest distance seen.

    Returns:
        list[list[float]]: square matrix where entry ``[i - 1][j - 1]`` is
        the distance between encoded locations ``i`` and ``j``.
    """
    poi_profile = pd.read_csv('./raw_data/{}/{}.geo'.format(
        self.config['dataset'], self.config['dataset']))
    poi_count = self.loc_id - 1
    # Parse each POI's coordinates once instead of once per pair.
    # NOTE(review): rows are selected by *position* (``iloc``) using the
    # value stored in ``id2location`` - this presumably relies on geo_id
    # being equal to the row position in the .geo file; confirm.
    coordinates = [
        parse_coordinate(
            poi_profile.iloc[self.id2location[i]]['coordinates'])
        for i in range(1, self.loc_id)
    ]
    mat = np.zeros((poi_count, poi_count))
    for i in tqdm(range(1, self.loc_id),
                  desc='calculate poi distance matrix'):
        lon_i, lat_i = coordinates[i - 1]
        for j in range(1, self.loc_id):
            lon_j, lat_j = coordinates[j - 1]
            dis = haversine(lon_i, lat_i, lon_j, lat_j)
            mat[i - 1][j - 1] = dis
            # Track the running maximum distance.
            if dis > self.ex[0]:
                self.ex[0] = dis
    return mat.tolist()
def calculate_loss(self, batch):
    """Compute the training loss for one batch.

    The score of a sample is the dot product between its user vector
    (``forward`` output plus ``permanet_weight(uid)``) and the embedding of
    its target location. Scores are summed over the batch and the loss is
    ``log(1 + exp(-sum))``, evaluated with ``softplus`` so that a large
    negative sum does not overflow to ``inf``.

    Args:
        batch: batch object providing ``'uid'``, ``'target'``,
            ``'target_tim'``, ``'current_loc'`` and ``'current_tim'`` tensors
            plus ``get_origin_len('current_loc')``.

    Returns:
        One-element tensor holding the loss. It stays on the device of the
        model outputs.
    """
    user = batch['uid']
    dst = batch['target'].tolist()
    dst_time = batch['target_tim']
    current_loc = batch['current_loc']
    current_tim = batch['current_tim']
    batch_size = len(dst)
    # td: time gap between the target and every visit of the history.
    td = dst_time.unsqueeze(1) - current_tim
    loc_len = batch.get_origin_len('current_loc')

    coordinates = {}

    def lat_lon(loc):
        # Resolve each distinct location once per batch.
        if loc not in coordinates:
            lon, lat = parse_coordinate(
                self.poi_profile.iloc[loc]['coordinates'])
            coordinates[loc] = (lat, lon)
        return coordinates[loc]

    # ld: distance (km) between the target and every visit of the history.
    # Filled on the CPU and moved to the device once, rather than issuing
    # one device write per element. Padded positions stay 0.
    ld = torch.zeros(current_loc.shape)
    history = current_loc.tolist()
    for i, target in enumerate(dst):
        target_point = lat_lon(target)
        for j in range(loc_len[i]):
            ld[i][j] = distance.distance(
                target_point, lat_lon(history[i][j])).kilometers
    ld = ld.to(self.device)

    td_upper = torch.LongTensor([self.up_time] * batch_size).to(
        self.device).unsqueeze(1)
    td_upper = td_upper - td
    td_lower = td  # the lower bound is 0
    ld_upper = torch.LongTensor([self.up_loc] * batch_size).to(
        self.device).unsqueeze(1)
    ld_upper = ld_upper - ld
    ld_lower = ld  # the lower bound is 0
    # batch_size * hidden_size
    h_tq = self.forward(td_upper, td_lower, ld_upper, ld_lower,
                        current_loc, loc_len)
    p_u = self.permanet_weight(user)  # batch_size * hidden_size
    q_v = self.location_weight(batch['target'])  # batch_size * hidden_size
    user_vector = h_tq + p_u
    # Sum of the per-sample dot products, kept as a one-element tensor.
    score = (user_vector * q_v).sum().reshape(1)
    return torch.nn.functional.softplus(torch.neg(score))
def predict(self, batch):
    """Score every location for each sample of the batch.

    Args:
        batch: batch object providing ``'uid'``, ``'target'``,
            ``'target_tim'``, ``'current_loc'`` and ``'current_tim'`` tensors
            plus ``get_origin_len('current_loc')``.

    Returns:
        Tensor of scores, ``batch_size * loc_size``: the user vector of each
        sample multiplied with the transposed location embedding matrix.
    """
    user = batch['uid']
    dst = batch['target'].tolist()
    dst_time = batch['target_tim']
    current_loc = batch['current_loc']
    current_tim = batch['current_tim']
    batch_size = len(dst)
    # td: time gap between the target and every visit of the history.
    td = dst_time.unsqueeze(1) - current_tim
    loc_len = batch.get_origin_len('current_loc')

    coordinates = {}

    def lat_lon(loc):
        # Resolve each distinct location once per batch.
        if loc not in coordinates:
            lon, lat = parse_coordinate(
                self.poi_profile.iloc[loc]['coordinates'])
            coordinates[loc] = (lat, lon)
        return coordinates[loc]

    # ld: distance (km) between the target and every visit of the history.
    # Filled on the CPU and moved to the device once, rather than issuing
    # one device write per element. Padded positions stay 0.
    ld = torch.zeros(current_loc.shape)
    history = current_loc.tolist()
    for i, target in enumerate(dst):
        target_point = lat_lon(target)
        for j in range(loc_len[i]):
            ld[i][j] = distance.distance(
                target_point, lat_lon(history[i][j])).kilometers
    ld = ld.to(self.device)

    td_upper = torch.LongTensor([self.up_time] * batch_size).to(
        self.device).unsqueeze(1)
    td_upper = td_upper - td
    td_lower = td  # the lower bound is 0
    ld_upper = torch.LongTensor([self.up_loc] * batch_size).to(
        self.device).unsqueeze(1)
    ld_upper = ld_upper - ld
    ld_lower = ld  # the lower bound is 0
    # batch_size * hidden_size
    h_tq = self.forward(td_upper, td_lower, ld_upper, ld_lower,
                        current_loc, loc_len)
    p_u = self.permanet_weight(user)  # batch_size * hidden_size
    user_vector = h_tq + p_u  # batch_size * hidden_size
    # Known limitation: user_vector is derived from the ground-truth target
    # (td / ld are measured against it), whereas each candidate location
    # should really get its own user_vector.
    # batch_size * loc_size
    return torch.mm(user_vector, self.location_weight.weight.T)
def _gen_distance_matrix(self, current_loc, history_loc_central):
    """Distance from the latest visit to each history-trajectory centre.

    The coordinates of ``current_loc[-1]`` are read from
    ``self.poi_profile`` and ``geodistance`` is evaluated against every
    entry of ``history_loc_central``. Distances below 1 are replaced by 1.

    Args:
        current_loc: non-empty list of encoded location ids.
        history_loc_central: sequence of centres; element ``[0]`` and
            ``[1]`` of each are passed to ``geodistance`` in that order.

    Returns:
        list with one clamped distance per history centre.
    """
    latest_geo_id = self.id2location[current_loc[-1]]
    matching_rows = self.poi_profile.loc[
        self.poi_profile['geo_id'] == latest_geo_id]
    lon_cur, lat_cur = parse_coordinate(
        matching_rows.iloc[0]['coordinates'])
    raw_distances = (
        geodistance(center[0], center[1], lat_cur, lon_cur)
        for center in history_loc_central
    )
    # Clamp from below at 1.
    return [1 if dis < 1 else dis for dis in raw_distances]
def encode(self, uid, trajectories, negative_sample=None):
    """Encode one user's trajectories into model samples.

    Location ids are re-coded to consecutive integers, timestamps are
    encoded with ``self._time_encode`` and the incoming ``uid`` is replaced
    by an internal running counter (``self.uid``).

    The first trajectory only seeds the history. Every later trajectory
    yields one sample per prefix ``current_loc[:i + 1]`` whose target is
    ``current_loc[i + 1]``.

    Args:
        uid: ignored; overwritten with ``self.uid``.
        trajectories: sequence of trajectories, each a sequence of points.
            Only ``point[2]`` (handed to ``parse_time``) and ``point[4]``
            (the raw location id) are read.
        negative_sample: optional; ``negative_sample[index]`` lists raw
            location ids used as negatives for trajectory ``index``.

    Returns:
        list of samples. Each sample is a list
        ``[history_loc, history_tim, current_loc, current_tim,
        dilated_rnn_input_index, history_avg_distance, target, uid]``,
        followed by the encoded negatives when ``negative_sample`` is given.
    """
    # Re-code the user id directly.
    uid = self.uid
    self.uid += 1
    encoded_trajectories = []
    history_loc = []
    history_loc_central = []
    history_tim = []
    for index, traj in enumerate(trajectories):
        current_loc = []
        current_tim = []
        for point in traj:
            loc = point[4]
            now_time = parse_time(point[2])
            if loc not in self.location2id:
                self.location2id[loc] = self.loc_id
                self.id2location[self.loc_id] = loc
                self.loc_id += 1
            current_loc.append(self.location2id[loc])
            time_code = self._time_encode(now_time)
            current_tim.append(time_code)
            if time_code not in self.time_checkin_set:
                self.time_checkin_set[time_code] = set()
            self.time_checkin_set[time_code].add(self.location2id[loc])
        # The trajectory is encoded; now build the model inputs. History
        # features are required, so the first trajectory yields no sample.
        if index > 0:
            # One trajectory produces several samples.
            for i in range(len(current_loc) - 1):
                target = current_loc[i + 1]
                dilated_rnn_input_index = self._create_dilated_rnn_input(
                    current_loc[:i + 1])
                history_avg_distance = self._gen_distance_matrix(
                    current_loc[:i + 1], history_loc_central)
                trace = [
                    history_loc.copy(),
                    history_tim.copy(),
                    current_loc[:i + 1],
                    current_tim[:i + 1],
                    dilated_rnn_input_index,
                    history_avg_distance,
                    target,
                    uid,
                ]
                if negative_sample is not None:
                    neg_loc = []
                    for neg in negative_sample[index]:
                        if neg not in self.location2id:
                            self.location2id[neg] = self.loc_id
                            # Keep the reverse mapping in sync so every id
                            # below ``loc_id`` can be resolved.
                            self.id2location[self.loc_id] = neg
                            self.loc_id += 1
                        neg_loc.append(self.location2id[neg])
                    trace.append(neg_loc)
                encoded_trajectories.append(trace)
        # The trajectory becomes part of the history for the next ones.
        history_loc.append(current_loc)
        history_tim.append(current_tim)
        # Centre of the trajectory: mean latitude / longitude of its visits.
        lon = []
        lat = []
        for poi in current_loc:
            lon_cur, lat_cur = parse_coordinate(self.poi_profile.loc[
                self.poi_profile['geo_id'] ==
                self.id2location[poi]].iloc[0]['coordinates'])
            lon.append(lon_cur)
            lat.append(lat_cur)
        history_loc_central.append((np.mean(lat), np.mean(lon)))
    return encoded_trajectories