def encode(self, uid, trajectories, negative_sample=None):
    """Encode one user's trajectories into next-location samples.

    Raw location ids are re-coded to consecutive integers through
    ``self.location2id`` and every timestamp is turned into an integer
    offset from a per-trajectory base time. The last point of each
    trajectory is the prediction target; all earlier points are the input.

    Side effects: advances ``self.uid`` and ``self.loc_id``, extends
    ``self.location2id`` and raises ``self.tim_max`` to the largest time
    code seen.

    Args:
        uid: original user id. It is ignored; the user is re-numbered
            from the running counter ``self.uid``.
        trajectories: list of trajectories, each a non-empty sequence of
            records. Field 2 of a record is read as the timestamp and
            field 4 as the raw location id.
        negative_sample: not used by this encoder.

    Returns:
        list: one entry per trajectory, laid out as
        ``[input_loc_ids, input_time_codes, target_loc_id,
        target_time_code, uid, distances]`` where ``distances`` is the
        result of ``euclidean_dist`` on the coordinate differences
        between the target point and every input point.
    """
    # Re-encode the user id directly from the running counter.
    uid = self.uid
    self.uid += 1
    encoded_trajectories = []
    for traj in trajectories:
        raw_locs, loc_ids, time_codes = [], [], []
        lats, lons = [], []
        # The base for every time offset is derived from the first record
        # (original note: midnight of that day).
        first_time = parse_time(traj[0][2])
        base_time = cal_basetime(first_time, True)
        for record in traj:
            raw_loc = record[4]
            record_time = parse_time(record[2])
            if raw_loc not in self.location2id:
                # First sighting of this location: hand out the next id.
                self.location2id[raw_loc] = self.loc_id
                self.loc_id += 1
            raw_locs.append(raw_loc)
            loc_ids.append(self.location2id[raw_loc])
            # geo_coord[loc] is read as (index 0 -> lati, index 1 -> longi).
            lats.append(self.geo_coord[raw_loc][0])
            lons.append(self.geo_coord[raw_loc][1])
            offset = int(cal_timeoff(record_time, base_time))
            if offset > self.tim_max:
                self.tim_max = offset
            time_codes.append(offset)
        # Encoding done; split off the last point as the target.
        target, target_tim = loc_ids[-1], time_codes[-1]
        input_locs, input_tims = loc_ids[:-1], time_codes[:-1]
        target_coord = self.geo_coord[raw_locs[-1]]
        n_inputs = len(input_locs)
        # Repeat the target coordinate so it lines up with every input point.
        target_lat = np.array([target_coord[0]] * n_inputs)
        target_lon = np.array([target_coord[1]] * n_inputs)
        distances = euclidean_dist(target_lat - lats[:-1],
                                   target_lon - lons[:-1])
        encoded_trajectories.append(
            [input_locs, input_tims, target, target_tim, uid, distances])
    return encoded_trajectories
Example #2
0
 def _cal_mat1(self, current_tim):
     """Build the pairwise temporal-relation matrix of one sequence.

     Args:
         current_tim: indexable sequence of time values accepted by
             ``cal_timeoff``.

     Returns:
         numpy.ndarray: ``(self.max_len, self.max_len)`` array whose
         top-left ``len(current_tim)`` square holds
         ``abs(cal_timeoff(t_i, t_j))``; every other cell stays zero.

     Side effect: ``self.ex[3]`` is raised to the largest offset seen.
     """
     relation = np.zeros((self.max_len, self.max_len))
     n_steps = len(current_tim)
     for row in range(n_steps):
         t_row = current_tim[row]
         for col in range(n_steps):
             gap = abs(cal_timeoff(t_row, current_tim[col]))
             relation[row, col] = gap
             # Keep the running maximum of all observed time gaps.
             if gap > self.ex[3]:
                 self.ex[3] = gap
     return relation
 def cutter_filter(self):
     """Cut every user's check-ins into sessions and filter sparse data.

     The ``<dataset>.dyna`` file is read, locations with fewer than
     ``min_checkins`` records are dropped, and each remaining user's
     records are split into sessions according to ``cut_method``:

     * ``'time_interval'``: a record stays in the current session while its
       offset from the previous record (``cal_timeoff``) is in
       ``[0, window_size)`` and the session is shorter than
       ``max_session_len``.
     * ``'same_date'``: a record stays in the current session while it
       falls on the same calendar date as the previous record and the
       session is shorter than ``max_session_len``.
     * anything else: fixed-length windows of ``window_size`` records.

     Sessions shorter than ``min_session_len`` and users with fewer than
     ``min_sessions`` sessions are discarded. Records are processed in
     file order (no sorting is done here), and column 2 of each record is
     read as the timestamp.

     Returns:
         dict: cut trajectories in the form
             {
                 str(uid): [
                     [checkin_record, checkin_record, ...],
                     [checkin_record, checkin_record, ...],
                     ...
                 ],
                 ...
             }

     Raises:
         ValueError: in fixed-length mode when ``max_session_len`` differs
             from ``window_size``.
     """
     # load data according to config
     traj = pd.read_csv(os.path.join(
         self.data_path, '{}.dyna'.format(self.config['dataset'])))
     # filter inactive poi
     group_location = traj.groupby('location').count()
     filter_location = group_location[group_location['time'] >= self.config['min_checkins']]
     location_index = filter_location.index.tolist()
     traj = traj[traj['location'].isin(location_index)]
     user_set = pd.unique(traj['entity_id'])
     res = {}
     min_session_len = self.config['min_session_len']
     max_session_len = self.config['max_session_len']
     min_sessions = self.config['min_sessions']
     window_size = self.config['window_size']
     cut_method = self.config['cut_method']
     if cut_method == 'time_interval':
         # cut by the time gap between consecutive records
         for uid in tqdm(user_set, desc="cut and filter trajectory"):
             usr_traj = traj[traj['entity_id'] == uid].to_numpy()
             sessions = []  # all sessions of this user
             session = []  # the session currently being built
             for index, row in enumerate(usr_traj):
                 now_time = parse_time(row[2])
                 if index > 0:
                     time_off = cal_timeoff(now_time, prev_time)
                     if not (time_off < window_size and time_off >= 0
                             and len(session) < max_session_len):
                         # The record opens a new session; keep the finished
                         # one only if it is long enough.
                         if len(session) >= min_session_len:
                             sessions.append(session)
                         session = []
                 session.append(row.tolist())
                 prev_time = now_time
             if len(session) >= min_session_len:
                 sessions.append(session)
             if len(sessions) >= min_sessions:
                 res[str(uid)] = sessions
     elif cut_method == 'same_date':
         # put the check-ins of one calendar day into one session
         for uid in tqdm(user_set, desc="cut and filter trajectory"):
             usr_traj = traj[traj['entity_id'] == uid].to_numpy()
             sessions = []  # all sessions of this user
             session = []  # the session currently being built
             prev_date = None
             for index, row in enumerate(usr_traj):
                 now_time = parse_time(row[2])
                 # Compare the full calendar date. The day of month alone
                 # would merge e.g. 01-05 with 02-05 when they are adjacent.
                 now_date = (now_time.year, now_time.month, now_time.day)
                 if index > 0 and not (prev_date == now_date
                                       and len(session) < max_session_len):
                     if len(session) >= min_session_len:
                         sessions.append(session)
                     session = []
                 session.append(row.tolist())
                 prev_date = now_date
             if len(session) >= min_session_len:
                 sessions.append(session)
             if len(sessions) >= min_sessions:
                 res[str(uid)] = sessions
     else:
         # cut by fix window_len used by STAN
         if max_session_len != window_size:
             raise ValueError('the fixed length window is not equal to max_session_len')
         for uid in tqdm(user_set, desc="cut and filter trajectory"):
             usr_traj = traj[traj['entity_id'] == uid].to_numpy()
             sessions = []  # all sessions of this user
             session = []  # the session currently being built
             for row in usr_traj:
                 if len(session) >= window_size:
                     # The window is full: store it and start the next one.
                     sessions.append(session)
                     session = []
                 session.append(row.tolist())
             if len(session) >= min_session_len:
                 sessions.append(session)
             if len(sessions) >= min_sessions:
                 res[str(uid)] = sessions
     return res
Example #4
0
    def _load_dyna(self):
        """Load ``<dataset>.dyna`` and group its records into trajectories.

        The column-name -> position map of the file is stored in
        ``self.dyna_feature_column``.

        When ``self.need_cut`` is true, each user's records are sorted by
        ``time`` and split into sessions according to ``cut_method``:

        * ``'time_interval'``: a record stays in the current session while
          its offset from the previous record (``cal_timeoff``) is in
          ``[0, window_size)`` and the session is shorter than
          ``max_session_len``.
        * ``'same_date'``: a record stays in the current session while it
          falls on the same calendar date as the previous record and the
          session is shorter than ``max_session_len``.
        * anything else: fixed-length windows of ``window_size`` records.

        Every record gets a ``traj_id`` field holding the index of the
        session it belongs to. Sessions shorter than ``min_session_len``
        and users with fewer than ``min_sessions`` sessions are dropped.

        When ``self.need_cut`` is false, records are grouped by the
        ``(entity_id, traj_id)`` pairs already present in the file, each
        group sorted by ``time``.

        Returns:
            dict: trajectories in the form
                {
                    uid: [
                        [dyna_record, dyna_record, ...],
                        [dyna_record, dyna_record, ...],
                        ...
                    ],
                    ...
                }
            Keys are ``str(uid)`` when cutting and the raw ``entity_id``
            values otherwise.

        Raises:
            ValueError: in fixed-length mode when ``max_session_len``
                differs from ``window_size``.
        """
        # load data according to config
        dyna_file = pd.read_csv(os.path.join(
            self.data_path, '{}.dyna'.format(self.config['dataset'])))
        self._logger.info("Loaded file " + self.config['dataset'] + '.dyna, shape=' + str(dyna_file.shape))
        self.dyna_feature_column = {col: i for i, col in enumerate(dyna_file)}
        res = dict()
        if self.need_cut:
            user_set = pd.unique(dyna_file['entity_id'])
            min_session_len = self.config['min_session_len']
            max_session_len = self.config['max_session_len']
            min_sessions = self.config['min_sessions']
            window_size = self.config['window_size']
            cut_method = self.config['cut_method']
            if cut_method == 'time_interval':
                # cut by the time gap between consecutive records
                for uid in tqdm(user_set, desc="cut and filter trajectory"):
                    usr_traj = dyna_file[dyna_file['entity_id'] == uid]
                    usr_traj = usr_traj.sort_values(by='time')
                    usr_traj = usr_traj.reset_index(drop=True)
                    sessions = []  # all sessions of this user
                    traj_id = 0
                    session = []  # the session currently being built
                    for index, row in usr_traj.iterrows():
                        now_time = parse_time(row['time'])
                        if index > 0:
                            time_off = cal_timeoff(now_time, prev_time)
                            if not (time_off < window_size and time_off >= 0
                                    and len(session) < max_session_len):
                                # The record opens a new session; keep the
                                # finished one only if it is long enough.
                                if len(session) >= min_session_len:
                                    sessions.append(session)
                                    traj_id += 1
                                session = []
                        # Tag only after the boundary is resolved so the first
                        # record of a session carries that session's id.
                        row['traj_id'] = traj_id
                        session.append(row.tolist())
                        prev_time = now_time
                    if len(session) >= min_session_len:
                        sessions.append(session)
                        traj_id += 1
                    if len(sessions) >= min_sessions:
                        res[str(uid)] = sessions
            elif cut_method == 'same_date':
                # put the check-ins of one calendar day into one session
                for uid in tqdm(user_set, desc="cut and filter trajectory"):
                    usr_traj = dyna_file[dyna_file['entity_id'] == uid]
                    usr_traj = usr_traj.sort_values(by='time')
                    usr_traj = usr_traj.reset_index(drop=True)
                    sessions = []  # all sessions of this user
                    traj_id = 0
                    session = []  # the session currently being built
                    prev_date = None
                    for index, row in usr_traj.iterrows():
                        now_time = parse_time(row['time'])
                        # Compare the full calendar date. The day of month
                        # alone would merge e.g. 01-05 with 02-05.
                        now_date = (now_time.year, now_time.month, now_time.day)
                        if index > 0 and not (prev_date == now_date
                                              and len(session) < max_session_len):
                            if len(session) >= min_session_len:
                                sessions.append(session)
                                traj_id += 1
                            session = []
                        row['traj_id'] = traj_id
                        session.append(row.tolist())
                        prev_date = now_date
                    if len(session) >= min_session_len:
                        sessions.append(session)
                        traj_id += 1
                    if len(sessions) >= min_sessions:
                        res[str(uid)] = sessions
            else:
                # cut by fix window_len used by STAN
                if max_session_len != window_size:
                    raise ValueError('the fixed length window is not equal to max_session_len')
                for uid in tqdm(user_set, desc="cut and filter trajectory"):
                    usr_traj = dyna_file[dyna_file['entity_id'] == uid]
                    usr_traj = usr_traj.sort_values(by='time')
                    usr_traj = usr_traj.reset_index(drop=True)
                    sessions = []  # all sessions of this user
                    traj_id = 0
                    session = []  # the session currently being built
                    for _, row in usr_traj.iterrows():
                        if len(session) >= window_size:
                            # The window is full: store it, start the next.
                            sessions.append(session)
                            traj_id += 1
                            session = []
                        row['traj_id'] = traj_id
                        session.append(row.tolist())
                    if len(session) >= min_session_len:
                        sessions.append(session)
                        traj_id += 1
                    if len(sessions) >= min_sessions:
                        res[str(uid)] = sessions
        else:
            # Trajectories are already labelled in the file: collect each
            # (entity_id, traj_id) pair once, in order of first appearance.
            id_set = set()
            for dyna in dyna_file.itertuples():
                entity_id = getattr(dyna, "entity_id")
                traj_id = getattr(dyna, "traj_id")
                if (entity_id, traj_id) in id_set:
                    continue
                id_set.add((entity_id, traj_id))

                rows = dyna_file[(dyna_file['entity_id'] == entity_id) & (dyna_file['traj_id'] == traj_id)]
                rows = rows.sort_values(by='time')
                traj = [row.tolist() for _, row in rows.iterrows()]
                res.setdefault(entity_id, []).append(traj)
        return res
Example #5
0
    def cutter_filter(self):
        """Cut every user's check-ins into sessions and filter sparse data.

        The ``<dataset>.dyna`` file is read and locations that do not have
        strictly more than ``min_checkins`` records are dropped. Each
        remaining user's records are then split, in file order, either by
        a time window (``window_type == 'time_window'``) or into chunks of
        ``window_size`` records. Sessions shorter than ``min_session_len``
        and users with fewer than ``min_sessions`` sessions are discarded.
        For every kept user, field 4 of each kept record is counted in
        ``self.counter``.

        Returns:
            dict: cut trajectories in the form
                {
                    str(uid): [
                        [checkin_record, checkin_record, ...],
                        [checkin_record, checkin_record, ...],
                        ...
                    ],
                    ...
                }
        """
        def _keep_user(user_id, user_sessions):
            # Count field 4 of every kept record, then store the user.
            for one_session in user_sessions:
                for record in one_session:
                    self.counter.update({record[4]: 1})
            res[str(user_id)] = user_sessions

        # load data according to config
        dyna_path = os.path.join(self.data_path,
                                 '{}.dyna'.format(self.config['dataset']))
        traj = pd.read_csv(dyna_path)
        # filter inactive poi
        per_location = traj.groupby('location').count()
        active = per_location[per_location['time'] > self.config['min_checkins']]
        traj = traj[traj['location'].isin(active.index.tolist())]

        user_set = pd.unique(traj['entity_id'])
        res = {}
        min_session_len = self.config['min_session_len']
        min_sessions = self.config['min_sessions']
        window_size = self.config['window_size']
        window_type = self.config['window_type']
        if window_type == 'time_window':
            # cut by time window
            base_zero = window_size > 12
            for uid in tqdm(user_set, desc="cut and filter trajectory"):
                usr_traj = traj[traj['entity_id'] == uid]
                sessions = []  # all sessions of this user
                session = []  # the session currently being built
                # (original note: local time is used here)
                start_time = parse_time(usr_traj.iloc[0]['time'])
                base_time = cal_basetime(start_time, base_zero)
                for index, row in usr_traj.iterrows():
                    # NOTE(review): ``index`` is the row label kept from the
                    # full frame (no reset_index here), so this branch only
                    # runs for a row labelled 0, not for every user's first
                    # record. Confirm whether that is intended.
                    if index == 0:
                        assert start_time.hour - base_time.hour < window_size
                        session.append(row.tolist())
                        continue
                    now_time = parse_time(row['time'])
                    time_off = cal_timeoff(now_time, base_time)
                    if not (0 <= time_off < window_size):
                        # Outside the window: close the session and re-base
                        # the window on the current record.
                        if len(session) >= min_session_len:
                            sessions.append(session)
                        session = []
                        start_time = now_time
                        base_time = cal_basetime(start_time, base_zero)
                    session.append(row.tolist())
                if len(session) >= min_session_len:
                    sessions.append(session)
                if len(sessions) >= min_sessions:
                    _keep_user(uid, sessions)
        else:
            # cut by trajectory length
            for uid in tqdm(user_set, desc="cut and filter trajectory"):
                usr_traj = traj[traj['entity_id'] == uid]
                sessions = []  # all sessions of this user
                session = []  # the session currently being built
                for _, row in usr_traj.iterrows():
                    if len(session) >= window_size:
                        # The chunk is full: store it and start the next one.
                        sessions.append(session)
                        session = []
                    session.append(row.tolist())
                if len(session) >= min_session_len:
                    sessions.append(session)
                if len(sessions) >= min_sessions:
                    _keep_user(uid, sessions)
        return res