def get_trace_count_and_point(path1):
    """Tally trajectories and trajectory points over every user CSV in *path1*.

    A trajectory is one distinct value of the 'data_string' column; a
    trajectory point is one CSV row.  Prints and returns
    (trace_count, trace_point).
    """
    trace_count = 0
    trace_point = 0
    for filename in loc_record.curdir_file(path1):
        frame = pd.read_csv(open(path1 + os.path.sep + filename))
        trace_count += frame['data_string'].nunique()
        trace_point += len(frame)
    print("轨迹数:", trace_count, "轨迹点数:", trace_point)
    return trace_count, trace_point


#get_distance(34.2676434736,108.920156846,34.2683182934,108.967080573)

#提取北京地区的数据
#city_data(r'F:\GPS\User_Data_csv_format',r'F:\GPS\Pre_Data\Beijing_Data',39.26,41.6,115.25,117.4)
#预处理:去除速度大于300km/h的后继点
#preprocessing(r'F:\GPS\Pre_Data\Beijing_Data',r'F:\GPS\Pre_Data\Prepro_Data',300)
# 轨迹按时间和距离提取,10min,500m
#trajectory_extra(r'F:\GPS\Pre_Data\Prepro_Data',r'F:\GPS\Pre_Data\Extra_Data',10,500)

# 获取轨迹及轨迹点数目
#get_trace_count_and_point(r'F:\GPS\Pre_Data\Prepro_Data')
#get_trace_count_and_point(r'F:\GPS\Pre_Data\Extra_Data')
# Beispiel #2
# 0
def semantic_trasform(path_in,path_out):
    """Reverse-geocode every GPS point of every user CSV in *path_in*.

    For each (lat, lon) row, jsonFormat() is queried and its semantic fields
    (Baidu coordinates, city, street, POI region name, description) are
    appended as new columns before the file is saved to *path_out*.

    NOTE(review): 'trasform' (sic) is the public name and is kept unchanged.
    """

    start = time.time()
    user_list = loc_record.curdir_file(path_in)

    for user in user_list:

        # Read this user's raw GPS data.
        path_in1 = path_in + os.path.sep + user
        gps_data = pd.read_csv(open(path_in1))

        # Pre-allocate one single-column frame per semantic attribute;
        # rows whose lookup fails keep the 0.0 placeholder.
        a = np.zeros(len(gps_data))
        lat_baidu = pd.DataFrame(a, columns=['lat_baidu'])
        b = np.zeros(len(gps_data))
        lon_baidu = pd.DataFrame(b, columns=['lon_baidu'])
        city = pd.DataFrame(a, columns=['city'])
        street = pd.DataFrame(a, columns=['street'])
        poiReg_name = pd.DataFrame(a, columns=['poiReg_name'])
        sematic_descrip = pd.DataFrame(a, columns=['sematic_descrip'])

        # Query the semantic conversion service point by point.
        for i in range(0,len(gps_data)):

            lat = gps_data.iloc[i]['lat']
            lon = gps_data.iloc[i]['lon']

            sem_result = jsonFormat(lat,lon)
            if sem_result == {} :
                # Lookup failed: leave this row's placeholders untouched.
                continue
            else:
                print(sem_result)
                lat_baidu.iloc[i] = sem_result['lat_baidu']
                lon_baidu.iloc[i] = sem_result['lon_baidu']
                city.iloc[i] = sem_result['city']
                street.iloc[i] = sem_result['street']
                poiReg_name.iloc[i] = sem_result['poiReg_name']
                sematic_descrip.iloc[i] = sem_result['sematic_descrip']

        # Append the semantic columns to the original data.
        gps_data['lat_baidu'] = lat_baidu
        gps_data['lon_baidu'] = lon_baidu
        gps_data['city'] = city
        gps_data['street'] = street
        gps_data['poiReg_name'] = poiReg_name
        gps_data['sematic_descrip'] = sematic_descrip
        #print(gps_data)

        if not os.path.exists(path_out):  # create the output folder on demand
            os.makedirs(path_out)
        new_path = path_out + os.path.sep + user
        gps_data.to_csv(new_path, index=False, encoding='gbk')
        print(user + ' Semantic_Data done')

        # Elapsed time since the run started, printed after each user.
        end = time.time()
        print(end - start)

#地点语义化
#semantic_trasform('F:\GPS\Feature_Data\Effica_again_Data_v5',r'F:\GPS\Feature_Data\Semantic_Data_v5')
#jsonFormat(39.90613,116.375697)
def city_data(path1, path2, lat_range1, lat_range2, lon_range1, lon_range2):
    """Filter every user CSV in *path1* to the points strictly inside the
    bounding box (lat_range1, lat_range2) x (lon_range1, lon_range2) and
    write each filtered file to *path2* under the same name.

    (Removed an unreachable leftover string-literal block that held dead
    csv-writer code.)
    """
    file_list = loc_record.curdir_file(path1)
    for i in file_list:
        path = path1 + os.path.sep + i
        data = pd.read_csv(open(path))
        # Keep only points strictly inside the bounding box.
        new_data = data[(data['lat'] > lat_range1) & (data['lat'] < lat_range2)
                        & (data['lon'] > lon_range1) &
                        (data['lon'] < lon_range2)]
        print(data['data_string'].dtype)

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + i
        new_data.to_csv(new_path, index=False, encoding='gbk')
        print(i + 'Beijing_Data save csv done')
def add_degree_diff(path1,path2):
    """Insert a 'degree_diff' column (change in bearing between consecutive
    same-day points) as column 5 of every user CSV in *path1*, saving the
    result to *path2*.

    Rows 0 and 1, and the first row of each new day, get NaN.
    NOTE(review): relies on a bare `nan` name — presumably from a star
    import of numpy; confirm at the file header.
    """
    user_list = loc_record.curdir_file(path1)
    for i in user_list:
        path = path1 + os.path.sep + i
        # data1 keeps the raw text columns for output; data2 gets parsed
        # datetime columns for the day-boundary test.
        data1 = pd.read_csv(open(path))
        data2 = pd.read_csv(open(path))
        data2['date_string'] = pd.to_datetime(data2['date_string'], format='%Y-%m-%d')
        data2['time_string'] = pd.to_datetime(data2['time_string'], format='%H:%M:%S')
        #data2 = pd.DataFrame(data1,columns=['lat', 'lon','alt','date_string','time_string'])
        #data2 = data1.copy()
        a = np.zeros(len(data2))
        degree_diff_col = pd.DataFrame(a,columns=['degree_diff'])
        # The first two rows cannot have a bearing difference yet.
        degree_diff_col.iloc[0] = nan
        degree_diff_col.iloc[1] = nan

        for j in range(1,len(data2) - 1):
            day_interval = trajectory_extraction.get_day_interval(data2.iloc[j]['date_string'], data2.iloc[j+1]['date_string'])
            if day_interval == 0:
                # Same day: difference between consecutive bearings.
                degree_diff = get_degree_diff(data2.iloc[j]['degree'],data2.iloc[j+1]['degree'])
            #print(degree)
                degree_diff_col.iloc[j+1] = degree_diff
            #data2.iloc[j + 1,'degree'] = degree
            #print(degree_col)
            else:
                # Crossing a day boundary: no meaningful difference.
                degree_diff_col.iloc[j + 1] = nan
        data1.insert(5,'degree_diff',degree_diff_col)
        #print(data2)

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + i
        data1.to_csv(new_path, index=False, encoding='gbk')
        print(i + ' Degree_diff_Data done')
def screen_user(path1,path2,traje_range,date_range):
    """Keep only sufficiently active users.

    A day (distinct 'date_string') is valid when it holds more than
    *traje_range* points.  Users with more than *date_range* valid days are
    written to *path2* with their sparse days' rows removed; all other users
    are skipped entirely.

    Perf: counts points per day with one value_counts() pass instead of
    calling list(...).count(day) for every distinct day (was O(n^2)).
    """
    user_list = loc_record.curdir_file(path1)
    for i in user_list:
        path = path1 + os.path.sep + i
        data1 = pd.read_csv(open(path))

        # One O(n) pass: points per distinct day.
        day_counts = data1['date_string'].value_counts()
        total_count = data1['date_string'].nunique()
        # Days with too few points; their rows are dropped below.
        date_value = [day for day, cnt in day_counts.items()
                      if cnt <= traje_range]

        valid_date = total_count - len(date_value)
        if valid_date <= date_range:
            # Too few valid days: skip this user completely.
            continue
        new_data = data1[~data1['date_string'].isin(date_value)]

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + i
        new_data.to_csv(new_path, index=False, encoding='gbk')
        print(i + ' Effica_Data done')  # valid data saved
def add_degree_feature(path1, path2):
    """Discretise the 'degree' bearing of every user CSV in *path1* into a
    'degree_fea' bucket and save the result to *path2*.

    Bucket 0 for a 0-degree bearing; bucket 1 for (0, 22.5) or above 337.5;
    otherwise ceil((degree - 22.5) / 45).
    """
    for user in loc_record.curdir_file(path1):
        src = path1 + os.path.sep + user
        data1 = pd.read_csv(open(src))
        data2 = pd.read_csv(open(src))

        degree_col = pd.DataFrame(np.zeros(len(data2)), columns=['degree_fea'])

        for idx in range(len(data2)):
            bearing = data2.iloc[idx]['degree']
            if bearing == 0:
                bucket = 0
            elif 0 < bearing < 22.5 or bearing > 337.5:
                bucket = 1
            else:
                bucket = math.ceil((bearing - 22.5) / 45)
            degree_col.iloc[idx] = bucket

        data1['degree_fea'] = degree_col

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        data1.to_csv(path2 + os.path.sep + user, index=False, encoding='gbk')
        print(user + ' Degree_fea done')
def add_time_interval(path1, path2):
    """Insert a stay-duration column ('time_diff', computed from each row's
    'arrive_time' and 'leave_time') as column 5 of every user CSV in
    *path1*, saving the result to *path2*.

    NOTE(review): after pd.to_datetime the test `!= 'None'` below compares a
    Timestamp to the string 'None' and so looks always true — confirm what
    raw values 'leave_time' can contain before relying on the else branch.
    NOTE(review): the loop stops at len-2, so the last row keeps its 0.0
    placeholder.
    """

    user_list = loc_record.curdir_file(path1)
    for user in user_list:
        path = path1 + os.path.sep + user
        # data1 keeps raw columns for output; data2 gets parsed datetimes.
        data1 = pd.read_csv(open(path))
        data2 = pd.read_csv(open(path))
        data2['date_string'] = pd.to_datetime(data2['date_string'], format='%Y-%m-%d')
        data2['arrive_time'] = pd.to_datetime(data2['arrive_time'], format='%H:%M:%S')
        data2['leave_time'] = pd.to_datetime(data2['leave_time'], format='%H:%M:%S')

        a = np.zeros(len(data2))
        time_interval_col = pd.DataFrame(a, columns=['time_interval'])


        for j in range(0, len(data2) - 1):

            if data2.iloc[j]['leave_time'] != 'None':
                time_interval = trajectory_extraction.get_time_interval(data2.iloc[j]['arrive_time'], data2.iloc[j]['leave_time'])
                time_interval_col.iloc[j] = time_interval

            else:
                time_interval_col.iloc[j] = nan

        # Note: inserted under the name 'time_diff', not 'time_interval'.
        data1.insert(5, 'time_diff', time_interval_col)


        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + user
        data1.to_csv(new_path, index=False, encoding='gbk')
        print(user + ' Time_interval_Data done')
def add_degree(path1,path2):
    """Insert a 'degree' column (bearing from each GPS point to the next
    point of the same day) as column 6 of every user CSV in *path1*, saving
    to *path2*.  Row 0 and the first row after a day boundary get 0.
    """
    for fname in loc_record.curdir_file(path1):
        src = path1 + os.path.sep + fname
        # Raw frame for output; parsed frame for the day-boundary test.
        data1 = pd.read_csv(open(src))
        data2 = pd.read_csv(open(src))
        data2['date_string'] = pd.to_datetime(data2['date_string'], format='%Y-%m-%d')

        degree_col = pd.DataFrame(np.zeros(len(data2)), columns=['degree'])
        degree_col.iloc[0] = 0

        for row in range(len(data2) - 1):
            gap_days = trajectory_extraction.get_day_interval(
                data2.iloc[row]['date_string'], data2.iloc[row + 1]['date_string'])
            if gap_days == 0:
                degree_col.iloc[row + 1] = get_degree(
                    data2.iloc[row]['lat'], data2.iloc[row]['lon'],
                    data2.iloc[row + 1]['lat'], data2.iloc[row + 1]['lon'])
            else:
                # New day: restart with a zero bearing.
                degree_col.iloc[row + 1] = 0
        data1.insert(6,'degree',degree_col)

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        data1.to_csv(path2 + os.path.sep + fname, index=False, encoding='gbk')
        print(fname + ' Degree_Data done')
def get_trace_count_and_cluster(path1,id):
    """Tally trajectories and stay points over every user CSV in *path1*.

    trace_count   - distinct 'date_string' values (trajectories).
    trace_cluster - distinct values of column *id* (stay points).
    Prints and returns (trace_count, trace_cluster).
    """
    user_list = loc_record.curdir_file(path1)
    trace_count = 0    # number of trajectories
    trace_cluster = 0  # number of stay points
    for i in user_list:
        path = path1 + os.path.sep + i
        data = pd.read_csv(open(path))
        trace_count = trace_count + data['date_string'].nunique()
        # BUG FIX: previously accumulated into trace_count instead of
        # trace_cluster, double-counting trajectories and discarding every
        # prior file's cluster count.
        trace_cluster = trace_cluster + data[id].nunique()
    print("轨迹数:",trace_count,"驻足点数:",trace_cluster)
    return trace_count,trace_cluster
def get_trace_count_and_point(path1):
    """Tally, over every user CSV in *path1*, the trajectories (distinct
    'date_string' values) and trajectory points (rows).

    Prints and returns (trace_count, trace_point).
    """
    n_traces = 0
    n_points = 0
    for filename in loc_record.curdir_file(path1):
        frame = pd.read_csv(open(path1 + os.path.sep + filename))
        n_traces += frame['date_string'].nunique()
        n_points += len(frame)
    print("轨迹数:",n_traces,"轨迹点数:",n_points)
    return n_traces, n_points
def trajectory_extra(path1, path2, time_interval_range,
                     distance_interval_range):
    """Down-sample each user CSV in *path1*: starting from the first point,
    keep the next same-day point that is at least *time_interval_range*
    minutes OR at least *distance_interval_range* away (unit is whatever
    get_distance returns — presumably metres, TODO confirm) from the last
    kept point, then continue from it.  A day boundary re-anchors the scan
    without keeping the point.  Results are written to *path2*.
    """
    user_list = loc_record.curdir_file(path1)
    for i in user_list:
        path = path1 + os.path.sep + i
        # data1 keeps the raw text columns for output; data2 gets parsed
        # datetimes for the interval computations.
        data1 = pd.read_csv(open(path))
        data2 = pd.read_csv(open(path))
        data2['data_string'] = pd.to_datetime(data2['data_string'],
                                              format='%Y-%m-%d')
        data2['time_string'] = pd.to_datetime(data2['time_string'],
                                              format='%H:%M:%S')

        # Indices of kept points; the first point is always kept.
        new_index = [0]
        begin = 0  # index of the last kept (anchor) point
        j = 1      # candidate's offset from the anchor
        while begin + j <= (len(data2) - 1):
            day_interval = get_day_interval(
                data2.iloc[begin]['data_string'],
                data2.iloc[begin + j]['data_string'])
            if day_interval == 0:
                time_interval = get_time_interval(
                    data2.iloc[begin]['time_string'],
                    data2.iloc[begin + j]['time_string'])
                distance_interval = get_distance(data2.iloc[begin]['lat'],
                                                 data2.iloc[begin]['lon'],
                                                 data2.iloc[begin + j]['lat'],
                                                 data2.iloc[begin + j]['lon'])
                # Far enough in time (minutes -> seconds): keep, re-anchor.
                # j=0 here becomes j=1 after the j+1 at the loop bottom.
                if time_interval >= (time_interval_range * 60):
                    new_index.append(begin + j)
                    begin = begin + j
                    j = 0
                # Far enough in space: keep and re-anchor, unless the time
                # test above already kept this very point (then begin+j is
                # the new anchor, already in new_index).
                if distance_interval >= distance_interval_range:
                    if (begin + j) not in new_index:
                        new_index.append(begin + j)
                        begin = begin + j
                        j = 0
            # Candidate lies on another day: re-anchor without keeping it.
            if day_interval != 0:
                begin = begin + j
                j = 0
            j = j + 1
        if len(new_index) != 1:
            new_data = data1.iloc[new_index, :]
        # Only the always-kept first point survived: write an empty frame.
        if len(new_index) == 1:
            new_data = pd.DataFrame(columns=[
                'lat', 'lon', '0', 'alt', 'date', 'data_string', 'time_string'
            ])
        #print(new_index)

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + i
        new_data.to_csv(new_path, index=False, encoding='gbk')
        print(i + ' Extra_Data done')
def get_clusters(path1, path2):
    """Collapse each important-point cluster ('impoint_id') of every user
    CSV in *path1* into one summary row: mean lat/lon, latest date,
    arrive/leave time (min/max 'time_string') and arrive/leave index
    (min/max 'index_id').  One output CSV per user is written to *path2*.

    Perf/compat: rows are collected in a list and the frame is built once —
    DataFrame.append in a loop was O(n^2) and is removed in pandas >= 2.0.
    Also dropped an unreachable leftover string-literal block and two unused
    re-reads of the input file.
    """
    user_list = loc_record.curdir_file(path1)

    for user in user_list:

        path = path1 + os.path.sep + user
        data1 = pd.read_csv(open(path))

        columns = ['impoint_id', 'lat', 'lon', 'date_string', 'arrive_time',
                   'leave_time', 'arrive_index', 'leave_index']

        # One summary dict per cluster, in first-appearance order.
        rows = []
        for i in data1['impoint_id'].unique():
            cluster = data1[data1['impoint_id'] == i]
            rows.append({'impoint_id': i,
                         'lat': cluster['lat'].mean(),
                         'lon': cluster['lon'].mean(),
                         'date_string': cluster['date_string'].max(),
                         'arrive_time': cluster['time_string'].min(),
                         'leave_time': cluster['time_string'].max(),
                         'arrive_index': cluster['index_id'].min(),
                         'leave_index': cluster['index_id'].max()})
        new_clusters_data = pd.DataFrame(rows, columns=columns)

        # Write the per-user summary CSV.
        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + user
        new_clusters_data.to_csv(new_path, index=False, encoding='gbk')
        print(user + ' Clusters_Data done')
def integrate_poi(path1, path2):
    """Build a 'poi' column for every user CSV in *path1*: rows whose 'city'
    is not 北京市 are dropped; for the rest, the POI is the region name
    ('poiReg_name') when present, otherwise the street name.  Results go to
    *path2*.
    """
    for user in loc_record.curdir_file(path1):
        data = pd.read_csv(open(path1 + os.path.sep + user))

        poi_col = pd.DataFrame(np.zeros(len(data)), columns=['poi'])

        # Indices of points outside Beijing; dropped after labelling.
        not_city_index = [k for k in range(len(data))
                          if data.iloc[k]['city'] != '北京市']
        outside = set(not_city_index)

        for j in range(len(data)):
            if j in outside:
                continue
            poiReg_name = data.iloc[j]['poiReg_name']
            # No region name: fall back to the street.
            if poiReg_name is np.nan:
                poi_col.iloc[j] = data.iloc[j]['street']
            else:
                poi_col.iloc[j] = poiReg_name

        data['poi'] = poi_col
        new_data = data.drop(not_city_index)

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        new_data.to_csv(path2 + os.path.sep + user, index=False, encoding='gbk')
        print(user + ' poi_Data done')
def preprocessing(path1, path2, speed_range):
    """Remove implausible GPS points: for every CSV in *path1*, drop any
    point whose speed to the next same-day point exceeds *speed_range*
    (km/h) and write the cleaned file to *path2*.

    (Removed an unreachable leftover string-literal block holding dead
    csv-writer code.)
    """
    file_list = loc_record.curdir_file(path1)
    for i in file_list:
        path = path1 + os.path.sep + i
        # data1 keeps the raw text columns for output; data2 gets parsed
        # datetimes for the speed computation.
        data1 = pd.read_csv(open(path))
        data2 = pd.read_csv(open(path))
        data2['data_string'] = pd.to_datetime(data2['data_string'],
                                              format='%Y-%m-%d')
        data2['time_string'] = pd.to_datetime(data2['time_string'],
                                              format='%H:%M:%S')

        new_index = []  # row indices to drop
        for j in range(0, len(data2) - 1):
            day_interval = get_day_interval(data2.iloc[j]['data_string'],
                                            data2.iloc[j + 1]['data_string'])
            if day_interval == 0:
                speed = get_speed(data2.iloc[j]['lat'], data2.iloc[j]['lon'],
                                  data2.iloc[j + 1]['lat'],
                                  data2.iloc[j + 1]['lon'],
                                  data2.iloc[j]['time_string'],
                                  data2.iloc[j + 1]['time_string'])
                if speed == 0:
                    # Zero usually means a degenerate pair; keep the point.
                    continue
                # Threshold converted from km/h — presumably get_speed
                # returns m/s; confirm against its definition.
                if speed > ((speed_range * 1000) / (1 * 60 * 60)):
                    new_index.append(j)
        new_data = data1.drop(new_index)

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + i
        new_data.to_csv(new_path, index=False, encoding='gbk')
        print(i + ' Prepro_Data save csv done')
def add_time_feature(path1, path2,time_rage):
    """Discretise arrive/leave times of every user CSV in *path1* into
    *time_rage*-minute slots (ceiling of seconds-since-midnight / slot) and
    append them as 'arrive_time_fea' / 'leave_time_fea', saving to *path2*.
    """
    for user in loc_record.curdir_file(path1):
        src = path1 + os.path.sep + user
        # Raw frame for output; parsed frame for the time arithmetic.
        data1 = pd.read_csv(open(src))
        data2 = pd.read_csv(open(src))
        data2['date_string'] = pd.to_datetime(data2['date_string'], format='%Y-%m-%d')
        data2['arrive_time'] = pd.to_datetime(data2['arrive_time'], format='%H:%M:%S')
        data2['leave_time'] = pd.to_datetime(data2['leave_time'], format='%H:%M:%S')

        ari_time_col = pd.DataFrame(np.zeros(len(data2)), columns=['arrive_time_fea'])
        lea_time_col = pd.DataFrame(np.zeros(len(data2)), columns=['leave_time_fea'])

        midnight = pd.to_datetime('0:0:0', format='%H:%M:%S')
        slot_seconds = time_rage * 60

        for row in range(len(data2)):
            # Seconds since midnight, then ceiling-divide into slots.
            arrive_secs = (data2.iloc[row]['arrive_time'] - midnight).seconds
            leave_secs = (data2.iloc[row]['leave_time'] - midnight).seconds
            ari_time_col.iloc[row] = math.ceil(arrive_secs / slot_seconds)
            lea_time_col.iloc[row] = math.ceil(leave_secs / slot_seconds)

        data1['arrive_time_fea'] = ari_time_col
        data1['leave_time_fea'] = lea_time_col

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        data1.to_csv(path2 + os.path.sep + user, index=False, encoding='gbk')
        print(user + ' Time_fea_Data done')
# Beispiel #16
# 0
    # ensure all data is float
    values = values.astype('float32')
    # normalize features
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)
    # frame as supervised learning
    reframed = series_to_supervised(scaled, 1, 1)
    # drop columns we don't want to predict
    reframed.drop(reframed.columns[[9,10,11,12,13,14,15]], axis=1, inplace=True)
    print(reframed.head())
    '''
start_traindata = time.time()
#构造训练测试集
#path1 = r'F:\GPS\Feature_Data\Effica_Feature_Data'
path1 = r'F:\GPS\Feature_Data\Effica_Feature_Data'
user_list = loc_record.curdir_file(path1)
seq_length = 4
total_x = []
train_x = []
train_y = []
test_x = []
test_y = []

for user in user_list:
    path = path1 + os.path.sep + user
    data = pd.read_csv(open(path))
    values = data.values
    day_interval_index = []

    for i in range(0, len(data)-1):
        if data.iloc[i]['date_string'] != data.iloc[i + 1]['date_string']:
def feature_represent(path1, path2,poi_model_path):
    """Turn each user's cleaned POI trajectory into numeric feature rows.

    For every CSV in *path1*: look up the Word2Vec vector of each row's
    'poi' (and of the next row's poi when it belongs to the same day,
    otherwise the row's own vector again), drop rows whose poi is missing
    from the model vocabulary, min-max scale the time/degree features into
    [-1, 1], and save [row data | poi(t) vector | scaled features |
    poi(t+1) vector] to *path2*.
    """
    user_list = loc_record.curdir_file(path1)

    for user in user_list:
        path = path1 + os.path.sep + user
        data = pd.read_csv(open(path))

        poi_col = []       # poi vector at time t, one entry per kept row
        poi_next_col = []  # poi vector at t+1 (or t at day/file ends)
        not_poi = []       # indices of out-of-vocabulary rows -> dropped

        model = Word2Vec.load(poi_model_path)

        # Collect the word vectors for each poi.
        for i in range(0, len(data)):
            # Not the last row.
            if i != len(data) - 1:
                # Same day as the next row: use this poi and the next poi.
                if data.iloc[i]['date_string'] == data.iloc[i + 1]['date_string']:
                    if (data.iloc[i]['poi'] in model.wv.vocab) and (data.iloc[i + 1]['poi'] in model.wv.vocab):
                        poi_col.append(model[data.iloc[i]['poi']])
                        poi_next_col.append(model[data.iloc[i + 1]['poi']])
                    # Either poi missing from the vocabulary: drop the row.
                    else:
                        not_poi.append(i)
                # Day boundary: reuse this row's own vector as poi(t+1).
                else:
                    if data.iloc[i]['poi'] in model.wv.vocab:
                        poi_fea = model[data.iloc[i]['poi']]
                        poi_col.append(poi_fea)
                        poi_next_col.append(poi_fea)
                    else:
                        not_poi.append(i)
            # Last row of the file: same fallback as a day boundary.
            else:
                if data.iloc[i]['poi'] in model.wv.vocab:
                    poi_fea = model[data.iloc[i]['poi']]
                    poi_col.append(poi_fea)
                    poi_next_col.append(poi_fea)
                else:
                    not_poi.append(i)

        # Drop rows without usable poi vectors and re-index the remainder.
        data_drop = data.drop(not_poi)
        new_data_drop = pd.DataFrame(
            columns=['lat', 'lon', 'date_string', 'arrive_time', 'leave_time',
                    'time_diff', 'degree', 'lat_baidu', 'lon_baidu', 'city', 'street',
                    'poiReg_name', 'sematic_descrip', 'arrive_time_fea', 'leave_time_fea',
                    'degree_fea', 'poi'])
        new_data_drop = new_data_drop.append(data_drop, ignore_index=True)
        poi_fea_data = pd.DataFrame(poi_col)

        # Valid rows + the poi(t) feature vectors.
        new_data1 = pd.concat([new_data_drop, poi_fea_data], axis=1)

        # Scalar features to normalise.  BUG FIX: MinMaxScaler requires a
        # 2-D array; the former 1-D .values raised ValueError on any
        # sklearn >= 0.19.
        arrive_time_values = new_data_drop['arrive_time_fea'].values.reshape(-1, 1)
        leave_time_values = new_data_drop['leave_time_fea'].values.reshape(-1, 1)
        time_diff_values = new_data_drop['time_diff'].values.reshape(-1, 1)
        degree_fea_values = new_data_drop['degree_fea'].values.reshape(-1, 1)

        # Normalise each feature independently into [-1, 1].
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaled_arrive_time = scaler.fit_transform(arrive_time_values)
        scaled_leave_time = scaler.fit_transform(leave_time_values)
        scaled_time_diff = scaler.fit_transform(time_diff_values)
        scaled_degree_fea = scaler.fit_transform(degree_fea_values)

        arrive_time_col = pd.DataFrame(scaled_arrive_time, columns=['arrive_time_fea(t)'])
        leave_time_col = pd.DataFrame(scaled_leave_time, columns=['leave_time_fea(t)'])
        time_diff_col = pd.DataFrame(scaled_time_diff, columns=['time_diff(t)'])
        degree_fea_col = pd.DataFrame(scaled_degree_fea, columns=['degree_fea(t)'])

        new_data1['arrive_time_fea(t)'] = arrive_time_col
        new_data1['leave_time_fea(t)'] = leave_time_col
        new_data1['time_diff(t)'] = time_diff_col
        new_data1['degree_fea(t)'] = degree_fea_col

        # poi(t+1) feature vectors appended last.
        poi_next_fea_data = pd.DataFrame(poi_next_col)
        new_data = pd.concat([new_data1, poi_next_fea_data], axis=1)

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + user
        new_data.to_csv(new_path, index=False, encoding='gbk')
        print(user + ' Feature_Data done')

# 时间特征提取
#add_time_feature(r'F:\GPS\Clusters_Data\Semantic_Data1',r'F:\GPS\Feature_Data\Time_fea_Data1',10)#时间划分

# 方位角特征提取
#add_degree_feature(r'F:\GPS\Feature_Data\Time_fea_Data',r'F:\GPS\Feature_Data\Degree_fea_Data')

# poi地点整合,如果区域范围名存在,则表示为区域范围名,不存在,则用街道表示
#integrate_poi(r'F:\GPS\Feature_Data\Degree_fea_Data',r'F:\GPS\Feature_Data\poi_Data')

#feature_generation.screen_user(r'F:\GPS\Feature_Data\poi_Data1', r'F:\GPS\Feature_Data\Effica_Data1_v1', 5, 20)

# 读取原始csv文件,将同天轨迹放在一行,poi放入txt中,为word2vec做准备
#poi_transform(r'F:\GPS\Feature_Data\Effica_Data_v1',r'F:\GPS\Feature_Data\poi_trans_Data')#txt是往下继续写,不是重写

# word2vec的位置特征向量提取
#get_poi_model(r'F:\GPS\Feature_Data\poi_trans_Data',r'F:\GPS\Feature_Data\poi_model_Data','alluser_model_10')#model_name
#use_model(r'F:\GPS\Feature_Data\poi_model_Data','alluser_model_10','八达岭野生动物世界')

# 各种特征向量整合
#feature_represent(r'F:\GPS\Feature_Data\Effica_Data_v1',r'F:\GPS\Feature_Data\Feature_Data',r'F:\GPS\Feature_Data\poi_model_Data\alluser_model_10.model')

#feature_generation.screen_user(r'F:\GPS\Feature_Data\Feature_Data', r'F:\GPS\Feature_Data\Effica_Feature_Data', 5, 20)
def poi_transform(path1,path2):
    """Serialise each user's daily trajectory as one line of space-separated
    POI names (rows with an empty poi are skipped) and append the lines to
    <userid>.txt under *path2* — the corpus format expected by word2vec.

    NOTE: the output is opened in append mode, so re-running adds duplicate
    lines rather than rewriting the file.
    """
    user_list = loc_record.curdir_file(path1)

    for user in user_list:

        path = path1 + os.path.sep + user
        data = pd.read_csv(open(path))

        all_traj = []  # one space-joined poi string per day
        j = 0

        # Walk the rows, grouping consecutive same-day records.
        while j < len(data):

            # Current row has no poi: move on.
            if data.iloc[j]['poi'] is np.nan:
                j = j+1
                continue
            else:
                oneday_traj = data.iloc[j]['poi']

                # Extend with the following same-day rows.
                for i in range(1, len(data)+1):

                    # Ran off the end of the file: flush the current day.
                    if j+i >= len(data):
                        all_traj.append(oneday_traj)
                        j = j + i
                        break
                    else:
                        if data.iloc[j]['date_string'] == data.iloc[j + i]['date_string']:
                            # Skip empty poi values inside the day.
                            if data.iloc[j + i]['poi'] is np.nan:
                                continue
                            else:
                                poi = ' ' + data.iloc[j + i]['poi']
                                oneday_traj = oneday_traj + poi
                        # New day: flush and restart the scan from that row.
                        else:
                            all_traj.append(oneday_traj)
                            j = j + i
                            break

        if not os.path.exists(path2):  # create the output folder on demand
            os.makedirs(path2)
        # BUG FIX: user.strip('.csv') strips any of the characters
        # {., c, s, v} from both ends (e.g. 'vcs.csv' -> ''); splitext
        # removes exactly the extension.
        userid = os.path.splitext(user)[0]
        new_path = path2 + os.path.sep + userid + '.txt'
        with open(new_path, 'a',encoding='utf-8') as f:
            for lines in all_traj:
                f.write(lines)
                f.write('\n')

        print(userid + ' poi_transform done')
def get_stay_points(path1, path2, scale_factor, consistency_range, density_range, window_size):
    """Detect per-user stay-point clusters via region-consistency expansion.

    For every user CSV in ``path1``, grows clusters of GPS points that fall on
    the same day and whose consistency value
    ``exp(-(distance / scale_factor) - speed)`` meets ``consistency_range``
    (a DBSCAN-style seed expansion). Clusters with at least ``density_range``
    points are written to ``path2`` as ``<user>.csv``.

    Args:
        path1: directory of per-user GPS CSVs with lat/lon/date_string/time_string.
        path2: output directory for stay-point CSVs (created if missing).
        scale_factor: distance normalization factor in the consistency formula.
        consistency_range: minimum consistency value to join a cluster.
        density_range: minimum cluster size to keep.
        window_size: consecutive failed expansions allowed before abandoning
            the current seed.
    """
    user_list = loc_record.curdir_file(path1)

    for user in user_list:
        path = path1 + os.path.sep + user
        # data1 keeps the raw date/time strings for output; data2 holds parsed
        # datetimes for interval and speed computations.
        # (Pass the path, not open(path): pandas then owns and closes the handle.)
        data1 = pd.read_csv(path)
        data2 = pd.read_csv(path)
        data2['date_string'] = pd.to_datetime(data2['date_string'], format='%Y-%m-%d')
        data2['time_string'] = pd.to_datetime(data2['time_string'], format='%H:%M:%S')

        # Bug fix: the original used a one-column DataFrame with chained
        # assignment (classified.iloc[j]['classified'] = 1), which writes to a
        # temporary row copy and silently leaves every point unclassified.
        # A plain numpy array makes the flag actually stick.
        classified = np.zeros(len(data2), dtype=np.int8)
        cluster_id = 1
        rows = []  # stay-point rows; built into a DataFrame once at the end

        for i in range(0, len(data2) - 1):
            if classified[i] == 0:
                classified[i] = 1

                # Region-consistency expansion: BFS over consistent neighbors.
                clusters = [i]
                seeds = [i]
                while seeds:
                    begin = seeds.pop(0)
                    time_window = 0
                    for j in range(begin + 1, len(data2) - 1):
                        day_interval = trajectory_extraction.get_day_interval(
                            data2.iloc[begin]['date_string'], data2.iloc[j]['date_string'])
                        if day_interval != 0:
                            # Different day: stop expanding from this seed.
                            break

                        if classified[j] == 0:
                            distance_interval = trajectory_extraction.get_distance(
                                data2.iloc[begin]['lat'], data2.iloc[begin]['lon'],
                                data2.iloc[j]['lat'], data2.iloc[j]['lon'])
                            speed = trajectory_extraction.get_speed(
                                data2.iloc[begin]['lat'], data2.iloc[begin]['lon'],
                                data2.iloc[j]['lat'], data2.iloc[j]['lon'],
                                data2.iloc[begin]['time_string'], data2.iloc[j]['time_string'])

                            # Consistency decays with scaled distance and speed.
                            consistency_value = exp(-(distance_interval / scale_factor) - speed)
                            if consistency_value >= consistency_range:
                                seeds.append(j)
                                clusters.append(j)
                                classified[j] = 1
                                time_window = 0
                            else:
                                time_window = time_window + 1
                        else:
                            time_window = 0

                        # Too many consecutive misses: give up on this seed.
                        if time_window == window_size:
                            break

                # Keep only sufficiently dense clusters.
                if len(clusters) >= density_range:
                    for index in clusters:
                        rows.append({
                            'cluster_id': cluster_id,
                            'index_id': index,
                            'lat': data1.iloc[index]['lat'],
                            'lon': data1.iloc[index]['lon'],
                            'date_string': data1.iloc[index]['date_string'],
                            'time_string': data1.iloc[index]['time_string'],
                        })
                    cluster_id = cluster_id + 1

        # Build the result in one shot instead of O(n^2) DataFrame.append
        # (which is also removed in pandas >= 2.0).
        new_stay_point_data = pd.DataFrame(
            rows, columns=['cluster_id', 'index_id', 'lat', 'lon', 'date_string', 'time_string'])

        if not os.path.exists(path2):  # create the output directory on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + user
        new_stay_point_data.to_csv(new_path, index=False, encoding='gbk')
        print(user + ' Stay_point_Data done')
def get_important_point(path1, path2, distance_range):
    """Merge consecutive stay-point clusters into important points.

    Two clusters with consecutive ids are assigned the same important point
    when they occur on the same day and either one's time span contains the
    other's, or their mean positions are within ``distance_range``. Each
    user's stamped rows are written to ``path2`` as ``<user>.csv``.

    Args:
        path1: directory of per-user stay-point CSVs (output of get_stay_points).
        path2: output directory for important-point CSVs (created if missing).
        distance_range: maximum distance (same unit as
            trajectory_extraction.get_distance) to merge two clusters.
    """
    user_list = loc_record.curdir_file(path1)

    for user in user_list:
        path = path1 + os.path.sep + user
        # data1 keeps raw strings for output; data2 gets parsed datetimes.
        # (Pass the path, not open(path), so pandas closes the handle itself.)
        data1 = pd.read_csv(path)
        data2 = pd.read_csv(path)
        data2['date_string'] = pd.to_datetime(data2['date_string'], format='%Y-%m-%d')
        data2['time_string'] = pd.to_datetime(data2['time_string'], format='%H:%M:%S')

        impoint_id = 1
        cluster_ids = data2['cluster_id'].unique()
        # cluster_id -> impoint_id; the first cluster always opens point 1.
        impoint_id_dict = {1: 1}

        # Compare each cluster with its successor, cluster by cluster.
        for i in cluster_ids:
            # Same day? For the last cluster the (i+1) selection is empty, its
            # max() is NaN, the comparison is False, and we fall through to the
            # "new important point" branch — same behavior as the original.
            if data2[data2['cluster_id'] == i]['date_string'].max() == \
                    data2[data2['cluster_id'] == (i + 1)]['date_string'].max():

                # Arrival / departure times of both clusters.
                arrive_time1 = data2[data2['cluster_id'] == i]['time_string'].min()
                arrive_time2 = data2[data2['cluster_id'] == (i + 1)]['time_string'].min()
                leave_time1 = data2[data2['cluster_id'] == i]['time_string'].max()
                leave_time2 = data2[data2['cluster_id'] == (i + 1)]['time_string'].max()

                # One time span contains the other: merge with the previous cluster.
                if (arrive_time1 < arrive_time2 and leave_time1 > leave_time2) or \
                        (arrive_time1 > arrive_time2 and leave_time1 < leave_time2):
                    impoint_id_dict[i + 1] = impoint_id

                # Spans not nested: merge only if mean positions are close enough.
                else:
                    lat1 = data2[data2['cluster_id'] == i]['lat'].mean()
                    lat2 = data2[data2['cluster_id'] == (i + 1)]['lat'].mean()
                    lon1 = data2[data2['cluster_id'] == i]['lon'].mean()
                    lon2 = data2[data2['cluster_id'] == (i + 1)]['lon'].mean()

                    distance_interval = trajectory_extraction.get_distance(lat1, lon1, lat2, lon2)
                    if distance_interval <= distance_range:
                        impoint_id_dict[i + 1] = impoint_id
                    else:
                        impoint_id = impoint_id + 1
                        impoint_id_dict[i + 1] = impoint_id

            else:
                impoint_id = impoint_id + 1
                impoint_id_dict[i + 1] = impoint_id

        # Stamp each cluster's rows with its important-point id; collect pieces
        # and concat once (DataFrame.append is removed in pandas >= 2.0).
        columns = ['impoint_id', 'cluster_id', 'index_id', 'lat', 'lon', 'date_string', 'time_string']
        pieces = []
        for i in cluster_ids:
            stay_point_data = data1[data1['cluster_id'] == i]
            pieces.append(stay_point_data.reindex(columns=columns,
                                                  fill_value=impoint_id_dict[i]))
        if pieces:
            new_impoint_data = pd.concat(pieces, ignore_index=True)
        else:
            new_impoint_data = pd.DataFrame(columns=columns)

        if not os.path.exists(path2):  # create the output directory on demand
            os.makedirs(path2)
        new_path = path2 + os.path.sep + user
        new_impoint_data.to_csv(new_path, index=False, encoding='gbk')
        print(user + ' Important_point_Data done')
def _make_move_point(effica_data, i):
    """Build one move-point row for trajectory record ``i``.

    leave_time is the next record's timestamp when it falls on the same day,
    otherwise (end of the day's trajectory) the record's own timestamp.
    Caller guarantees ``i + 1`` is a valid index.
    """
    if effica_data.iloc[i]['date_string'] == effica_data.iloc[i + 1]['date_string']:
        leave_time = effica_data.iloc[i + 1]['time_string']
    else:
        leave_time = effica_data.iloc[i]['time_string']
    return {
        'lat': effica_data.iloc[i]['lat'],
        'lon': effica_data.iloc[i]['lon'],
        'date_string': effica_data.iloc[i]['date_string'],
        'arrive_time': effica_data.iloc[i]['time_string'],
        'leave_time': leave_time,
    }


def get_imp_movie_point(path_effica, path_clusters, out_path):
    """Interleave important (stay) points with move points per user.

    Walks each user's trajectory in ``path_effica``; when the cursor enters
    the [arrive_index, leave_index] span of the next cluster from
    ``path_clusters``, that cluster's summary rows are emitted and the cursor
    jumps past the span; otherwise the single record is emitted as a move
    point. Results are written to ``out_path`` as ``<user>.csv``.

    Args:
        path_effica: directory of per-user trajectory CSVs.
        path_clusters: directory of per-user cluster CSVs carrying
            impoint_id / arrive_index / leave_index plus the summary columns.
        out_path: output directory (created if missing).
    """
    user_list = loc_record.curdir_file(path_effica)
    columns = ['lat', 'lon', 'date_string', 'arrive_time', 'leave_time']

    for user in user_list:
        # Pass paths directly so pandas owns and closes the file handles.
        effica_data = pd.read_csv(path_effica + os.path.sep + user)
        clusters_data = pd.read_csv(path_clusters + os.path.sep + user)

        pieces = []  # mix of cluster DataFrames and single move-point rows
        impoint_id = 1
        i = 0

        # NOTE: the final record is never emitted (the cursor stops at
        # len - 2), matching the original behavior.
        while i < len(effica_data) - 1:

            # Still interest points left in the cluster queue.
            if impoint_id <= len(clusters_data):

                # Index span of the current cluster. Index-based lookup — the
                # original carries a TODO to replace this with a key lookup.
                arrive_index = int(clusters_data.iloc[impoint_id - 1]['arrive_index'])
                leave_index = int(clusters_data.iloc[impoint_id - 1]['leave_index'])

                # Cursor inside the interest region: emit the cluster rows and
                # skip past its span.
                if arrive_index <= i <= leave_index:
                    pieces.append(
                        clusters_data[clusters_data['impoint_id'] == impoint_id][columns])
                    impoint_id = impoint_id + 1
                    i = leave_index + 1

                # Outside any interest region: emit a single move point.
                else:
                    pieces.append(pd.DataFrame([_make_move_point(effica_data, i)],
                                               columns=columns))
                    i = i + 1

            # Cluster queue exhausted: everything left is a move point.
            else:
                pieces.append(pd.DataFrame([_make_move_point(effica_data, i)],
                                           columns=columns))
                i = i + 1

        # Single concat instead of O(n^2) DataFrame.append (removed in
        # pandas >= 2.0).
        if pieces:
            new_space_feature_data = pd.concat(pieces, ignore_index=True)
        else:
            new_space_feature_data = pd.DataFrame(columns=columns)

        if not os.path.exists(out_path):  # create the output directory on demand
            os.makedirs(out_path)
        new_path = out_path + os.path.sep + user
        new_space_feature_data.to_csv(new_path, index=False, encoding='gbk')
        print(user + ' Imp_movie_point_Data done')