def class_by_time(svm_data, grid_2_id):
    """Fit a linear SVM on time + position features and return it.

    Every item of ``svm_data`` is read as ``item[0]`` (the trip, a sequence
    of encoded points) and ``item[1]`` (an iterable of time fields).  Its
    feature row is the time fields converted with ``int``, followed by the
    first two values decoded from the trip's first point and from a point
    located at a random 40-70 percent of the trip length.  Its label is
    ``grid_2_id`` looked up with the trip's last point.

    The fitted model is written to ``../../data/lin_clf.model`` with
    ``joblib.dump`` and also returned.
    """
    print('svm')
    feature_rows = []
    targets = []
    for sample in svm_data:
        trip, time_fields = sample[0], sample[1]

        # Time features come first in the row.
        row = [int(field) for field in time_fields]

        # Pick the second reference point at a random 40-70% of the trip.
        percent = random.randint(40, 70)
        cut = int(len(trip) * float(percent) / 100.0)
        lat1, lon1, _, _ = gh._decode_c2i(trip[0])
        lat2, lon2, _, _ = gh._decode_c2i(trip[cut])
        row.extend([lat1, lon1, lat2, lon2])

        print(row)
        feature_rows.append(row)
        targets.append(grid_2_id[trip[-1]])

    model = svm.LinearSVC()
    model.fit(feature_rows, targets)
    joblib.dump(model, "../../data/lin_clf.model")
    print('finish svm')
    return model
def compute_MT(A, M, grid_2_id, id_2_grid):
    """Fill a 1584 x 1584 matrix ``MT`` from successive powers of ``M``.

    All ordered pairs of distinct grid ids are grouped by the Manhattan
    distance between their decoded coordinates (the first two values
    returned by ``gh._decode_c2i``).  The distances are then visited in
    ascending order; at the k-th visited distance ``d`` the running product
    is ``M`` multiplied by itself k times, it is right-multiplied by
    ``A[int(d * 0.2)]``, and the entries for the pairs at that distance are
    copied into ``MT``.

    Args:
        A: indexable by ``int(distance * 0.2)``; each entry is passed to
            ``.dot`` of the running matrix product.
        M: square matrix convertible with ``gnp.garray``.
        grid_2_id: unused; kept so existing callers keep working.
        id_2_grid: mapping from grid id (used as row/column index of
            ``MT``) to the encoded cell accepted by ``gh._decode_c2i``.

    Returns:
        The filled ``numpy`` array ``MT``; entries for ``i == j`` stay 0.
    """
    print('computing MT')
    size = 1584  # matrix dimension hard-coded by the original implementation
    MT = np.zeros((size, size))

    # Decode each cell once instead of once per pair.
    # NOTE(review): assumes gh._decode_c2i is a pure function - confirm.
    cells = []
    for grid_id in id_2_grid:
        lat, lon, _, _ = gh._decode_c2i(id_2_grid[grid_id])
        cells.append((grid_id, lat, lon))

    pairs_by_distance = defaultdict(list)
    for i, lat1, lon1 in cells:
        for j, lat2, lon2 in cells:
            if i == j:
                continue
            distance = abs(lat1 - lat2) + abs(lon1 - lon2)
            pairs_by_distance[distance].append((i, j))

    Mpow = gnp.garray(np.eye(size))
    M = gnp.garray(M)
    # Mpow gains one factor of M per iteration, so the visiting order decides
    # which power is paired with which distance.  Iterating the dict directly
    # left that to hash order; sorting makes shorter distances use lower
    # powers deterministically.
    for distance in sorted(pairs_by_distance):
        Mpow = Mpow.dot(M)
        Mtemp = Mpow.dot(A[int(distance * 0.2)])
        for i, j in pairs_by_distance[distance]:
            MT[i, j] = Mtemp[i, j]
    return MT
def subsyn(test_data, trip_data, test_des, M, MT, des, test_time, id_2_grid):
    """Evaluate SubSyn and its SVM-boosted variant and print the results.

    For each query ``test_data[idx]`` (a sequence of grid ids) every
    destination ``j`` in ``des`` with ``MT[start, j] > 0`` is scored as
    ``MT[end, j] / MT[start, j] * des[j]``, where ``start``/``end`` are the
    first/last ids of the query.  The boosted variant multiplies that score
    by 0.7 and adds 0.3 to the destination returned by
    ``svm_time.predict_by_svm``.  Destinations are ranked by descending
    ``(score, id)``.

    For both rankings the function prints, in this order: the top-1/3/5 hit
    counts, the same counts divided by ``len(test_des)``, and the summed
    distance between the true and the top-ranked destination divided by
    ``len(test_des)``.  The plain SubSyn figures are printed first.

    Queries for which no destination has a positive ``MT[start, j]`` are
    skipped (they add no hits and no distance) instead of raising
    ``IndexError`` on an empty ranking.

    Args:
        test_data: iterable of queries, each a non-empty sequence of grid ids.
        trip_data: unused; kept so existing callers keep working.
        test_des: true destination per query, indexed like ``test_data``.
        M: unused; kept so existing callers keep working.
        MT: matrix indexed as ``MT[row, col]`` with grid ids.
        des: mapping from destination id to its weight.
        test_time: per-query iterable of time fields (converted with ``int``).
        id_2_grid: mapping from grid id to the value ``gh._decode_c2i`` decodes.
    """
    print('subsyn')
    top_ks = (1, 3, 5)

    def ranked_ids(scores):
        # Best first; ties on score are broken by the larger id.
        ordered = sorted(scores.items(),
                         key=lambda kv: (kv[1], kv[0]), reverse=True)
        return [int(key) for key, _ in ordered]

    def coverage(target, ranked):
        # 1/0 flags: is the true destination within the top 1, 3 and 5?
        return [1 if target in ranked[:k] else 0 for k in top_ks]

    def decoded_distance(grid_a, grid_b):
        # Manhattan distance between the decoded lat/lon of two grid ids.
        lat_a, lon_a, _, _ = gh._decode_c2i(id_2_grid[grid_a])
        lat_b, lon_b, _, _ = gh._decode_c2i(id_2_grid[grid_b])
        return abs(lat_a - lat_b) + abs(lon_a - lon_b)

    def report(hit_rows, distance_sum):
        # Hit counts, hit rates and mean distance over all test destinations.
        n = len(test_des)
        P1 = float(sum(row[0] for row in hit_rows))
        P3 = float(sum(row[1] for row in hit_rows))
        P5 = float(sum(row[2] for row in hit_rows))
        print('%s %s %s' % (P1, P3, P5))
        print('%s %s %s' % (P1 / n, P3 / n, P5 / n))
        print(distance_sum * 1.0 / n)

    total_error = []
    total_km = 0
    gtotal_error = []
    gtotal_km = 0

    for idx, datas in enumerate(test_data):
        start, end = datas[0], datas[-1]

        # P(T^p | d in n_j) for every destination reachable from the start.
        Ptpnj = {}
        for j in des:
            start_prob = MT[start, j]
            if start_prob <= 0:
                continue
            try:
                Ptpnj[j] = MT[end, j] / start_prob
            except Exception as exc:
                # Best effort, as before: drop this destination and go on.
                print('subsyn: skipped destination %s: %s' % (j, exc))

        if not Ptpnj:
            # Nothing to rank; count the query as a miss with no distance.
            continue

        # Plain SubSyn score.  The original also accumulated the sum of
        # these scores but never used it, so they are left unnormalized.
        P = dict((j, Ptpnj[j] * des[j]) for j in Ptpnj)

        # Improved SubSyn: blend in the destination predicted by the SVM.
        tmp_time = [int(field) for field in test_time[idx]]
        lat1, lon1, _, _ = gh._decode_c2i(id_2_grid[start])
        lat2, lon2, _, _ = gh._decode_c2i(id_2_grid[end])
        predictdata = tmp_time + [lat1, lon1, lat2, lon2]
        label = svm_time.predict_by_svm(data=[predictdata])
        # NOTE(review): 0.3 is added to an unnormalized score - confirm
        # that this weighting is intended.
        gP = {}
        for j in P:
            gP[j] = P[j] * 0.7
            if label[0] == j:
                gP[j] += 0.3

        Q = ranked_ids(P)
        gQ = ranked_ids(gP)
        target = test_des[idx]

        # Coverage and Manhattan-distance error of plain SubSyn.
        total_error.append(coverage(target, Q))
        total_km += decoded_distance(int(target), Q[0])

        # Coverage and Manhattan-distance error of the improved variant.
        gtotal_error.append(coverage(target, gQ))
        gtotal_km += decoded_distance(int(target), gQ[0])

    report(total_error, total_km)
    report(gtotal_error, gtotal_km)
def ZMDB(test_data, trip_data, test_des, id_2_grid):
    """Evaluate the ZMDB baseline and print its hit rates and mean distance.

    For each query, ``testINtrip(query, trip)`` is summed over all of
    ``trip_data`` (``num``) and per trip destination ``trip[-1]``
    (``des_num``).  Destinations are ranked by descending
    ``(des_num / num, id)``.  Queries with ``num <= 0`` are skipped and so
    count as misses with no distance.

    Printed, in order: the top-1/3/5 hit counts, the same counts divided by
    ``len(test_des)``, and the summed distance between the true and the
    top-ranked destination divided by ``len(test_des)``.

    Any ``Exception`` raised while processing the queries stops the loop,
    is reported with its line number, and the results gathered so far are
    still printed.

    Args:
        test_data: iterable of queries passed to ``testINtrip``.
        trip_data: iterable of historical trips; ``trip[-1]`` is the
            destination id.
        test_des: true destination per query, indexed like ``test_data``.
        id_2_grid: mapping from grid id to the value ``gh._decode_c2i`` decodes.
    """
    print('ZMDB')
    total_error = []
    total_km = 0
    try:
        for idx, data in enumerate(test_data):
            # des_num[d]: matches of this query among trips ending at d;
            # num: matches over all trips.
            # NOTE(review): presumably testINtrip returns 1/0 - confirm.
            des_num = defaultdict(int)
            num = 0
            for trip in trip_data:
                matched = testINtrip(data, trip)
                num += matched
                des_num[trip[-1]] += matched
            if num <= 0:
                continue

            # For each destination n^j: P(n^j | T^p).
            P = dict((d, des_num[d] * 1.0 / num) for d in des_num)
            ordered = sorted(P.items(),
                             key=lambda kv: (kv[1], kv[0]), reverse=True)
            Q = [int(key) for key, _ in ordered]

            # Coverage: is the true destination in the top 1 / 3 / 5?
            target = test_des[idx]
            total_error.append(
                [1 if target in Q[:k] else 0 for k in (1, 3, 5)])

            # Manhattan-distance error against the top-ranked destination.
            lat1, lon1, _, _ = gh._decode_c2i(id_2_grid[int(target)])
            lat2, lon2, _, _ = gh._decode_c2i(id_2_grid[Q[0]])
            total_km += abs(lat1 - lat2) + abs(lon1 - lon2)
    except Exception:
        # Report and fall through to the summary; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        s = sys.exc_info()
        print("Error '%s' happened on line %d" % (s[1], s[2].tb_lineno))

    n = len(test_des)
    P1 = float(sum(row[0] for row in total_error))
    P3 = float(sum(row[1] for row in total_error))
    P5 = float(sum(row[2] for row in total_error))
    print('%s %s %s' % (P1, P3, P5))
    print('%s %s %s' % (P1 / n, P3 / n, P5 / n))
    print(total_km * 1.0 / n)