import math

import numpy as np
import traj_dist.distance as tdist


def computeNearestNeighbor(user, username):
    """Rank every other trajectory in `user` by its EDR distance to `username`'s trajectory."""
    distances = []
    for instance in user:
        if instance != username:
            distance = tdist.edr(np.array(user[username]), np.array(user[instance]))
            # Variant: take the reciprocal of the log of the Hausdorff distance plus 1
            # distance = 1 / (math.log10(distance) + 1)
            distances.append((instance, distance))
    distances.sort(key=lambda artistTuple: artistTuple[1], reverse=False)
    return distances
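# A minimal usage sketch (added, not from the original source): `user` is assumed to
# be a dict mapping a name to a list of [lon, lat] points, which is how
# computeNearestNeighbor indexes it above. The trajectories below are made up.
trajectories = {
    'alice': [[-122.39, 37.77], [-122.40, 37.78], [-122.41, 37.79]],
    'bob': [[-122.39, 37.77], [-122.40, 37.77], [-122.41, 37.78]],
    'carol': [[-122.30, 37.70], [-122.31, 37.71], [-122.32, 37.72]],
}
print(computeNearestNeighbor(trajectories, 'alice'))  # closest neighbour first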
import numpy as np
import traj_dist.distance as tdist

# eucdist, resample, hmmdist, aeowd, aebid and the global tlen are project-local
# helpers defined elsewhere in this repository.


def compute_dists(trajs):
    """Compute eleven distance measures for every (pattern, sample) trajectory pair."""
    pattern_num = len(trajs)
    sample_num = len(trajs[0])
    res_euc = np.zeros((pattern_num, sample_num))
    res_dtw = np.zeros((pattern_num, sample_num))
    res_sspd = np.zeros((pattern_num, sample_num))
    res_lcss = np.zeros((pattern_num, sample_num))
    res_edr = np.zeros((pattern_num, sample_num))
    res_erp = np.zeros((pattern_num, sample_num))
    res_fre = np.zeros((pattern_num, sample_num))
    res_hau = np.zeros((pattern_num, sample_num))
    res_hmm = np.zeros((pattern_num, sample_num))
    res_aes1 = np.zeros((pattern_num, sample_num))
    res_aes2 = np.zeros((pattern_num, sample_num))
    for i in range(pattern_num):
        print('-', i)  # patterns
        for j in range(sample_num):
            print(i, '-', j)  # samples
            sample1 = trajs[i][j][0]
            sample2 = trajs[i][j][1]
            if i > 2:
                # resample to a common length before the point-wise Euclidean distance
                norm_sample1 = resample(trajs[i][j][0])
                norm_sample2 = resample(trajs[i][j][1])
                res_euc[i, j] = eucdist(norm_sample1, norm_sample2)
            else:
                res_euc[i, j] = eucdist(sample1, sample2)
            res_dtw[i, j] = tdist.dtw(sample1, sample2)
            res_sspd[i, j] = tdist.sspd(sample1, sample2)
            res_lcss[i, j] = tdist.lcss(sample1, sample2, eps=0.6 / tlen)
            res_edr[i, j] = tdist.edr(sample1, sample2, eps=0.6 / tlen)
            res_erp[i, j] = tdist.erp(sample1, sample2, g=np.zeros(2, dtype=float))
            res_fre[i, j] = tdist.frechet(sample1, sample2)
            res_hau[i, j] = tdist.hausdorff(sample1, sample2)
            res_hmm[i, j] = hmmdist(sample1, sample2, M=np.array([1, 1, 1, 1, 1, 1]))
            res_aes1[i, j] = aeowd(sample1, sample2, loss_th=1e-4)
            res_aes2[i, j] = aebid(sample1, sample2, loss_th=1e-4)
    results = np.array([res_euc, res_dtw, res_sspd, res_lcss, res_edr, res_erp,
                        res_fre, res_hau, res_hmm, res_aes1, res_aes2])  # shape (11, pattern_num, sample_num)
    return results
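# The snippet above leans on helpers that are not shown. A plausible minimal sketch
# of the two simplest ones, assuming trajectories are (n, 2) NumPy arrays -- the
# real implementations in the repository may differ:
def eucdist(t1, t2):
    # mean point-wise Euclidean distance; only defined for equal-length trajectories
    return float(np.mean(np.linalg.norm(t1 - t2, axis=1)))


def resample(traj, n=50):
    # linear interpolation onto n points, parameterised by cumulative arc length
    # (n=50 is an arbitrary illustrative default)
    d = np.concatenate(([0.0], np.cumsum(np.linalg.norm(np.diff(traj, axis=0), axis=1))))
    s = np.linspace(0.0, d[-1], n)
    return np.column_stack([np.interp(s, d, traj[:, k]) for k in range(2)])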
# dm_tst_* are preallocated (c_num, test_num) distance matrices.
for c in range(c_num):
    print('--', c)
    idx = np.arange(0, train_num, dtype=int)[train_labels == (c + 1)]
    sample_set = np.array([train_set[i] for i in idx])
    train_traj = np.mean(sample_set, axis=0)  # class-mean trajectory, shape (tlen, 2)
    for j in range(test_num):
        if j % 3000 == 0:
            print(j)
        test_traj = test_set[j]
        dm_tst_eucl[c, j] = eucdist(train_traj, test_traj)
        dm_tst_hau[c, j] = tdist.hausdorff(train_traj, test_traj)
        dm_tst_dtw[c, j] = tdist.dtw(train_traj, test_traj)
        dm_tst_sspd[c, j] = tdist.sspd(train_traj, test_traj)
        dm_tst_lcss[c, j] = tdist.lcss(train_traj, test_traj, eps=0.05)
        dm_tst_edr[c, j] = tdist.edr(train_traj, test_traj, eps=0.05)
        dm_tst_erp[c, j] = tdist.erp(train_traj, test_traj, g=np.zeros(2, dtype=float))
        dm_tst_fre[c, j] = tdist.frechet(train_traj, test_traj)

np.savez_compressed('cross_c_dm_tst_eucl', dm_tst_eucl)
np.savez_compressed('cross_c_dm_tst_dtw', dm_tst_dtw)
np.savez_compressed('cross_c_dm_tst_sspd', dm_tst_sspd)
np.savez_compressed('cross_c_dm_tst_lcss', dm_tst_lcss)
np.savez_compressed('cross_c_dm_tst_edr', dm_tst_edr)
np.savez_compressed('cross_c_dm_tst_erp', dm_tst_erp)
np.savez_compressed('cross_c_dm_tst_fre', dm_tst_fre)
np.savez_compressed('cross_c_dm_tst_hau', dm_tst_hau)
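# A usage sketch (an assumption, not shown in the original): with each dm_tst_*
# matrix of shape (c_num, test_num) holding the distance from every class-mean
# trajectory to every test trajectory, nearest-centroid classification is an argmin
# over classes. `test_labels` is assumed to exist alongside `train_labels`.
dm = np.load('cross_c_dm_tst_dtw.npz')['arr_0']  # default key written by savez_compressed
pred_labels = dm.argmin(axis=0) + 1              # labels are 1-based above
print('DTW nearest-centroid accuracy: %.3f' % np.mean(pred_labels == test_labels))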
import gc
import logging
import os

import numpy as np
import traj_dist.distance as tdist

# read_pickle, load_trajectory and swap_k_v are project-local helpers
# (a hypothetical sketch of them follows the ground-truth snippet below).


def searchResult(query, train, query_num, user_k, qgram_size):
    logger = logging.getLogger('predict')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler('./log/%s' % query)
    fh.setLevel(logging.DEBUG)
    # create console handler with a higher log level
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    # create formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)

    # loading the files:
    logger.info('---------------------------- Predict the top-k similar trajectories ----------------------------')
    qgram_tag = 'q_%d' % qgram_size
    query_path = './data/processed/%s.txt' % query
    train_path = './data/processed/%s.txt' % train
    candidate_traj_path = './data/interim/%s/%s/candidate_trajectory_%s.txt' % (query, train, qgram_tag)
    query_id_dict_path = './data/interim/%s/%s/query_id_dict_%s.txt' % (query, train, qgram_tag)
    rtree_id_dict_path = './data/interim/%s/rtree_id_dict_%s.txt' % (train, qgram_tag)
    result_path = './data/result/%s/%s/%s/' % (query, train, qgram_tag)
    stats_path = './data/stats/%s/%s/edr_count_%s.txt' % (query, train, qgram_tag)
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    f_stats = open(stats_path, 'w')
    f_stats.write('query edr_count candidate_count\n')

    candidateList = read_pickle(candidate_traj_path)
    # candidateList => [[queryID_1, [(traID1, count1), (traID2, count2)]], [...]]
    logger.info('Load candidate trajectory: %s' % candidate_traj_path)
    query_id_to_key = read_pickle(query_id_dict_path)
    logger.info('Load query id dictionary: %s' % query_id_dict_path)
    rtree_id_to_key = read_pickle(rtree_id_dict_path)
    logger.info('Load rtree id dictionary: %s' % rtree_id_dict_path)
    trajectory_dict = load_trajectory(train_path)
    logger.info('Load train trajectory: %s' % train_path)
    real_query_dict = load_trajectory(query_path, n=query_num)
    logger.info('Load %d query trajectory: %s' % (query_num, query_path))
    query_key_to_id = swap_k_v(query_id_to_key)  # key: encoded key; value: trajectory id in string
    rtree_key_to_id = swap_k_v(rtree_id_to_key)  # key: encoded key; value: trajectory id in string

    edr_count_result = []
    logger.info('Start finding top K')
    for index in range(len(candidateList)):
        # start to calculate
        edr_count = 0
        k = min(user_k, len(candidateList[index][1]))
        topK = candidateList[index][1][0:k]
        queryID = candidateList[index][0]
        print('%d, query id: %d' % (index, queryID))
        pre_result = list(map(lambda x: x[0], topK))  # the candidate trajectory IDs from the top k
        result_map = {}  # map from candidate ID to its real EDR distance
        for t in pre_result:
            edr_count += 1
            # EDR returns a normalised value; multiplying by the longer trajectory
            # length recovers the raw edit count.
            result_map[t] = tdist.edr(np.array(trajectory_dict[rtree_key_to_id[t]]),
                                      np.array(real_query_dict[query_key_to_id[queryID]]),
                                      "spherical") * max(len(trajectory_dict[rtree_key_to_id[t]]),
                                                         len(real_query_dict[query_key_to_id[queryID]]))

        fullCandidates = candidateList[index][1]  # list of [ID, count], in decreasing count order
        i = k
        query_tra = real_query_dict[query_key_to_id[queryID]]
        lengthQ = len(query_tra)
        bestSoFar = sorted(result_map.values())[k - 1]  # k-th smallest distance so far
        while i < len(fullCandidates):
            candidate = fullCandidates[i]
            candidateID = candidate[0]
            tra_s = trajectory_dict[rtree_key_to_id[candidateID]]
            countValue = candidate[1]
            lengthS = len(tra_s)
            # q-gram lower bound: only candidates sharing at least this many q-grams
            # with the query can still beat the current k-th best distance.
            if countValue >= (max(lengthQ, lengthS) - (bestSoFar + 1) * qgram_size):
                realDist = tdist.edr(np.array(tra_s), np.array(query_tra),
                                     "spherical") * max(len(tra_s), len(query_tra))
                edr_count += 1
                if realDist < bestSoFar:
                    result_map[candidateID] = realDist
                    # update the best so far to the new k-th smallest distance
                    bestSoFar = sorted(result_map.items(), key=lambda kv: (kv[1], kv[0]))[k - 1][1]
            else:
                break  # counts only decrease from here, so no later candidate can qualify
            i += 1

        finalResult = sorted(result_map.items(), key=lambda kv: (kv[1], kv[0]))[0:k]
        with open(result_path + "/query_%s.txt" % queryID, 'w') as f:
            f.write(query_key_to_id[queryID] + '\n')
            f.write('\n'.join('{} {}'.format(item[0], item[1]) for item in finalResult))
        gc.collect()
        edr_count_result.append("query_%s.txt %d %d" % (queryID, edr_count, len(candidateList[index][1])))

    logger.info('Finished')
    f_stats.write('\n'.join(edr_count_result))
    f_stats.close()
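# Why the pruning test in searchResult is safe (a sketch of the standard q-gram
# count-filter reasoning, with made-up numbers): each EDR edit operation can destroy
# at most qgram_size matching q-grams, so EDR(Q, S) <= tau implies
# count(Q, S) >= max(|Q|, |S|) - (tau + 1) * qgram_size. Contrapositive: a candidate
# whose count falls below that bound with tau = bestSoFar cannot beat the current
# k-th best, and since candidates are scanned in decreasing count order, everything
# after it can be skipped as well.
lengthQ, lengthS, qgram_size, bestSoFar = 100, 110, 3, 10.0  # hypothetical values
lower_bound = max(lengthQ, lengthS) - (bestSoFar + 1) * qgram_size  # 110 - 33 = 77
print('verify a candidate only if its q-gram count >= %d' % lower_bound)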
import gc
import logging
import os

import numpy as np
import traj_dist.distance as tdist


def main(query, train, query_num):
    logger = logging.getLogger('build_truth')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler('./log/%s' % query)
    fh.setLevel(logging.DEBUG)
    # create console handler with a higher log level
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    # create formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)

    logger.info('---------------------------- Search for ground truth ----------------------------')
    query_path = './data/processed/%s.txt' % query
    train_path = './data/processed/%s.txt' % train
    query_id_dict_path = './data/interim/%s/%s/query_id_dict.txt' % (query, train)
    rtree_id_dict_path = './data/interim/%s/rtree_id_dict.txt' % train
    result_path = './data/truth/%s/%s' % (query, train)
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    query_id_dict = read_pickle(query_id_dict_path)  # fakeID -> realID
    logger.info('Load query id dictionary: %s' % query_id_dict_path)
    train_id_dict = read_pickle(rtree_id_dict_path)
    logger.info('Load train id dictionary: %s' % rtree_id_dict_path)
    query_data = load_trajectory(query_path, n=query_num)
    logger.info('Load %d query trajectory: %s' % (query_num, query_path))
    train_data = load_trajectory(train_path)  # realID -> vectors
    logger.info('Load train trajectory: %s' % train_path)
    train_id_dict = {v: k for k, v in train_id_dict.items()}  # invert the mapping

    for query_id, query_trajectory in query_data.items():
        query_key = query_id_dict[query_id]
        if query_key in [36, 41]:  # debug trace for two specific queries
            print(query_key)
        distance_list = []
        train_key_list = []
        for train_id, train_trajectory in train_data.items():
            try:
                train_key = train_id_dict[train_id]
                # normalised EDR scaled back to a raw edit count
                distance = tdist.edr(np.array(train_trajectory), np.array(query_trajectory),
                                     "spherical") * max(len(train_trajectory), len(query_trajectory))
                distance_list.append(distance)
                train_key_list.append(train_key)
            except KeyError:
                pass
        ix = sorted(range(len(distance_list)), key=lambda k: distance_list[k])
        distance_list_sorted = [distance_list[i] for i in ix]
        train_key_sorted = [train_key_list[i] for i in ix]
        trajectory_result = [(e1, e2) for e1, e2 in zip(train_key_sorted, distance_list_sorted)]
        with open(result_path + "/query_%s.txt" % query_key, 'w') as f:
            f.write(query_id + '\n')
            f.write('\n'.join('{} {}'.format(item[0], item[1]) for item in trajectory_result))
        gc.collect()

    logger.info('Finished building ground truth')
    return
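# Both searchResult and main rely on helpers that are not shown. A minimal,
# hypothetical sketch of what they might look like, assuming the trajectory files
# store one "id lon,lat lon,lat ..." record per line -- the real file format and
# implementations may differ:
import pickle


def read_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


def swap_k_v(d):
    # invert a dict, assuming its values are unique
    return {v: k for k, v in d.items()}


def load_trajectory(path, n=None):
    trajectories = {}
    with open(path) as f:
        for line_no, line in enumerate(f):
            if n is not None and line_no >= n:
                break
            traj_id, *points = line.split()
            trajectories[traj_id] = [[float(x) for x in p.split(',')] for p in points]
    return trajectories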
import numpy as np
import traj_dist.distance as tdist

# (only the tail of traj_A is present in the source; its leading points are elided)
traj_A = np.array([[-122.41252, 37.79232], [-122.41316, 37.7951], [-122.41392, 37.7989],
                   [-122.41435, 37.80129], [-122.41434, 37.80129]])
traj_B = np.array([[-122.39472, 37.77672], [-122.3946, 37.77679], [-122.39314, 37.77846],
                   [-122.39566, 37.78113], [-122.39978, 37.78438], [-122.40301, 37.78708],
                   [-122.4048, 37.78666], [-122.40584, 37.78564], [-122.40826, 37.78385],
                   [-122.41061, 37.78321], [-122.41252, 37.78299]])
traj_C = np.array([[-122.39542, 37.77665], [-122.3988, 37.77417], [-122.41042, 37.76944],
                   [-122.41459, 37.77016], [-122.41462, 37.77013]])
traj_list = [traj_A, traj_B, traj_C]

# Simple distance
dist = tdist.edr(traj_A, traj_A)
print(dist)

# Pairwise distance
# pdist = tdist.pdist(traj_list, metric="sspd")
# print(pdist)

# Distance between two lists of trajectories
# cdist = tdist.cdist(traj_list, traj_list, metric="sspd")
# print(cdist)
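# A hedged aside (added): edr also accepts an eps matching threshold -- two points
# count as a match when they lie within eps of each other -- so the chosen eps
# drives the result. The two values below are arbitrary illustrations.
print(tdist.edr(traj_A, traj_B, eps=1.0))   # loose threshold: most points match
print(tdist.edr(traj_A, traj_B, eps=1e-5))  # tight threshold: almost nothing matches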
if metric == 'euclidean':
    for i in range(len(ends) - 1):
        for j in range(i + 1, len(ends)):
            dist[i, j] = np.linalg.norm(ends.iloc[i] - ends.iloc[j])
            dist[j, i] = dist[i, j]
    with open('../data/euclidean_matrix_' + data + '.pickle', 'wb') as f:
        pickle.dump(dist, f, protocol=pickle.HIGHEST_PROTOCOL)
elif metric == 'EDR':
    trajs = df['POLYLINE']
    threshold = 0.1  # in km
    for i in range(len(trajs) - 1):
        n_i = df['LENGTH'].iloc[i]
        for j in range(i + 1, len(trajs)):
            n_j = df['LENGTH'].iloc[j]
            # scale the normalised EDR back to a raw edit count
            dist[i, j] = edr(trajs.iloc[i], trajs.iloc[j], eps=threshold) * max(n_i, n_j)
            dist[j, i] = dist[i, j]
elif metric == 'DTW':
    trajs = df['POLYLINE']
    for i in range(len(trajs) - 1):
        for j in range(i + 1, len(trajs)):
            dist[i, j] = dtw(trajs.iloc[i], trajs.iloc[j])
            dist[j, i] = dist[i, j]
else:
    print('metric should be "EDR", "DTW" or "euclidean"')
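# A hedged usage sketch (not in the original): a symmetric matrix like `dist` can be
# fed straight into scikit-learn clustering with a precomputed metric. The import
# and parameter values here are illustrative assumptions; scikit-learn >= 1.2 uses
# metric='precomputed', older releases spell it affinity='precomputed'.
from sklearn.cluster import AgglomerativeClustering

clustering = AgglomerativeClustering(n_clusters=5, metric='precomputed', linkage='average')
labels = clustering.fit_predict(dist)  # one cluster label per trajectory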