def process_symbolic_data(data, standard_method=True, refined_method=False):
    """
    Given a list of trajectories (regularly sampled location integer symbols)
    returns the requested upper bound(s) on the upper limit of predictability.

    Returns either a single list or two lists ordered
    [standard_method, refined_method].

    :param data: List of trajectories
    :type data: List of List of int
    :param standard_method: True to calculate the upper bound on the upper
        limit of predictability using the original method by Song et. al.
    :type standard_method: Boolean
    :param refined_method: True to calculate the upper bound on the upper
        limit of predictability using the refined method from our PERCOM paper.
    :type refined_method: Boolean
    :raises ValueError: if both methods are disabled. (Previously this case
        fell through to ``return tmpG_DL`` with the variable never assigned,
        raising a confusing NameError after opening the Matlab pool.)
    """
    if not (standard_method or refined_method):
        raise ValueError(
            "At least one of standard_method or refined_method must be True.")

    if refined_method:
        S_RL, N_RL = empiricalEntropyRate(data, "RL")
    if standard_method:
        S_DL, N_DL = empiricalEntropyRate(data, "DL")

    # Open the Matlab worker pool once for all solves; try/finally guarantees
    # it is closed again even if ParLoP raises.
    mlab.openPool()
    try:
        if refined_method:
            tmpG_RL = list(mlab.ParLoP(S_RL, N_RL)[0])
        if standard_method:
            tmpG_DL = list(mlab.ParLoP(S_DL, N_DL)[0])
    finally:
        mlab.closePool()

    # Parenthesized single-argument print: identical output on Python 2 and 3.
    if standard_method:
        print("\nStandard method: AVG: {} MIN: {} MAX: {}".format(
            np.mean(np.asarray(tmpG_DL)),
            np.min(np.asarray(tmpG_DL)),
            np.max(np.asarray(tmpG_DL))))
    if refined_method:
        print("\nRefined method: AVG: {} MIN: {} MAX: {}".format(
            np.mean(np.asarray(tmpG_RL)),
            np.min(np.asarray(tmpG_RL)),
            np.max(np.asarray(tmpG_RL))))

    if refined_method and standard_method:
        return tmpG_DL, tmpG_RL
    if refined_method:
        return tmpG_RL
    return tmpG_DL
def process_symbolic_data(data, standard_method=True, refined_method=False):
    """
    Given a list of trajectories (regularly sampled location integer symbols)
    returns the requested upper bound(s) on the upper limit of predictability.

    Returns either a single list or two lists ordered
    [standard_method, refined_method].

    :param data: List of trajectories
    :type data: List of List of int
    :param standard_method: True to calculate the upper bound on the upper
        limit of predictability using the original method by Song et. al.
    :type standard_method: Boolean
    :param refined_method: True to calculate the upper bound on the upper
        limit of predictability using the refined method from our PERCOM paper.
    :type refined_method: Boolean
    :raises ValueError: if both methods are disabled. (Previously this case
        reached ``return tmpG_DL`` with the variable never assigned and died
        with a NameError.)
    """
    if not (standard_method or refined_method):
        raise ValueError(
            "At least one of standard_method or refined_method must be True.")

    if refined_method:
        S_RL, N_RL = empiricalEntropyRate(data, 'RL')
    if standard_method:
        S_DL, N_DL = empiricalEntropyRate(data, 'DL')

    # One pool for all solves; the finally clause stops a failed solve from
    # leaking the Matlab worker pool.
    mlab.openPool()
    try:
        if refined_method:
            tmpG_RL = list(mlab.ParLoP(S_RL, N_RL)[0])
        if standard_method:
            tmpG_DL = list(mlab.ParLoP(S_DL, N_DL)[0])
    finally:
        mlab.closePool()

    # Parenthesized single-argument print: identical output on Python 2 and 3.
    if standard_method:
        print('\nStandard method: AVG: {} MIN: {} MAX: {}'.format(
            np.mean(np.asarray(tmpG_DL)), np.min(np.asarray(tmpG_DL)),
            np.max(np.asarray(tmpG_DL))))
    if refined_method:
        print('\nRefined method: AVG: {} MIN: {} MAX: {}'.format(
            np.mean(np.asarray(tmpG_RL)), np.min(np.asarray(tmpG_RL)),
            np.max(np.asarray(tmpG_RL))))

    if refined_method and standard_method:
        return tmpG_DL, tmpG_RL
    if refined_method:
        return tmpG_RL
    return tmpG_DL
def run(group="All", scale=None,
        output_dir='./ResultsLoP_replication/final_graphs',
        bulk_build_preprocessing=False):
    """
    Generates a single heatmap for a given list of Geolife ids, for a given
    method of computing the upper bound on the upper limit of predictability.

    Results (one row per spatial resolution, one column per temporal
    resolution) are written as csv files under ``output_dir``.

    :param group: "All", or ["id_str",[list of ids in the geolife dataset]]
    :type group: str or nested list
    :param scale: [min_z, max_z, step] Set the scale of the heatmap z.
        NOTE(review): never read in this function - confirm intended use.
    :type scale: Float array
    :param output_dir: Directory the result csv files are written to.
    :type output_dir: str
    :param bulk_build_preprocessing: True to pre-build all symbolisation
        caches on multiple CPU cores before the main loop.
    :type bulk_build_preprocessing: Boolean
    """
    t = time.time()

    # Group setting: the whole dataset, or a named subset of person ids.
    if group == "All":
        suffix = "All"
        persons = "All"
    else:
        suffix = "Grp{}".format(group[0])
        persons = group[1]

    # Normalise the output directory and make sure it exists.
    if not output_dir[-1] == '/':
        output_dir = output_dir + '/'
    file_name = "{}Heatmap_{}".format(output_dir, suffix)
    ensure_dir(file_name)

    print("Calculating the LoP for {}".format(suffix))

    if bulk_build_preprocessing:
        # Bulk build the caches using multiple CPU cores, skipping caches
        # that already exist. Without this option a missing cache is built
        # on demand, using a single CPU core.
        GeolifeSymbolisation.bulk_build_resolution_cache(
            listSpatialRes, listTemporalRes)

    failed_ids = set()
    LoP_RL = []
    LoP_DL = []
    LoP_failed_ct = []

    # try/finally guarantees the Matlab pool is released even when a sanity
    # check or solve raises part-way through the grid.
    mlab.openPool()
    try:
        for spatialRes in listSpatialRes:
            LoP_RL.append([])
            LoP_DL.append([])
            LoP_failed_ct.append([])
            for temporalRes in listTemporalRes:
                # Load data from an existing preproc database; this will have
                # been created earlier if it did not exist.
                data, person_ids = get_geolife_data(
                    spatialRes, temporalRes, persons)

                # Sanity check on loading.
                for person in data:
                    if len(person) == 0:
                        raise Exception("One or more person's trajectory was not loaded/created correctly.")

                S_RL, N_RL = empiricalEntropyRate(data, 'RL')
                S_DL, N_DL = empiricalEntropyRate(data, 'DL')

                tmpG_RL = np.asarray(list(mlab.ParLoP(S_RL, N_RL)[0]))
                tmpG_DL = np.asarray(list(mlab.ParLoP(S_DL, N_DL)[0]))

                # -88: real fail in solve.
                # -99: known fail in solve when S > log2(N).
                # See the Matlab script (ParLoP.m) for more details.
                if (tmpG_RL == -88).any():
                    raise Exception("ERROR: (RL) Matlab failed the solve, but the entropy was in the correct range. Therefore an unknown error has occured.")
                if (tmpG_DL == -88).any():
                    raise Exception("ERROR: (DL) Matlab failed the solve, but the entropy was in the correct range. Therefore an unknown error has occured.")

                # Known solve fails are cases where the estimated entropy is
                # too high, i.e. the LZ entropy rate estimator failed to
                # converge. This cannot be corrected without more data from
                # the individual, so the individual is discarded - from BOTH
                # methods, to keep the two result sets comparable.
                tmpG_known_fail_mask = (tmpG_RL < -1) | (tmpG_DL < -1)
                failed_ct = int(np.count_nonzero(tmpG_known_fail_mask))
                for p in np.asarray(person_ids)[tmpG_known_fail_mask]:
                    failed_ids.add(p)

                # Filter out known solve fails.
                tmpG_RL = tmpG_RL[~tmpG_known_fail_mask]
                tmpG_DL = tmpG_DL[~tmpG_known_fail_mask]
                if len(tmpG_RL) != len(tmpG_DL):
                    raise Exception("SHOULD NOT OCCUR 5g4dfg65")
                if (tmpG_RL < 0).any():
                    raise Exception("ERROR. lsdkfal")

                LoP_RL[-1].append(np.average(tmpG_RL))
                LoP_DL[-1].append(np.average(tmpG_DL))
                LoP_failed_ct[-1].append(failed_ct)
    finally:
        mlab.closePool()

    save_results(file_name, LoP_RL, 'RL')
    save_results(file_name, LoP_DL, 'DL')

    print('failed_ids = {}.'.format(failed_ids))
    # open() replaces the Python-2-only file() builtin; the with block
    # guarantees the handle is closed even if savetxt raises.
    with open(file_name + "_failed_ct.csv", 'w') as f2:
        np.savetxt(f2, LoP_failed_ct, fmt="%.5f")

    print("Done in {} seconds".format(time.time() - t))
def run(group="All", scale=None,
        output_dir='./ResultsLoP_replication/final_graphs',
        bulk_build_preprocessing=False):
    """
    Generates a single heatmap for a given list of Geolife ids, for a given
    method of computing the upper bound on the upper limit of predictability.

    Results (one row per spatial resolution, one column per temporal
    resolution) are written as csv files under ``output_dir``.

    :param group: "All", or ["id_str",[list of ids in the geolife dataset]]
    :type group: Nested list
    :param scale: [min_z, max_z, step] Set the scale of the heatmap z.
        NOTE(review): never read in this function - confirm intended use.
    :type scale: Float array
    """
    t = time.time()
    #Group setting: the whole dataset, or a named subset of person ids.
    if (group == "All"):
        suffix = "All"
        persons = "All"
    else:
        suffix = "Grp{}".format(group[0])
        persons = group[1]
    # Normalise the output directory and make sure it exists.
    if not output_dir[-1] == '/':
        output_dir = output_dir + '/'
    file_name = "{}Heatmap_{}".format(output_dir, suffix)
    ensure_dir(file_name)
    print "Calculing the LoP for {}".format(suffix)
    if bulk_build_preprocessing:
        # will attempt to bulk build the cache using multiple CPU cores
        # will skip caches if already built.
        # if this option is not specified and a cache does not exist
        # it will be built when required, using a single CPU core.
        GeolifeSymbolisation.bulk_build_resolution_cache(
            listSpatialRes, listTemporalRes)
    mlab.openPool()
    failed_ids = set()
    LoP_RL = []
    LoP_DL = []
    LoP_failed_ct = []
    passed_norm_test = []  # NOTE(review): appended to below but never read.
    for spatialRes in listSpatialRes:
        # One new row per spatial resolution; columns are temporal resolutions.
        LoP_RL.append([])
        LoP_DL.append([])
        LoP_failed_ct.append([])
        passed_norm_test.append([])
        for temporalRes in listTemporalRes:
            #Compute data
            #---------------------------------------------
            #Load data from an existing preproc database, this will have been created
            # earlier if it did not exist.
            data, person_ids = get_geolife_data(spatialRes, temporalRes,
                                                persons)
            #---------------------------------------------
            # Sanity check on loading
            for person in data:
                if len(person) == 0:
                    raise Exception(
                        "One or more person's trajectory was not loaded/created correctly."
                    )
            # End sanity check
            S_RL, N_RL = empiricalEntropyRate(data, 'RL')
            S_DL, N_DL = empiricalEntropyRate(data, 'DL')
            #Save the average:
            tmpG_RL = list(mlab.ParLoP(S_RL, N_RL)[0])
            tmpG_DL = list(mlab.ParLoP(S_DL, N_DL)[0])
            #-88 real fail in solve
            #-99 known fail in solve when S > log2(N)
            # See the Matlab script (ParLoP.m) for more details
            if (np.asarray(tmpG_RL) == -88).any():
                raise Exception(
                    "ERROR: (RL) Matlab failed the solve, but the entropy was in the correct range. Therefore an unknown error has occured."
                )
            if (np.asarray(tmpG_DL) == -88).any():
                raise Exception(
                    "ERROR: (DL) Matlab failed the solve, but the entropy was in the correct range. Therefore an unknown error has occured."
                )
            # Replace known solve fails. These are the cases when an entropy is found that is to high.
            # This means the LZ entropy rate estimate is wrong (the estimator has failed to converge)
            # There is no way to correct this, without collecting more data from the individual.
            # While excluding the individual is not ideal it is better than including a value that is
            # *known* to be erroneous. Therefore we discard the individual.
            tmpG_RL = np.asarray(tmpG_RL)
            tmpG_DL = np.asarray(tmpG_DL)
            tmpG_RL_known_fail_mask = tmpG_RL < -1
            tmpG_DL_known_fail_mask = tmpG_DL < -1
            # To be comparable we must arrive at a consistent set of individuals from which to compare both
            # methods. (An individual failing under either method is dropped
            # from both.)
            tmpG_known_fail_mask = np.asarray(
                tmpG_RL_known_fail_mask) | np.asarray(tmpG_DL_known_fail_mask)
            failed_ct = len(tmpG_RL[tmpG_known_fail_mask])
            for p in np.asarray(person_ids)[tmpG_known_fail_mask]:
                failed_ids.add(p)
            # Filter out known solve fails.
            tmpG_RL = list(np.asarray(tmpG_RL)[~tmpG_known_fail_mask])
            tmpG_DL = list(np.asarray(tmpG_DL)[~tmpG_known_fail_mask])
            if not len(tmpG_RL) == len(tmpG_DL):
                raise Exception("SHOULD NOT OCCUR 5g4dfg65")
            if (np.asarray(tmpG_RL) < 0).any():
                raise Exception("ERROR. lsdkfal")
            LoP_RL[-1].append(np.average(tmpG_RL))
            LoP_DL[-1].append(np.average(tmpG_DL))
            LoP_failed_ct[-1].append(failed_ct)
    mlab.closePool()
    # Persist the per-resolution averages and the discard counts.
    save_results(file_name, LoP_RL, 'RL')
    save_results(file_name, LoP_DL, 'DL')
    f2 = file(file_name + "_failed_ct.csv", 'w')
    print 'failed_ids = {}.'.format(failed_ids)
    np.savetxt(f2, LoP_failed_ct, fmt="%.5f")
    f2.close()
    print "Done in {} seconds".format(time.time() - t)