def traj_obs(print_num=1000):
    """ Returns the trajectory observations.

    If the obs have never been computed before, also stores them in a file.
    Otherwise reads the cached copy from the disk.
    """
    fname = "%s/traj_obs.pkl" % experiment_directory(experiment_name)
    fname_test = "%s/traj_obs_test.pkl" % experiment_directory(experiment_name)
    if not os.path.exists(fname):
        tic("traj_obs: Saving trajectory obs in %s" % fname, experiment_name)
        if num_jobs == 1:
            seq = (traj_ob for date in dates
                   for traj_ob in getDayTrajs(data_source['feed'], basic_geometry['nid'], date,
                                              basic_geometry['net_type'], basic_geometry['box'],
                                              experiment_design['trajectory_conversion'],
                                              traj_conv(), net))
        else:
            from joblib import Parallel, delayed
            tic("Using concurrent job code with {0} jobs".format(num_jobs), "learn_procedure")
            ls = Parallel(n_jobs=num_jobs)(delayed(wrapper)(data_source['feed'], basic_geometry['nid'], date,
                                                            basic_geometry['net_type'], basic_geometry['box'],
                                                            experiment_design['trajectory_conversion'],
                                                            traj_conv(), net)
                                           for date in dates)
            seq = [traj_ob for l in ls for traj_ob in l]
        # seq = (traj_ob for tspots_seq in tspots_seqs()
        #        for traj_ob in traj_conv().mapTrajectory(tspots_seq))
        kfold_cross_validation = data_source['kfold_cross_validation']
        test_k = data_source['test_k']
        assert kfold_cross_validation == 0 or test_k < kfold_cross_validation
        f = open(fname, 'w')
        if kfold_cross_validation > 0:
            tic("traj_obs: Saving test trajectory obs in %s" % fname_test, experiment_name)
            f_test = open(fname_test, 'w')
        idx = 0
        for traj_ob in seq:
            idx += 1
            if print_num > 0 and idx % print_num == 0:
                tic("traj_obs: Converted so far {0} observations".format(idx), experiment_name)
            if kfold_cross_validation > 0 and idx % kfold_cross_validation == test_k:
                # Every k-th observation is held out to the test file and is
                # not yielded to the training consumer.
                s_dump_elt(traj_ob, f_test)
            else:
                s_dump_elt(traj_ob, f)
                yield traj_ob
    else:
        tic("traj_obs: opening trajectory obs in %s" % fname, experiment_name)
        f = open(fname, 'r')
        for traj_ob in s_load(f):
            yield traj_ob
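# traj_obs streams records through s_dump_elt / s_load instead of pickling one
# big list, so the cache can be larger than memory. The helpers are not shown
# in this source; below is a minimal sketch of the assumed one-record-at-a-time
# behavior (inferred, not the project's actual implementation).
import cPickle as pickle

def s_dump_elt(elt, f):
    # Append a single pickled record to the open file handle.
    pickle.dump(elt, f, pickle.HIGHEST_PROTOCOL)

def s_load(f):
    # Yield records one at a time until the end of the file.
    while True:
        try:
            yield pickle.load(f)
        except EOFError:
            return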
def test_traj_obs(experiment_name, print_counter=1000):
    fname_test = "%s/traj_obs_test.pkl" % experiment_directory(experiment_name)
    tic("test_traj_obs: opening test trajectory obs in %s" % fname_test, experiment_name)
    f = open(fname_test, 'r')
    c = 0
    for traj_ob in s_load(f):
        c += 1
        if print_counter > 0 and c % print_counter == 0:
            tic("test_traj_obs: Consumed so far {0} observations".format(c), experiment_name)
        yield traj_ob
def gmrf():
    global gmrf_
    if not gmrf_:
        if not os.path.exists(gmrf_fname):
            tic("creating empty gmrf", experiment_name)
            gmrf_ = emptyValues(tt_graph())
        else:
            tic("reading gmrf from %s" % gmrf_fname, experiment_name)
            gmrf_ = pickle.load(open(gmrf_fname, 'r'))
    return gmrf_
def mode_counts():
    global mode_counts_
    if not mode_counts_:
        tic("Loading trajectory conversion...")
        fname = "%s/mode_count.pkl" % experiment_directory(experiment_name)
        if not os.path.exists(fname):
            pickle.dump(traj_conv().modeCounts(), open(fname, 'w'))
        mode_counts_ = pickle.load(open(fname, 'r'))
        tic("Done loading trajectory conversion and mode counts")
    return mode_counts_
def tt_graph():
    global tt_graph_
    if not tt_graph_:
        if not os.path.exists(tt_graph_fname):
            tic("creating empty tt graph", experiment_name)
            tt_graph_ = createTravelTimeGraph(hmm_graph(), radius=2e-4)
            tt_graph_.checkInvariants()
            save_ttg_structure(tt_graph_, experiment_name=experiment_name)
        else:
            tic("reading tt graph from %s" % tt_graph_fname, experiment_name)
            tt_graph_ = pickle.load(open(tt_graph_fname, 'r'))
    return tt_graph_
def covsel_cvx_cholmod(R, U, rows, cols, k, psd_tolerance=1e-6, factor=None,
                       num_iterations=500, finish_early=True, debug=True):
    if debug:
        tic("smallest ev", "covsel_cvx_cholmod")
    min_ei = smallest_ev_arpack(R, U, rows, cols)
    if debug:
        tic("min_ei is %f" % min_ei, "covsel_cvx_cholmod")
    if min_ei < 0:
        # Shift the diagonal so that the input matrix becomes positive definite.
        R0 = R - min_ei + 1e-3
    else:
        R0 = R
    return run_cvx_cholmod(R0, U, rows, cols, k, psd_tolerance, factor,
                           num_iterations, finish_early)
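# A minimal dense illustration of the diagonal shift above, assuming (as the
# rest of the code suggests) that R holds the diagonal and U the off-diagonal
# entries of a symmetric matrix. Shifting every diagonal entry by -min_ei
# moves the smallest eigenvalue to zero; the extra 1e-3 makes it strictly
# positive. Values here are made up for illustration.
import numpy as np

R = np.array([1.0, 1.0])              # diagonal entries
U = np.array([-1.5])                  # off-diagonal entry (0, 1)
A = np.array([[R[0], U[0]],
              [U[0], R[1]]])
min_ei = np.linalg.eigvalsh(A).min()  # -0.5: A is not positive semidefinite
R0 = R - min_ei + 1e-3                # shift every diagonal entry
A0 = np.array([[R0[0], U[0]],
               [U[0], R0[1]]])
assert np.linalg.eigvalsh(A0).min() > 0   # now positive definite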
def hmm_graph():
    global hmm_graph_
    if hmm_graph_ is None:
        if not os.path.exists(hmm_graph_fname):
            if graph_type == 'simple':
                hmm_graph_ = model.createHMMGraphFromNetwork(net, mode_counts=mode_counts())
            else:
                # Complex model not implemented
                assert False
        else:
            tic("Reading completed hmm graph from %s" % hmm_graph_fname)
            hmm_graph_ = pickle.load(open(hmm_graph_fname, 'r'))
    return hmm_graph_
def validation_procedure(experiment_design, experiment_design_indep,
                         experiment_design_one_mode, experiment_design_one_mode_indep,
                         validate_on_network=True):
    # Get the validation data
    experiment_name = experiment_design['name']
    # We will load the test data from this experiment
    experiment_name_one_mode = experiment_design_one_mode['name']
    # Get the network
    basic_geometry = experiment_design['basic_geometry']
    # Nearly everything will need a network.
    net = get_network(**basic_geometry)
    # Reload the HMM and the GMRF estimator from the files.
    # All we need for testing is an experiment_design.
    test_hmm = read_hmm_pickle(experiment_name)
    # Read the estimator
    gmrf_estimation = experiment_design['gmrf_estimation']
    test_gmrf_estimator = get_gmrf_estimator(experiment_name, gmrf_estimation['process'])
    # Baseline: independent one-mode Gaussian
    test_hmm_one_mode = read_hmm_pickle('{0}_one_mode'.format(experiment_name))
    test_gmrf_one_mode_indep = get_gmrf_estimator('{0}_one_mode_indep'.format(experiment_name), 'diagonal')
    # Baseline: one-mode Gaussian
    test_gmrf_one_mode = get_gmrf_estimator('{0}_one_mode'.format(experiment_name), gmrf_estimation['process'])
    # Baseline: independent multi-modal Gaussian
    test_gmrf_indep = get_gmrf_estimator('{0}_indep'.format(experiment_name), 'diagonal')
    tic('Validation')
    test_traj_obs_all = list(test_traj_obs(experiment_name))
    test_traj_obs_one_mode_all = list(test_traj_obs(experiment_name_one_mode))
    tic("Validation set: {0} trajectories".format(len(test_traj_obs_all)))
    model = [(test_traj_obs_all, test_gmrf_estimator, test_hmm, 'MM-GMRF')]
    baseline1 = [(test_traj_obs_one_mode_all, test_gmrf_one_mode_indep, test_hmm_one_mode, 'one mode indep')]
    baseline2 = [(test_traj_obs_one_mode_all, test_gmrf_one_mode, test_hmm_one_mode, 'one mode')]
    baseline3 = [(test_traj_obs_all, test_gmrf_indep, test_hmm, 'multi-modal indep')]
    val_model = model + baseline1 + baseline2 + baseline3
    tic('path validation')
    validate.validate_on_paths(val_model, net,
                               estimation_sampling_process=experiment_design['estimation_sampling']['process'],
                               estimation_sampling_parameters=experiment_design['estimation_sampling']['parameters'],
                               **experiment_design['evaluation'])
    if validate_on_network:
        tic('network validation')
        validate.validate_on_network(val_model, net,
                                     estimation_sampling_process=experiment_design['estimation_sampling']['process'],
                                     estimation_sampling_parameters=experiment_design['estimation_sampling']['parameters'],
                                     **experiment_design['evaluation'])
    tic("Evaluation finished")
def getDayTSpots(date, network):
    """ Returns a list of sequences of TSpot objects for this day. """
    all_traj_fns = list_traj_filenames(date)
    tspots_groups = []
    for fname in all_traj_fns:
        # pylint:disable=W0142
        try:
            tspots = read_trajectory(fname)
        except IOError:
            tic("Could not read trajectory: {0}".format(fname), "getDayTSpots")
            tspots = []
        # tspots is a list of TSpot.
        # Make sure we only keep data inside our sub-network.
        for net_tspots in filterOutsideNetwork(tspots, network):
            tspots_groups.append(net_tspots)
    return tspots_groups
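# filterOutsideNetwork is relied on by several readers but is not shown in
# this source. Below is a hypothetical sketch of the assumed behavior --
# splitting a trajectory into maximal contiguous runs of points whose link
# lies inside the network -- assuming the network supports membership tests
# on link ids. The semantics are inferred, not confirmed by the source.
def filterOutsideNetwork(tspots, network):
    group = []
    for tsp in tspots:
        if tsp.spot.linkId in network:
            group.append(tsp)
        elif group:
            # A point outside the network ends the current run.
            yield group
            group = []
    if group:
        yield group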
def fillTrajectoryCache(graph_type, basic_geometry, data_source, traj_conv_description, n_jobs=1):
    net = get_network(**basic_geometry)
    tic("Loaded network = {0} links".format(len(net)), "fillTrajectoryCache")
    traj_conv = createTrajectoryConversion(graph_type=graph_type,
                                           process=traj_conv_description['process'],
                                           params=traj_conv_description['params'],
                                           network=net,
                                           max_nb_mixture=traj_conv_description['params']['max_n_modes'],
                                           n_jobs=n_jobs)
    dates = data_source['dates']
    from joblib import Parallel, delayed
    Parallel(n_jobs=n_jobs)(delayed(wrapper)(data_source['feed'], basic_geometry['nid'], date,
                                             basic_geometry['net_type'], basic_geometry['box'],
                                             traj_conv_description, traj_conv, net)
                            for date in dates)
def gmrf_learn_cov_cholmod(R, U, rows, cols, edge_count, k, min_variance=1e-2,
                           min_edge_count=10, num_iterations=50,
                           psd_tolerance=1e-3, finish_early=True):
    n = len(R)
    m = len(U)
    mask = edge_count >= min_edge_count
    active_m = np.sum(mask)
    tic("m={0}, active m={1}".format(m, active_m), "gmrf_learn_cov_cholmod")
    active_U = U[mask]
    active_rows = rows[mask]
    active_cols = cols[mask]
    # A number of variables are independent (due to lack of observations).
    independent_mask = independent_variables(n, active_rows, active_cols)
    # Put them aside and use the independent strategy to solve them.
    indep_idxs = np.arange(n)[independent_mask]
    R_indep = R[indep_idxs]
    # Solve the regularized version for the independent variables.
    D_indep = 1.0 / np.maximum(min_variance * np.ones_like(R_indep), R_indep)
    # Put the dependent and independent parts together.
    D = np.zeros_like(R)
    D[independent_mask] = D_indep
    P = np.zeros_like(U)
    # No need to solve for the off-diagonal terms: they are all zeros.
    # Solve for the dependent terms.
    dependent_mask = ~independent_mask
    n_dep = np.sum(dependent_mask)
    if n_dep > 0:
        idxs_dep = np.arange(n)[dependent_mask]
        reverse_idxs_dep = np.zeros(n, dtype=np.int64)
        reverse_idxs_dep[dependent_mask] = np.arange(n_dep)
        rows_dep = reverse_idxs_dep[active_rows]
        cols_dep = reverse_idxs_dep[active_cols]
        R_dep = R[idxs_dep]
        U_dep = active_U
        (M, R_hat, U_hat) = normalized_problem(R_dep, U_dep, rows_dep, cols_dep)
        tic("Computing symbolic cholesky factorization of the graph...", "gmrf_learn_cov_cholmod")
        # Delayed import so that the rest of the code runs without scikits.sparse.
        from scikits.sparse.cholmod import analyze
        Xs_dep = build_sparse(np.ones_like(R_hat), np.ones_like(U_hat), rows_dep, cols_dep)
        factor = analyze(Xs_dep)
        tic("Cholesky done", "gmrf_learn_cov_cholmod")
        # TODO add the other parameters
        (D_norm_dep, P_norm_dep) = covsel_cvx_cholmod(R_hat, U_hat, rows_dep, cols_dep, k,
                                                      psd_tolerance, factor,
                                                      num_iterations, finish_early)
        # Undo the normalization: D scales with 1/M^2, P with 1/(M_i * M_j).
        D[dependent_mask] = D_norm_dep / (M ** 2)
        P[mask] = P_norm_dep / (M[rows_dep] * M[cols_dep])
    return (D, P)
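# Both learners above lean on independent_variables to separate the variables
# that touch no active edge from the rest. The helper is not shown in this
# source; here is a minimal sketch of the assumed behavior.
import numpy as np

def independent_variables(n, active_rows, active_cols):
    # Mark the variables that appear in no active edge; they can then be
    # solved with the diagonal (independent) strategy.
    dependent = np.zeros(n, dtype=bool)
    dependent[active_rows] = True
    dependent[active_cols] = True
    return ~dependent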
def getDayTTOBservations(date, network, print_stats=1000):
    """ Returns a list of sequences of travel time observations for this day. """
    all_trajs = list_traj_filenames(date)
    all_groups = []
    idx = 0
    for traj_index in all_trajs:
        idx += 1
        if print_stats > 0 and idx % print_stats == 0:
            tic("processed {0} trajectories".format(idx), "getDayTTOBservations")
        # pylint:disable=W0142
        try:
            tspots = read_trajectory(traj_index)
        except IOError:
            tic("ioerror when loading a trajectory {0}".format(traj_index), "getDayTTOBservations")
            tspots = []
        # Make sure we only have data for our sub-network.
        for net_tspots in filterOutsideNetwork(tspots, network):
            groups = seqGroupBy(net_tspots, keyf=lambda tsp: tsp.spot.linkId)
            for g in completeGroups(groups, network):
                all_groups.append(g)
    return all_groups
def getMixtures(dates, network, max_n_links=None, return_tts=False, max_nb_mixture=4, num_threads=1):
    tic("Running with {0} jobs.".format(num_threads), "getMixtures")
    ttob_seqs = (ttob_seq for date in dates
                 for ttob_seq in getDayTTOBservations(date, network))
    # Get travel times for each link
    all_ttobs = (ttob for ttob_seq in ttob_seqs for ttob in ttob_seq)
    tic("starting groupby...", "getMixtures")
    # itertools.groupby only groups consecutive elements, so sort by link
    # id first; the resulting groups are then ordered by decreasing size.
    all_ttobs_by_lid = sorted([(lid, list(vals))
                               for (lid, vals) in groupby(sorted(all_ttobs, key=lambda ttob: ttob.linkId),
                                                          lambda ttob: ttob.linkId)],
                              key=lambda z: -len(z[1]))
    tic("groupby done, {0} links".format(len(all_ttobs_by_lid)), "getMixtures")
    if max_n_links:
        all_ttobs_by_lid = all_ttobs_by_lid[:max_n_links]
    tts_by_link = [(lid, np.array([tto.tt for tto in vals]))
                   for (lid, vals) in all_ttobs_by_lid]
    tic("vectorization done", "getMixtures")
    if num_threads != 1:
        tic("Running with {0} jobs.".format(num_threads), "getMixtures")
        learned_mixtures = Parallel(n_jobs=num_threads, verbose=10)(
            delayed(getMixtures_inner)((lid, tts, max_nb_mixture))
            for (lid, tts) in tts_by_link)
    else:
        learned_mixtures = [(lid, learnMixtureAuto(tts, max_nb_mixture))
                            for (lid, tts) in tts_by_link]
    if return_tts:
        return (dict(learned_mixtures), dict(tts_by_link))
    else:
        return dict(learned_mixtures)
def gmrf_learn_cov_cvx(R, U, rows, cols, edge_count, min_variance=1e-2,
                       min_edge_count=10, num_iterations=50):
    n = len(R)
    m = len(U)
    mask = edge_count >= min_edge_count
    active_m = np.sum(mask)
    tic("m={0}, active m={1}".format(m, active_m), "gmrf_learn_cov_cvx")
    active_U = U[mask]
    active_rows = rows[mask]
    active_cols = cols[mask]
    # A number of variables are independent (due to lack of observations).
    independent_mask = independent_variables(n, active_rows, active_cols)
    # Put them aside and use the independent strategy to solve them.
    indep_idxs = np.arange(n)[independent_mask]
    R_indep = R[indep_idxs]
    # Solve the regularized version for the independent variables.
    D_indep = 1.0 / np.maximum(min_variance * np.ones_like(R_indep), R_indep)
    # Put the dependent and independent parts together.
    D = np.zeros_like(R)
    D[independent_mask] = D_indep
    P = np.zeros_like(U)
    # No need to solve for the off-diagonal terms: they are all zeros.
    # Solve for the dependent terms.
    dependent_mask = ~independent_mask
    n_dep = np.sum(dependent_mask)
    if n_dep > 0:
        idxs_dep = np.arange(n)[dependent_mask]
        reverse_idxs_dep = np.zeros(n, dtype=np.int64)
        reverse_idxs_dep[dependent_mask] = np.arange(n_dep)
        rows_dep = reverse_idxs_dep[active_rows]
        cols_dep = reverse_idxs_dep[active_cols]
        R_dep = R[idxs_dep]
        U_dep = active_U
        (M, R_hat, U_hat) = normalized_problem(R_dep, U_dep, rows_dep, cols_dep)
        (D_norm_dep, P_norm_dep) = covsel_cvx_dense(R_hat, U_hat, rows_dep, cols_dep,
                                                    num_iterations=num_iterations)
        # Undo the normalization: D scales with 1/M^2, P with 1/(M_i * M_j).
        D[dependent_mask] = D_norm_dep / (M ** 2)
        P[mask] = P_norm_dep / (M[rows_dep] * M[cols_dep])
    return (D, P)
def run_cvx_cholmod(R_hat, U_hat, rows, cols, k, psd_tolerance=1e-6, factor=None,
                    num_iterations=500, finish_early=True, debug=True):
    D = np.ones_like(R_hat)
    P = np.zeros_like(U_hat)
    for iters in range(num_iterations):
        tic("Iter={0}".format(iters), "run_cvx_cholmod")
        # Debug
        # f1 = obj_dense(R_hat, U_hat, rows, cols, D, P)
        # f2 = obj_cholmod(R_hat, U_hat, rows, cols, D, P, psd_tolerance, factor)
        # delta = f1 - f2
        # print "True objective value:", f1
        # print "This objective value", f2
        # print "Difference", delta
        # End debug
        z = iter_cholmod(R_hat, U_hat, rows, cols, D, P, k, psd_tolerance, factor)
        if finish_early and z is None:
            tic("done early", "run_cvx_cholmod")
            return (D, P)
        if z is not None:
            (D2, P2, fn, lsiter) = z
            D = D2
            P = P2
    return (D, P)
def iter_cholmod(R_hat, U_hat, rows, cols, D, P, k, psd_tolerance=1e-6,
                 factor=None, num_lsearch_iter=10):
    tic("computing gradient", "iter_cholmod")
    (g_D, g_P) = grad_cholmod(R_hat, U_hat, rows, cols, D, P, k, factor)
    # Descent direction: steepest descent.
    v_D = -g_D
    v_P = -g_P
    # Debug
    # (g_D_, g_P_) = grad_dense(R_hat, U_hat, rows, cols, D, P)
    # print 'g_D diff:', la.norm(g_D - g_D_)
    # print 'g_P diff:', la.norm(g_P - g_P_)
    # End debug
    # Stopping criterion:
    sqntdecr = -v_D.dot(g_D) - v_P.dot(g_P)
    tic("Newton decrement squared:%- 7.5e" % sqntdecr, "iter_cholmod")
    if sqntdecr < 1e-8:
        return None
    # Backtracking line search
    dD = v_D
    dP = v_P
    s = 1.0
    f = obj_cholmod(R_hat, U_hat, rows, cols, D, P, psd_tolerance, factor)
    tic("Current objective value: {0}".format(f), "iter_cholmod")
    for lsiter in range(num_lsearch_iter):
        curr_D = D + s * dD
        curr_P = P + s * dP
        fn = obj_cholmod(R_hat, U_hat, rows, cols, curr_D, curr_P, psd_tolerance, factor)
        tic("lsiter={0} fn={1}".format(lsiter, fn), "iter_cholmod")
        if fn == -np.Infinity:
            # The candidate is outside the feasible region: shrink the step.
            s *= 0.5
        else:
            if fn < f - 0.01 * s * sqntdecr:
                # Sufficient decrease: accept the step.
                tic("Update lsiter={0}".format(lsiter), "iter_cholmod")
                return (curr_D, curr_P, fn, lsiter)
            s *= 0.5
    tic("Too many iterations", "iter_cholmod")
    return None
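# The backtracking loop in iter_cholmod is a standard sufficient-decrease
# (Armijo) test; restating the code above in math terms, with the step s
# halved on every failure:
#
#   f(D + s*dD, P + s*dP)  <  f(D, P) - 0.01 * s * lambda^2,
#   lambda^2 = -(v_D . g_D) - (v_P . g_P)    (the squared decrement)
#
# so a step is accepted only when it reduces the objective by at least a
# fixed fraction (0.01) of the first-order predicted decrease.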
def wrapper(*args, **kwargs):
    tic("Wrapper called")
    res = getDayTrajs(*args, **kwargs)
    tic("Cached {0} trajs".format(len(res)))
    return res
def learning_perf(experiment_design):
    """ Starts the main procedure.

    This script reuses as much as it can from the disk to avoid expensive
    recomputations.
    """
    experiment_name = experiment_design['name']
    # Get the network
    basic_geometry = experiment_design['basic_geometry']
    # Nearly everything will need a network.
    net = get_network(**basic_geometry)
    tic("Loaded network = {0} links".format(len(net)), experiment_name)
    graph_type = experiment_design['graph_type']
    traj_conv_param = experiment_design['trajectory_conversion']['params']

    # Trajectory conversion
    # Needed early because it gives the number of modes.
    global traj_conv_
    traj_conv_ = None

    def traj_conv():
        global traj_conv_
        if not traj_conv_:
            traj_conv_ = createTrajectoryConversion(graph_type=graph_type,
                                                    process=experiment_design['trajectory_conversion']['process'],
                                                    params=traj_conv_param,
                                                    network=net,
                                                    max_nb_mixture=traj_conv_param['max_n_modes'])
        return traj_conv_

    # Number of modes
    # Also stored on disk as pickle
    global mode_counts_
    mode_counts_ = None

    def mode_counts():
        global mode_counts_
        if not mode_counts_:
            tic("Loading trajectory conversion...")
            fname = "%s/mode_count.pkl" % experiment_directory(experiment_name)
            if not os.path.exists(fname):
                pickle.dump(traj_conv().modeCounts(), open(fname, 'w'))
            mode_counts_ = pickle.load(open(fname, 'r'))
            tic("Done loading trajectory conversion and mode counts")
        return mode_counts_

    # The HMM graph
    global hmm_graph_
    hmm_graph_ = None
    hmm_graph_fname = "%s/hmm_graph.pkl" % experiment_directory(experiment_name)

    def hmm_graph():
        global hmm_graph_
        if hmm_graph_ is None:
            if not os.path.exists(hmm_graph_fname):
                if graph_type == 'simple':
                    tic("creating empty hmm graph", experiment_name)
                    hmm_graph_ = model.createHMMGraphFromNetwork(net, mode_counts=mode_counts())
                    tic("done creating empty hmm graph", experiment_name)
                    tic("saving hmm graph pickle", experiment_name)
                    pickle.dump(hmm_graph_, open(hmm_graph_fname, 'w'))
                    tic("done saving hmm graph pickle", experiment_name)
                else:
                    # Complex model not implemented
                    assert False
            else:
                tic("Reading completed hmm graph from %s" % hmm_graph_fname)
                hmm_graph_ = pickle.load(open(hmm_graph_fname, 'r'))
                tic("done reading completed hmm graph from %s" % hmm_graph_fname)
        return hmm_graph_

    # The TT graph
    # Also stored on disk as pickle by save_ttg_values (when it is filled).
    global tt_graph_
    tt_graph_ = None
    tt_graph_fname = "%s/tt_graph.pkl" % experiment_directory(experiment_name)

    def tt_graph():
        global tt_graph_
        if not tt_graph_:
            if not os.path.exists(tt_graph_fname):
                tic("creating empty tt graph", experiment_name)
                tt_graph_ = createTravelTimeGraph(hmm_graph(), radius=2e-4)
                tt_graph_.checkInvariants()
                save_ttg_structure(tt_graph_, experiment_name=experiment_name)
            else:
                tic("reading tt graph from %s" % tt_graph_fname, experiment_name)
                tt_graph_ = pickle.load(open(tt_graph_fname, 'r'))
        return tt_graph_

    # The GMRF
    # Also stored on disk as pickle by save_gmrf_values (when it is filled).
    global gmrf_
    gmrf_ = None
    gmrf_fname = "%s/gmrf.pkl" % experiment_directory(experiment_name)

    def gmrf():
        global gmrf_
        if not gmrf_:
            if not os.path.exists(gmrf_fname):
                tic("creating empty gmrf", experiment_name)
                gmrf_ = emptyValues(tt_graph())
                tic("created empty gmrf", experiment_name)
            else:
                tic("reading gmrf from %s" % gmrf_fname, experiment_name)
                gmrf_ = pickle.load(open(gmrf_fname, 'r'))
                tic("done reading gmrf from %s" % gmrf_fname, experiment_name)
        return gmrf_

    tic("TT graph building", experiment_name)
    tic("Loaded TT graph = {0} edges, {1} variables".format(tt_graph().n, tt_graph().m), experiment_name)
    tic("simulating sstat building", experiment_name)
    gmrf_learning = experiment_design['gmrf_learning']
    # Fill the graph with synthetic sufficient statistics to benchmark learning.
    for var in tt_graph().allVariables():
        var_id = var.varID
        var.parameters = GaussianParameters(0.0, 1.0)
        tt_graph().variable_counts[var_id] = 1
    for key in tt_graph().edges.keys():
        tt_graph().edges[key] = -.1
        tt_graph().edge_counts[key] = 1
    tic("done simulating sstat building", experiment_name)
    if not os.path.exists(tt_graph_fname):
        tic("saving tt graph pickle", experiment_name)
        pickle.dump(tt_graph(), open(tt_graph_fname, 'w'))
        tic("done saving tt graph pickle", experiment_name)
    tic("GMRF learning", experiment_name)
    gmrf_learning_params = gmrf_learning['parameters']
    gmrf_ = gmrf_learn(tt_graph(), gmrf_learning['process'], experiment_name, gmrf_learning_params)
    tic("Done GMRF learning", experiment_name)
    tic("saving gmrf pickle", experiment_name)
    pickle.dump(gmrf_, open(gmrf_fname, 'w'))
    save_gmrf_values(gmrf_, experiment_name=experiment_name)
    tic("done saving gmrf pickle", experiment_name)
    tic("GMRF estimation", experiment_name)
    gmrf_estimation = experiment_design['gmrf_estimation']
    gmrf_estimation_parameters = gmrf_estimation['parameters']
    # Saves all the GMRF estimators in the different formats.
    # They will be reloaded when we do the estimation.
    gmrf_est(gmrf(), gmrf_estimation['process'], experiment_name, gmrf_estimation_parameters)
    tic("End of learning", experiment_name)
def learn_procedure(experiment_design, num_jobs=1):
    experiment_name = experiment_design['name']
    # Get the network
    basic_geometry = experiment_design['basic_geometry']
    # Nearly everything will need a network.
    net = get_network(**basic_geometry)
    tic("Loaded network = {0} links".format(len(net)), experiment_name)
    graph_type = experiment_design['graph_type']
    traj_conv_param = experiment_design['trajectory_conversion']['params']

    # Trajectory conversion
    # Needed early because it gives the number of modes.
    global traj_conv_
    traj_conv_ = None

    def traj_conv():
        global traj_conv_
        if not traj_conv_:
            traj_conv_ = createTrajectoryConversion(graph_type=graph_type,
                                                    process=experiment_design['trajectory_conversion']['process'],
                                                    params=traj_conv_param,
                                                    network=net,
                                                    max_nb_mixture=traj_conv_param['max_n_modes'],
                                                    n_jobs=num_jobs)
        return traj_conv_

    # Number of modes
    # Also stored on disk as pickle
    global mode_counts_
    mode_counts_ = None

    def mode_counts():
        global mode_counts_
        if not mode_counts_:
            tic("Loading trajectory conversion...")
            fname = "%s/mode_count.pkl" % experiment_directory(experiment_name)
            if not os.path.exists(fname):
                pickle.dump(traj_conv().modeCounts(), open(fname, 'w'))
            mode_counts_ = pickle.load(open(fname, 'r'))
            tic("Done loading trajectory conversion and mode counts")
        return mode_counts_

    # The HMM graph
    global hmm_graph_
    hmm_graph_ = None
    hmm_graph_fname = "%s/hmm_graph.pkl" % experiment_directory(experiment_name)

    def hmm_graph():
        global hmm_graph_
        if hmm_graph_ is None:
            if not os.path.exists(hmm_graph_fname):
                if graph_type == 'simple':
                    hmm_graph_ = model.createHMMGraphFromNetwork(net, mode_counts=mode_counts())
                else:
                    # Complex model not implemented
                    assert False
            else:
                tic("Reading completed hmm graph from %s" % hmm_graph_fname)
                hmm_graph_ = pickle.load(open(hmm_graph_fname, 'r'))
        return hmm_graph_

    # The TT graph
    # Also stored on disk as pickle by save_ttg_values (when it is filled).
    global tt_graph_
    tt_graph_ = None
    tt_graph_fname = "%s/tt_graph.pkl" % experiment_directory(experiment_name)

    def tt_graph():
        global tt_graph_
        if not tt_graph_:
            if not os.path.exists(tt_graph_fname):
                tic("creating empty tt graph", experiment_name)
                tt_graph_ = createTravelTimeGraph(hmm_graph(), radius=2e-4)
                tt_graph_.checkInvariants()
                save_ttg_structure(tt_graph_, experiment_name=experiment_name)
            else:
                tic("reading tt graph from %s" % tt_graph_fname, experiment_name)
                tt_graph_ = pickle.load(open(tt_graph_fname, 'r'))
        return tt_graph_

    # The GMRF
    # Also stored on disk as pickle by save_gmrf_values (when it is filled).
    global gmrf_
    gmrf_ = None
    gmrf_fname = "%s/gmrf.pkl" % experiment_directory(experiment_name)

    def gmrf():
        global gmrf_
        if not gmrf_:
            if not os.path.exists(gmrf_fname):
                tic("creating empty gmrf", experiment_name)
                gmrf_ = emptyValues(tt_graph())
            else:
                tic("reading gmrf from %s" % gmrf_fname, experiment_name)
                gmrf_ = pickle.load(open(gmrf_fname, 'r'))
        return gmrf_

    # The experiments data:
    data_source = experiment_design['data_source']
    dates = data_source['dates']
    basic_geometry = experiment_design['basic_geometry']

    # All this is lazy. Calling these functions does not create data.
    def tspots_seqs():
        return (ttob_seq for date in dates
                for ttob_seq in getDayTSpots(data_source['feed'], basic_geometry['nid'], date,
                                             basic_geometry['net_type'], basic_geometry['box'], net))

    def traj_obs(print_num=1000):
        """ Returns the trajectory observations.

        If the obs have never been computed before, also stores them in a file.
        Otherwise reads the cached copy from the disk.
        """
        fname = "%s/traj_obs.pkl" % experiment_directory(experiment_name)
        fname_test = "%s/traj_obs_test.pkl" % experiment_directory(experiment_name)
        if not os.path.exists(fname):
            tic("traj_obs: Saving trajectory obs in %s" % fname, experiment_name)
            if num_jobs == 1:
                seq = (traj_ob for date in dates
                       for traj_ob in getDayTrajs(data_source['feed'], basic_geometry['nid'], date,
                                                  basic_geometry['net_type'], basic_geometry['box'],
                                                  experiment_design['trajectory_conversion'],
                                                  traj_conv(), net))
            else:
                from joblib import Parallel, delayed
                tic("Using concurrent job code with {0} jobs".format(num_jobs), "learn_procedure")
                ls = Parallel(n_jobs=num_jobs)(delayed(wrapper)(data_source['feed'], basic_geometry['nid'], date,
                                                                basic_geometry['net_type'], basic_geometry['box'],
                                                                experiment_design['trajectory_conversion'],
                                                                traj_conv(), net)
                                               for date in dates)
                seq = [traj_ob for l in ls for traj_ob in l]
            # seq = (traj_ob for tspots_seq in tspots_seqs()
            #        for traj_ob in traj_conv().mapTrajectory(tspots_seq))
            kfold_cross_validation = data_source['kfold_cross_validation']
            test_k = data_source['test_k']
            assert kfold_cross_validation == 0 or test_k < kfold_cross_validation
            f = open(fname, 'w')
            if kfold_cross_validation > 0:
                tic("traj_obs: Saving test trajectory obs in %s" % fname_test, experiment_name)
                f_test = open(fname_test, 'w')
            idx = 0
            for traj_ob in seq:
                idx += 1
                if print_num > 0 and idx % print_num == 0:
                    tic("traj_obs: Converted so far {0} observations".format(idx), experiment_name)
                if kfold_cross_validation > 0 and idx % kfold_cross_validation == test_k:
                    # Every k-th observation is held out to the test file and
                    # is not yielded to the training consumer.
                    s_dump_elt(traj_ob, f_test)
                else:
                    s_dump_elt(traj_ob, f)
                    yield traj_ob
        else:
            tic("traj_obs: opening trajectory obs in %s" % fname, experiment_name)
            f = open(fname, 'r')
            for traj_ob in s_load(f):
                yield traj_ob

    def var_seqs():
        return ([obs.varId for obs in traj_ob.observations] for traj_ob in traj_obs())

    # Starting learning here
    tic("HMM learning", experiment_name)
    tic("Loaded HMM = {0} nodes, {1} transitions".format(len(hmm_graph().allNodes()),
                                                         len(hmm_graph().allTransitions())), experiment_name)
    fillProbabilitiesObservations(hmm_graph(), var_seqs(), **experiment_design['hmm_learning']['parameters'])
    # Save to disk as well
    pickle.dump(hmm_graph(), open(hmm_graph_fname, 'w'))
    save_hmm(hmm_graph(), experiment_name)
    tic("TT graph building", experiment_name)
    tic("Loaded TT graph = {0} edges, {1} variables".format(tt_graph().n, tt_graph().m), experiment_name)
    gmrf_learning = experiment_design['gmrf_learning']
    fillTTGraph(tt_graph(), traj_obs(), traj_obs_copy=traj_obs(), **gmrf_learning['tt_graph_parameters'])
    pickle.dump(tt_graph(), open(tt_graph_fname, 'w'))
    tic("GMRF learning", experiment_name)
    gmrf_learning_params = gmrf_learning['parameters']
    gmrf_ = gmrf_learn(tt_graph(), gmrf_learning['process'], experiment_name, gmrf_learning_params)
    pickle.dump(gmrf_, open(gmrf_fname, 'w'))
    save_gmrf_values(gmrf(), experiment_name=experiment_name)
    tic("GMRF estimation", experiment_name)
    gmrf_estimation = experiment_design['gmrf_estimation']
    gmrf_estimation_parameters = gmrf_estimation['parameters']
    # Saves all the GMRF estimators in the different formats.
    # They will be reloaded when we do the estimation.
    gmrf_est(gmrf(), gmrf_estimation['process'], experiment_name, gmrf_estimation_parameters)
    tic("End of learning", experiment_name)
def hmm_graph():
    global hmm_graph_
    if hmm_graph_ is None:
        if not os.path.exists(hmm_graph_fname):
            if graph_type == 'simple':
                tic("creating empty hmm graph", experiment_name)
                hmm_graph_ = model.createHMMGraphFromNetwork(net, mode_counts=mode_counts())
                tic("done creating empty hmm graph", experiment_name)
                tic("saving hmm graph pickle", experiment_name)
                pickle.dump(hmm_graph_, open(hmm_graph_fname, 'w'))
                tic("done saving hmm graph pickle", experiment_name)
            else:
                # Complex model not implemented
                assert False
        else:
            tic("Reading completed hmm graph from %s" % hmm_graph_fname)
            hmm_graph_ = pickle.load(open(hmm_graph_fname, 'r'))
            tic("done reading completed hmm graph from %s" % hmm_graph_fname)
    return hmm_graph_
def read_hmm_pickle(experiment_name):
    hmm_graph_fname = "%s/hmm_graph.pkl" % experiment_directory(experiment_name)
    tic("Reading hmm from %s" % hmm_graph_fname, experiment_name)
    return pickle.load(open(hmm_graph_fname, 'r'))
def fillTTGraph(tt_graph, traj_obs, min_variance=1e-2, variance_prior=0.0,
                variance_prior_count=0.0, traj_obs_copy=None):
    """ Computes the sufficient statistics of the travel time graph, and fills
    the corresponding TT graph.

    Arguments:
    tt_graph -- a travel time graph
    traj_obs -- an iterable of trajectory observations

    TODO: return some stats about the number of elements seen
    """
    # This function is one of the most complicated.
    # Compute the sufficient stats.
    # First moments for the means; the prior is on white noise.
    sstats0 = dict([(var_id, 0.0) for var_id in tt_graph.variable_keys])
    sstats0_counts = dict([(var_id, variance_prior_count) for var_id in tt_graph.variable_keys])
    sstats0_true_counts = dict([(var_id, 0) for var_id in tt_graph.variable_keys])
    sstats1_var = dict([(var_id, variance_prior_count * variance_prior) for var_id in tt_graph.variable_keys])
    # Compute the sufficient statistics first for central elements
    tic_n_obs = 1000
    count_traj_obs = 0
    for traj_ob in traj_obs:
        count_traj_obs += 1
        if count_traj_obs % tic_n_obs == 0:
            tic("fillTTGraph: processed %d observations in pass #1" % count_traj_obs)
        obs = traj_ob.observations
        for ob in obs:
            sstats0[ob.varId] += ob.value
            sstats0_counts[ob.varId] += 1.0
            sstats0_true_counts[ob.varId] += 1
            sstats1_var[ob.varId] += ob.value * ob.value
        del obs, ob
    tic("fillTTGraph: processed %d observations in pass #1" % count_traj_obs)
    # Compute the means
    means = {}
    for var in tt_graph.allVariables():
        var_id = var.varID
        count0 = sstats0_counts[var_id]
        mean = sstats0[var_id] / float(count0) if count0 > 0 else 0.0
        assert mean >= -EPSI
        # Specific to the traffic problem: travel times are non-negative, so
        # clamp the mean before storing it.
        mean = max(mean, 0)
        means[var_id] = mean
        del mean, count0, var, var_id
    # Compute the variances
    variances = {}
    for var in tt_graph.allVariables():
        var_id = var.varID
        count0 = sstats0_counts[var_id]
        sstat1 = sstats1_var[var_id] / float(count0) if count0 > 0 else 0.0
        mean = means[var_id]
        variance_ = sstat1 - mean * mean
        assert variance_ >= -EPSI, (sstat1 - mean * mean, sstat1, mean)
        variance = max(variance_, min_variance)
        variances[var_id] = variance
        del var, count0, sstat1, variance_, variance, var_id
    # Update the gaussian parameters
    for var in tt_graph.allVariables():
        var_id = var.varID
        mean = means[var_id]
        variance = variances[var_id]
        var.parameters = GaussianParameters(mean, variance)
        tt_graph.variable_counts[var_id] = sstats0_true_counts[var_id]
    # Second moments for the off-diagonal terms
    all_edge_keys = tt_graph.edges.keys()
    # Covariance term
    # pvid = pair of var_ids
    sstats1 = dict([(pvid, 0.0) for pvid in all_edge_keys])
    # First order stat for the start
    sstats1_from = dict([(pvid, variance_prior * variance_prior_count) for pvid in all_edge_keys])
    sstats1_to = dict([(pvid, variance_prior * variance_prior_count) for pvid in all_edge_keys])
    sstats0_from = dict([(pvid, 0.0) for pvid in all_edge_keys])
    sstats0_to = dict([(pvid, 0.0) for pvid in all_edge_keys])
    sstats1_counts = dict([(pvid, variance_prior_count) for pvid in all_edge_keys])
    sstats1_true_counts = dict([(pvid, 0) for pvid in all_edge_keys])
    # Fill the sufficient stats for the variance and the mean.
    # Hack to make sure we can run twice over the data:
    if traj_obs_copy is None:
        traj_obs_copy = traj_obs
    count_traj_obs = 0
    for traj_ob in traj_obs_copy:
        count_traj_obs += 1
        if count_traj_obs % tic_n_obs == 0:
            tic("fillTTGraph: processed %d observations in pass #2" % count_traj_obs)
        obs = traj_ob.observations
        l = len(obs)
        for i in range(l):
            for j in range(i + 1, l):
                from_ob = obs[i]
                to_ob = obs[j]
                from_vid = from_ob.varId
                to_vid = to_ob.varId
                assert not ((from_vid, to_vid) in sstats1 and (to_vid, from_vid) in sstats1)
                if (from_vid, to_vid) not in sstats1 and (to_vid, from_vid) not in sstats1:
                    continue
                # We may need to flip the vids and values around because of
                # the ordering of the variables.
                if (from_vid, to_vid) in sstats1:
                    from_vid_c = from_vid
                    to_vid_c = to_vid
                    from_val_c = from_ob.value
                    to_val_c = to_ob.value
                else:
                    assert (to_vid, from_vid) in sstats1
                    from_vid_c = to_vid
                    to_vid_c = from_vid
                    from_val_c = to_ob.value
                    to_val_c = from_ob.value
                # Update the stats
                key = (from_vid_c, to_vid_c)
                sstats1[key] += from_val_c * to_val_c
                sstats1_from[key] += from_val_c * from_val_c
                sstats1_to[key] += to_val_c * to_val_c
                sstats0_from[key] += from_val_c
                sstats0_to[key] += to_val_c
                sstats1_counts[key] += 1.0
                sstats1_true_counts[key] += 1
    tic("fillTTGraph: processed %d observations in pass #2" % count_traj_obs)
    # Compute the new covariance terms
    for key in all_edge_keys:
        count = sstats1_counts[key]
        # The local means
        local_mean_from = sstats0_from[key] / float(count) if count > 0 else 0.0
        local_mean_to = sstats0_to[key] / float(count) if count > 0 else 0.0
        # The local variances
        sstat1_from = sstats1_from[key] / float(count) if count > 0 else 0.0
        sstat1_to = sstats1_to[key] / float(count) if count > 0 else 0.0
        local_var_from_ = sstat1_from - (local_mean_from ** 2)
        local_var_to_ = sstat1_to - (local_mean_to ** 2)
        # This epsilon should prevent the assertion below from failing due to
        # rounding errors and a low number of samples.
        local_var_from = max(local_var_from_, min_variance) + 1e-7
        local_var_to = max(local_var_to_, min_variance) + 1e-7
        # The local covariance term
        sstat1 = sstats1[key] / float(count) if count > 0 else 0.0
        local_cov = sstat1 - local_mean_from * local_mean_to
        assert abs(local_cov) <= np.sqrt(local_var_from * local_var_to), \
            (local_cov, local_var_from, local_var_to)
        # The global variance terms: rescale the local covariance so it is
        # consistent with the global variances.
        (from_vid, to_vid) = key
        variance_from = variances[from_vid]
        variance_to = variances[to_vid]
        scale = np.sqrt((variance_from * variance_to) / (local_var_from * local_var_to))
        cov = scale * local_cov
        assert np.abs(cov) <= np.sqrt(variance_from * variance_to) + EPSI, \
            (np.abs(cov), np.sqrt(variance_from * variance_to))
        tt_graph.edges[key] = cov
        tt_graph.edge_counts[key] = sstats1_true_counts[key]
        del from_vid, to_vid, count, sstat1, local_mean_from, local_mean_to, sstat1_from, sstat1_to
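# The rescaling at the end of fillTTGraph keeps the locally observed
# correlation while matching the globally estimated variances, which is why
# the final Cauchy-Schwarz assertion holds. A small numeric check of that
# identity, with made-up values for illustration:
import numpy as np

local_var_from, local_var_to, local_cov = 2.0, 3.0, 1.5
variance_from, variance_to = 4.0, 1.0
scale = np.sqrt((variance_from * variance_to) / (local_var_from * local_var_to))
cov = scale * local_cov
# The correlation coefficient is unchanged by the rescaling...
local_corr = local_cov / np.sqrt(local_var_from * local_var_to)
corr = cov / np.sqrt(variance_from * variance_to)
assert np.isclose(local_corr, corr)
# ...so the rescaled covariance still satisfies the Cauchy-Schwarz bound.
assert abs(cov) <= np.sqrt(variance_from * variance_to)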
def learnMixtureAuto(tts, max_nb_mixtures):
    # Fit mixtures with 1..max_nb_mixtures components (capped by the sample
    # count) and keep the one with the best (lowest) BIC score.
    mixs = [learnMixture(tts, n) for n in range(1, min(len(tts) + 1, max_nb_mixtures + 1))]
    bics = [mix.bic(tts) for mix in mixs]
    best_idx = np.argmin(bics)
    tic("{0}:{1}".format(len(tts), best_idx + 1), "learnMixtureAuto")
    return mixs[best_idx]
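# learnMixture is not shown in this source. Below is a hypothetical stand-in,
# assuming a modern scikit-learn GaussianMixture; the thin wrapper only exists
# so that .bic accepts the same 1-D travel-time array used above (sklearn
# expects 2-D input). This is a sketch of a compatible implementation, not the
# project's actual code.
import numpy as np
from sklearn.mixture import GaussianMixture

class _Mixture(object):
    def __init__(self, gmm):
        self.gmm = gmm

    def bic(self, tts):
        # Reshape the 1-D travel times into the (n_samples, 1) shape sklearn expects.
        return self.gmm.bic(np.asarray(tts).reshape(-1, 1))

def learnMixture(tts, n):
    # Fit an n-component Gaussian mixture to a 1-D array of travel times.
    gmm = GaussianMixture(n_components=n).fit(np.asarray(tts).reshape(-1, 1))
    return _Mixture(gmm)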