def test_traj_obs(experiment_name,print_counter=1000):
  fname_test = "%s/traj_obs_test.pkl"%experiment_directory(experiment_name)
  tic("test_traj_obs: opening test trajectory obs in %s"%fname_test, experiment_name)
  f = open(fname_test, 'r')
  c = 0
  for traj_ob in s_load(f):
    c += 1
    if print_counter > 0 and c % print_counter == 0:
      tic("test_traj_obs: Consumed so far {0} observations".format(c), experiment_name)
    yield traj_ob
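# test_traj_obs above (and traj_obs in learn_procedure below) rely on the
# streaming pickle helpers s_dump_elt and s_load. A minimal sketch of that
# interface, assuming one pickle frame per element (the original helpers are
# not shown here):
import pickle

def s_dump_elt_sketch(obj, f):
  # Append a single element to an open file, one pickle frame per element.
  pickle.dump(obj, f)

def s_load_sketch(f):
  # Lazily yield elements back until the stream is exhausted.
  while True:
    try:
      yield pickle.load(f)
    except EOFError:
      return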
def covsel_cvx_cholmod(
    R, U, rows, cols, k, psd_tolerance=1e-6, factor=None, num_iterations=500, finish_early=True, debug=True
):
    if debug:
        tic("smallest ev", "covsel_cvx_cholmod")
    min_ei = smallest_ev_arpack(R, U, rows, cols)
    if debug:
        tic("min_ei is %f" % min_ei, "covsel_cvx_cholmod")
    if min_ei < 0:
        R0 = R - min_ei + 1e-3
    else:
        R0 = R
    return run_cvx_cholmod(R0, U, rows, cols, k, psd_tolerance, factor, num_iterations, finish_early)
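# covsel_cvx_cholmod shifts the diagonal by the most negative eigenvalue to
# make the input PSD before solving. A hedged sketch of what the helper
# smallest_ev_arpack could compute with scipy (an assumption, not the
# original implementation):
import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg as spl

def smallest_ev_sketch(R, U, rows, cols):
    # Assemble the symmetric matrix with diagonal R and off-diagonal data U.
    n = len(R)
    S = sp.coo_matrix((np.concatenate([U, U]),
                       (np.concatenate([rows, cols]),
                        np.concatenate([cols, rows]))), shape=(n, n))
    S = (S + sp.diags(R)).tocsc()
    # ARPACK: smallest algebraic ('SA') eigenvalue of a symmetric matrix.
    return spl.eigsh(S, k=1, which='SA', return_eigenvectors=False)[0]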
def validation_procedure(experiment_design,
                         experiment_design_indep,
                         experiment_design_one_mode,
                         experiment_design_one_mode_indep,
                         validate_on_network=True):
  # Get the validation data
  experiment_name = experiment_design['name']
  # We will load the test data from this experiment
  experiment_name_one_mode = experiment_design_one_mode['name']
  # Get the network
  basic_geometry = experiment_design['basic_geometry']
  # Nearly everything will need a network.
  net = get_network(**basic_geometry)

  # reload the HMM and the GMRF estimator from the files
  # All we need for testing is a experiment_design
  test_hmm = read_hmm_pickle(experiment_name)
  # Read the estimator
  gmrf_estimation = experiment_design['gmrf_estimation']
  test_gmrf_estimator = get_gmrf_estimator(experiment_name, gmrf_estimation['process'])

  # Baseline Gaussian independent
  test_hmm_one_mode = read_hmm_pickle('{0}_one_mode'.format(experiment_name))
  test_gmrf_one_mode_indep = get_gmrf_estimator('{0}_one_mode_indep'.format(experiment_name), 'diagonal')

  # Baseline Gaussian
  test_gmrf_one_mode = get_gmrf_estimator('{0}_one_mode'.format(experiment_name), gmrf_estimation['process'])

  # Baseline MultiMode Gaussian independent
  test_gmrf_indep = get_gmrf_estimator('{0}_indep'.format(experiment_name), 'diagonal')

  tic('Validation')
  test_traj_obs_all = list(test_traj_obs(experiment_name))
  test_traj_obs_one_mode_all = list(test_traj_obs(experiment_name_one_mode))
  tic("Validation set: {0} trajectories".format(len(test_traj_obs_all)))
  model = [(test_traj_obs_all, test_gmrf_estimator, test_hmm, 'MM-GMRF')]
  baseline1 = [(test_traj_obs_one_mode_all, test_gmrf_one_mode_indep, test_hmm_one_mode, 'one mode indep')]
  baseline2 = [(test_traj_obs_one_mode_all, test_gmrf_one_mode, test_hmm_one_mode, 'one mode')]
  baseline3 = [(test_traj_obs_all, test_gmrf_indep, test_hmm, 'multi-modal indep')]
  val_model = model + baseline1 + baseline2 + baseline3
  tic('path validation')
  validate.validate_on_paths(val_model, net, 
                             estimation_sampling_process=experiment_design['estimation_sampling']['process'],
                             estimation_sampling_parameters=experiment_design['estimation_sampling']['parameters'], **experiment_design['evaluation'])
  if validate_on_network:
    tic('network validation')
    validate.validate_on_network(val_model, net, 
                               estimation_sampling_process=experiment_design['estimation_sampling']['process'],
                               estimation_sampling_parameters=experiment_design['estimation_sampling']['parameters'], **experiment_design['evaluation'])
  tic("Evaluation finished")
def getDayTSpots(date, network):
  """ Returns a list of sequences of TSpot objects for this day.
  """
  all_traj_fns = list_traj_filenames(date)
  tspots_groups = []
  for fname in all_traj_fns:
    # pylint:disable=W0142
    try:
      tspots = read_trajectory(fname)
    except IOError:
      tic("Could not read trajectory: {0}".format(fname), "getDayTSpots")
      tspots = []
    # tspots is a list of TSpot
    # Make sure we only have data for our sub network.
    for net_tspots in filterOutsideNetwork(tspots, network):
      tspots_groups.append(net_tspots)
  return tspots_groups
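# getDayTSpots keeps only the pieces of each trajectory that lie inside the
# sub-network. A hedged sketch of filterOutsideNetwork, under the assumption
# that it splits a trajectory into maximal in-network runs and that the
# network supports membership tests on link ids:
def filter_outside_network_sketch(tspots, network):
  run = []
  for tsp in tspots:
    if tsp.spot.linkId in network:
      run.append(tsp)
    elif run:
      yield run
      run = []
  if run:
    yield run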
def fillTrajectoryCache(graph_type,basic_geometry,data_source,traj_conv_description,n_jobs=1):
  net = get_network(**basic_geometry)
  tic("Loaded network = {0} links".format(len(net)), "fillTrajectoryCache")
  traj_conv = createTrajectoryConversion(graph_type=graph_type,
                                                process=traj_conv_description['process'],
                                                params=traj_conv_description['params'],
                                                network=net,
                                                max_nb_mixture=traj_conv_description['params']['max_n_modes'],
                                                n_jobs=n_jobs)
  dates = data_source['dates']
  from joblib import Parallel, delayed
  Parallel(n_jobs=n_jobs)(delayed(wrapper)(data_source['feed'],
                basic_geometry['nid'],
                date,
                basic_geometry['net_type'],
                basic_geometry['box'],
                traj_conv_description,
                traj_conv, net) for date in dates)
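# fillTrajectoryCache fans the per-date conversion out with joblib. A small,
# self-contained sketch of the same Parallel/delayed pattern (toy function,
# not project code); the dispatched callable must be picklable, which is why
# wrapper below is defined at module level:
def _square(x):
  return x * x

def parallel_pattern_sketch(values, n_jobs=2):
  from joblib import Parallel, delayed
  # delayed() captures the call lazily; Parallel executes it across workers.
  return Parallel(n_jobs=n_jobs)(delayed(_square)(v) for v in values)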
def gmrf_learn_cov_cholmod(
    R,
    U,
    rows,
    cols,
    edge_count,
    k,
    min_variance=1e-2,
    min_edge_count=10,
    num_iterations=50,
    psd_tolerance=1e-3,
    finish_early=True,
):
    n = len(R)
    m = len(U)
    mask = edge_count >= min_edge_count
    active_m = np.sum(mask)
    tic("m={0}, active m={1}".format(m, active_m), "gmrf_learn_cov_cholmod")
    active_U = U[mask]
    active_rows = rows[mask]
    active_cols = cols[mask]
    # A number of variables are independent (due to a lack of observations)
    independent_mask = independent_variables(n, active_rows, active_cols)
    # Put them aside and use the independent strategy to solve them.
    indep_idxs = np.arange(n)[independent_mask]
    R_indep = R[indep_idxs]
    # Solve the regularized version for independent variables
    D_indep = 1.0 / np.maximum(min_variance * np.ones_like(R_indep), R_indep)
    # Putting together the dependent and independent parts
    D = np.zeros_like(R)
    D[independent_mask] = D_indep
    P = np.zeros_like(U)
    # No need to solve for the outer diagonal terms, they are all zeros.
    # Solve for the dependent terms
    dependent_mask = ~independent_mask
    n_dep = np.sum(dependent_mask)
    if n_dep > 0:
        idxs_dep = np.arange(n)[dependent_mask]
        reverse_idxs_dep = np.zeros(n, dtype=np.int64)
        reverse_idxs_dep[dependent_mask] = np.arange(n_dep)
        rows_dep = reverse_idxs_dep[active_rows]
        cols_dep = reverse_idxs_dep[active_cols]
        R_dep = R[idxs_dep]
        U_dep = active_U
        (M, R_hat, U_hat) = normalized_problem(R_dep, U_dep, rows_dep, cols_dep)
        tic("Computing symbolic cholesky factorization of the graph...", "gmrf_learn_cov_cholmod")
        # Delayed import so that the rest of the code runs without scikits.sparse installed
        from scikits.sparse.cholmod import analyze

        Xs_dep = build_sparse(np.ones_like(R_hat), np.ones_like(U_hat), rows_dep, cols_dep)
        factor = analyze(Xs_dep)
        tic("Cholesky done", "gmrf_learn_cov_cholmod")
        # TODO add the other parameters
        (D_norm_dep, P_norm_dep) = covsel_cvx_cholmod(
            R_hat, U_hat, rows_dep, cols_dep, k, psd_tolerance, factor, num_iterations, finish_early
        )
        D[dependent_mask] = D_norm_dep / (M ** 2)
        P[mask] = P_norm_dep / (M[rows_dep] * M[cols_dep])
    return (D, P)
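# gmrf_learn_cov_cholmod splits the problem in two: variables touched by no
# retained edge get a regularized diagonal fit, the rest go to the solver.
# A hedged sketch of what independent_variables could compute (assumption):
import numpy as np

def independent_variables_sketch(n, rows, cols):
    # True for variables that appear in no (row, col) edge.
    touched = np.zeros(n, dtype=bool)
    touched[rows] = True
    touched[cols] = True
    return ~touched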
def getDayTTOBservations(date, network, print_stats=1000):
  """
  """
  all_trajs = list_traj_filenames(date)
  all_groups = []
  idx = 0
  for traj_index in all_trajs:
    idx += 1
    if print_stats > 0 and idx % print_stats == 0:
      tic("processed {0} observations".format(idx),"getDayTTOBservations")
    # pylint:disable=W0142
    try:
      tspots = read_trajectory(traj_index)
    except IOError:
      tic("ioerror when loading a trajectory {0}".format(traj_index), "getDayTTOBservations")
      tspots = []
    # Make sure we only have data for our sub network.
    for net_tspots in filterOutsideNetwork(tspots, network):
      groups = seqGroupBy(net_tspots, keyf=lambda tsp:tsp.spot.linkId)
      for g in completeGroups(groups, network):
        all_groups.append(g)
  return all_groups
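# getDayTTOBservations splits each trajectory into runs of consecutive points
# on the same link via seqGroupBy. A minimal sketch of that grouping, assuming
# itertools.groupby semantics (consecutive equal keys form one group):
from itertools import groupby as _groupby

def seq_group_by_sketch(seq, keyf):
  # Consecutive elements with equal keyf(x) end up in the same group.
  return [list(g) for (_, g) in _groupby(seq, keyf)]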
def getMixtures(dates, network,
                max_n_links=None, return_tts=False, max_nb_mixture=4,num_threads=1):
  tic("Running with {0} jobs.".format(num_threads),"getMixtures")
  ttob_seqs = (ttob_seq for date in dates
               for ttob_seq in getDayTTOBservations(date, network))
  # Get travel times for each link
  all_ttobs = (ttob for ttob_seq in ttob_seqs for ttob in ttob_seq)
  tic("starting groupby...","getMixtures")
  # Note: groupby merges consecutive runs only, so the observations are
  # assumed to arrive already grouped by link id.
  all_ttobs_by_lid = sorted([(lid, list(vals)) for (lid, vals) in groupby(all_ttobs, lambda ttob: ttob.linkId)], key=lambda z: -len(z[1]))
  tic("groupby done, {0} links".format(len(all_ttobs_by_lid)),"getMixtures")
  if max_n_links:
    all_ttobs_by_lid = all_ttobs_by_lid[:max_n_links]
  tts_by_link = [(lid, np.array([tto.tt for tto in vals])) for (lid, vals) in all_ttobs_by_lid]
  tic("vectorization done","getMixtures")
  if num_threads != 1:
    tic("Running with {0} jobs.".format(num_threads),"getMixtures")
    learned_mixtures = Parallel(n_jobs=num_threads,verbose=10)(delayed(getMixtures_inner)((lid, tts, max_nb_mixture)) for (lid, tts) in tts_by_link)
  else:
    learned_mixtures = [(lid, learnMixtureAuto(tts, max_nb_mixture)) for (lid, tts) in tts_by_link]
  if return_tts:
    return (dict(learned_mixtures), dict(tts_by_link))
  else:
    return dict(learned_mixtures)
def gmrf_learn_cov_cvx(R, U, rows, cols, edge_count, min_variance=1e-2, min_edge_count=10, num_iterations=50):
    n = len(R)
    m = len(U)
    mask = edge_count >= min_edge_count
    active_m = np.sum(mask)
    tic("m={0}, active m={1}".format(m, active_m), "gmrf_learn_cov_cvx")
    active_U = U[mask]
    active_rows = rows[mask]
    active_cols = cols[mask]
    # A number of variables are independent (due to a lack of observations)
    independent_mask = independent_variables(n, active_rows, active_cols)
    # Put them aside and use the independent strategy to solve them.
    indep_idxs = np.arange(n)[independent_mask]
    R_indep = R[indep_idxs]
    # Solve the regularized version for independent variables
    D_indep = 1.0 / np.maximum(min_variance * np.ones_like(R_indep), R_indep)
    # Putting together the dependent and independent parts
    D = np.zeros_like(R)
    D[independent_mask] = D_indep
    P = np.zeros_like(U)
    # No need to solve for the outer diagonal terms, they are all zeros.
    # Solve for the dependent terms
    dependent_mask = ~independent_mask
    n_dep = np.sum(dependent_mask)
    if n_dep > 0:
        idxs_dep = np.arange(n)[dependent_mask]
        reverse_idxs_dep = np.zeros(n, dtype=np.int64)
        reverse_idxs_dep[dependent_mask] = np.arange(n_dep)
        rows_dep = reverse_idxs_dep[active_rows]
        cols_dep = reverse_idxs_dep[active_cols]
        R_dep = R[idxs_dep]
        U_dep = active_U
        (M, R_hat, U_hat) = normalized_problem(R_dep, U_dep, rows_dep, cols_dep)
        (D_norm_dep, P_norm_dep) = covsel_cvx_dense(R_hat, U_hat, rows_dep, cols_dep, num_iterations=num_iterations)
        D[dependent_mask] = D_norm_dep / (M ** 2)
        P[mask] = P_norm_dep / (M[rows_dep] * M[cols_dep])
    return (D, P)
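# Both learners solve a normalized (unit-diagonal) problem and then unscale
# the result by M**2 and M[rows]*M[cols]. A hedged sketch of what
# normalized_problem could look like under that convention (assumption):
import numpy as np

def normalized_problem_sketch(R, U, rows, cols):
    M = np.sqrt(R)                   # per-variable standard deviation
    R_hat = np.ones_like(R)          # unit diagonal after rescaling
    U_hat = U / (M[rows] * M[cols])  # correlation-like off-diagonal terms
    return (M, R_hat, U_hat)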
def run_cvx_cholmod(
    R_hat, U_hat, rows, cols, k, psd_tolerance=1e-6, factor=None, num_iterations=500, finish_early=True, debug=True
):
    D = np.ones_like(R_hat)
    P = np.zeros_like(U_hat)
    for iters in range(num_iterations):
        tic("Iter={0}".format(iters), "run_cvx_cholmod")
        # Debug
        #    f1 = obj_dense(R_hat, U_hat, rows, cols, D, P)
        #    f2 = obj_cholmod(R_hat, U_hat, rows, cols, D, P, psd_tolerance, factor)
        #    delta = f1 - f2
        #    print "True objective value:",f1
        #    print "This objective value",f2
        #    print "Difference",delta
        # End debug
        z = iter_cholmod(R_hat, U_hat, rows, cols, D, P, k, psd_tolerance, factor)
        if finish_early and z is None:
            tic("done early", "run_cvx_cholmod")
            return (D, P)
        if z is not None:
            (D2, P2, fn, lsiter) = z
            D = D2
            P = P2
    return (D, P)
def iter_cholmod(R_hat, U_hat, rows, cols, D, P, k, psd_tolerance=1e-6, factor=None, num_lsearch_iter=10):
    tic("computing gradient", "iter_cholmod")
    (g_D, g_P) = grad_cholmod(R_hat, U_hat, rows, cols, D, P, k, factor)
    v_D = -g_D
    v_P = -g_P
    # Debug
    #  (g_D_, g_P_) = grad_dense(R_hat, U_hat, rows, cols, D, P)
    #  print 'g_D diff:',la.norm(g_D-g_D_)
    #  print 'g_P diff:',la.norm(g_P-g_P_)
    # End debug
    # Stopping criterion:
    sqntdecr = -v_D.dot(g_D) - v_P.dot(g_P)
    tic("Newton decrement squared:%- 7.5e" % sqntdecr, "iter_cholmod")
    if sqntdecr < 1e-8:
        return None
    # line search
    dD = v_D
    dP = v_P
    s = 1.0
    f = obj_cholmod(R_hat, U_hat, rows, cols, D, P, psd_tolerance, factor)
    tic("Current objective value: {0}".format(f), "iter_cholmod")
    for lsiter in range(num_lsearch_iter):
        curr_D = D + s * dD
        curr_P = P + s * dP
        fn = obj_cholmod(R_hat, U_hat, rows, cols, curr_D, curr_P, psd_tolerance, factor)
        #    print 'fn ',fn
        tic("lsiter={0} fn={1}".format(lsiter, fn), "iter_cholmod")
        if fn == -np.inf:
            s *= 0.5
        else:
            if fn < f - 0.01 * s * sqntdecr:
                tic("Update lsiter={0}".format(lsiter), "iter_cholmod")
                return (curr_D, curr_P, fn, lsiter)
            s *= 0.5
    print "Too many iterations"
    return None
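# iter_cholmod takes one gradient step with a backtracking (Armijo-style)
# line search on the objective. A generic sketch of that pattern, detached
# from the GMRF specifics:
def backtracking_sketch(f, x, direction, decr, s=1.0, alpha=0.01, max_iter=10):
    # f: objective to minimize; x: current iterate (a numpy array);
    # direction: descent direction; decr: squared decrement for the test.
    f0 = f(x)
    for _ in range(max_iter):
        xn = x + s * direction
        if f(xn) < f0 - alpha * s * decr:
            return xn
        s *= 0.5
    return None  # too many halvings, mirroring the code above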
def wrapper(*args,**kwargs):
  tic("Wrapper called")
  res = getDayTrajs(*args, **kwargs)
  tic("Cached {0} trajs".format(len(res)))
  return res
def learning_perf(experiment_design):
  """ Starting the main procedure.
  
  This script tries to reuse as much as it can from the disk, to avoid expensive recomputations.
  """
  experiment_name = experiment_design['name']
  # Get the network
  basic_geometry = experiment_design['basic_geometry']
  # Nearly everything will need a network.
  net = get_network(**basic_geometry)
  tic("Loaded network = {0} links".format(len(net)), experiment_name)
  graph_type = experiment_design['graph_type']
  traj_conv_param = experiment_design['trajectory_conversion']['params']
  
  # Trajectory conversion
  # Needed early because it gives the number of modes.
  global traj_conv_
  traj_conv_ = None
  def traj_conv():
    global traj_conv_
    if not traj_conv_:
      traj_conv_ = createTrajectoryConversion(graph_type=graph_type,
                                                process=experiment_design['trajectory_conversion']['process'],
                                                params=traj_conv_param,
                                                network=net,
                                                max_nb_mixture=traj_conv_param['max_n_modes'])
    return traj_conv_
  
  # Number of modes
  # Also stored on disk as pickle
  global mode_counts_
  mode_counts_ = None
  def mode_counts():
    global mode_counts_
    if not mode_counts_:
      tic("Loading trajectory conversion...")
      fname = "%s/mode_count.pkl"%experiment_directory(experiment_name)
      if not os.path.exists(fname):
        pickle.dump(traj_conv().modeCounts(), open(fname,'w'))
      mode_counts_ = pickle.load(open(fname,'r'))
      tic("Done loading trajectory conversion and mode counts")
    return mode_counts_
  
  # The HMM graph
  global hmm_graph_
  hmm_graph_ = None
  hmm_graph_fname = "%s/hmm_graph.pkl"%experiment_directory(experiment_name)
  def hmm_graph():
    global hmm_graph_
    if hmm_graph_ is None:
      if not os.path.exists(hmm_graph_fname):        
        if graph_type == 'simple':
          tic("creating empty hmm graph", experiment_name)
          hmm_graph_ = model.createHMMGraphFromNetwork(net, mode_counts=mode_counts())
          tic("done creating empty hmm graph", experiment_name)
          tic("saving hmm graph pickle", experiment_name)
          pickle.dump(hmm_graph_, open(hmm_graph_fname,'w'))
          tic("done saving hmm graph pickle", experiment_name)
        else:
          # Complex model not implemented
          assert False
      else:
        tic("Reading completed hmm graph from %s"%hmm_graph_fname)
        hmm_graph_ = pickle.load(open(hmm_graph_fname,'r'))
        tic("done reading completed hmm graph from %s"%hmm_graph_fname)
    return hmm_graph_    
  
  # The TT graph
  # Also stored on disk as pickle by save_ttg_values (when it is filled).
  global tt_graph_
  tt_graph_ = None
  tt_graph_fname = "%s/tt_graph.pkl"%experiment_directory(experiment_name)
  def tt_graph():
    global tt_graph_
    if not tt_graph_:
      if not os.path.exists(tt_graph_fname):
        tic("creating empty tt graph", experiment_name)
        tt_graph_ = createTravelTimeGraph(hmm_graph(), radius=2e-4)
        tt_graph_.checkInvariants()
        save_ttg_structure(tt_graph_, experiment_name=experiment_name)
      else:
        tic("reading tt graph from %s"%tt_graph_fname, experiment_name)
        tt_graph_ = pickle.load(open(tt_graph_fname,'r'))
    return tt_graph_
  
  # The GMRF
  # Also stored on disk as pickle by save_gmrf_values (when it is filled).
  global gmrf_
  gmrf_ = None
  gmrf_fname = "%s/gmrf.pkl"%experiment_directory(experiment_name)
  def gmrf():
    global gmrf_
    if not gmrf_:
      if not os.path.exists(gmrf_fname):
        tic("creating empty gmrf", experiment_name)
        gmrf_ = emptyValues(tt_graph())
        tic("created empty gmrf", experiment_name)
      else:
        tic("reading gmrf from %s"%gmrf_fname, experiment_name)
        gmrf_ = pickle.load(open(gmrf_fname,'r'))
        tic("done reading gmrf from %s"%gmrf_fname, experiment_name)
    return gmrf_
  
  
  tic("TT graph building", experiment_name)
  tic("Loaded TT graph = {0} edges, {1} variables".format(tt_graph().n,
                                                       tt_graph().m), experiment_name)
  tic("simulating sstat building", experiment_name)
  gmrf_learning = experiment_design['gmrf_learning']
  for var in tt_graph().allVariables():
    var_id = var.varID
    var.parameters = GaussianParameters(0.0, 1.0)
    tt_graph().variable_counts[var_id] = 1
  for key in tt_graph().edges.keys():
    tt_graph().edges[key] = -.1
    tt_graph().edge_counts[key] = 1
  tic("done simulating sstat building", experiment_name)
  
  if not os.path.exists(tt_graph_fname):
    tic("saving tt graph pickle", experiment_name)
    pickle.dump(tt_graph(),open(tt_graph_fname,'w'))
    tic("done saving tt graph pickle", experiment_name)
  
  tic("GMRF learning", experiment_name)
  gmrf_learning = experiment_design['gmrf_learning']
  gmrf_learning_params = gmrf_learning['parameters']
  gmrf_ = gmrf_learn(tt_graph(), gmrf_learning['process'],
                    experiment_name, gmrf_learning_params)
  tic("Done GMRF learning", experiment_name)
  
  tic("saving gmrf pickle", experiment_name)
  pickle.dump(gmrf_,open(gmrf_fname,'w'))
  save_gmrf_values(gmrf_, experiment_name=experiment_name)
  tic("done saving gmrf pickle", experiment_name)
  
  tic("GMRF estimation",experiment_name)
  gmrf_estimation = experiment_design['gmrf_estimation']
  gmrf_estimation_parameters = gmrf_estimation['parameters']
  # Saves all the GMRF estimators in the different formats
  # Will be reloaded when we do the estimation
  gmrf_est(gmrf(), gmrf_estimation['process'], experiment_name, gmrf_estimation_parameters)
  
  tic("End of learning", experiment_name)
def learn_procedure(experiment_design,num_jobs=1):
  experiment_name = experiment_design['name']
  # Get the network
  basic_geometry = experiment_design['basic_geometry']
  # Nearly everything will need a network.
  net = get_network(**basic_geometry)
  tic("Loaded network = {0} links".format(len(net)), experiment_name)
  graph_type = experiment_design['graph_type']
  traj_conv_param = experiment_design['trajectory_conversion']['params']
  
  # Trajectory conversion
  # Needed early because it gives the number of modes.
  global traj_conv_
  traj_conv_ = None
  def traj_conv():
    global traj_conv_
    if not traj_conv_:
      traj_conv_ = createTrajectoryConversion(graph_type=graph_type,
                                                process=experiment_design['trajectory_conversion']['process'],
                                                params=traj_conv_param,
                                                network=net,
                                                max_nb_mixture=traj_conv_param['max_n_modes'],
                                                n_jobs=num_jobs)
    return traj_conv_
  
  # Number of modes
  # Also stored on disk as pickle
  global mode_counts_
  mode_counts_ = None
  def mode_counts():
    global mode_counts_
    if not mode_counts_:
      tic("Loading trajectory conversion...")
      fname = "%s/mode_count.pkl"%experiment_directory(experiment_name)
      if not os.path.exists(fname):
        pickle.dump(traj_conv().modeCounts(), open(fname,'w'))
      mode_counts_ = pickle.load(open(fname,'r'))
      tic("Done loading trajectory conversion and mode counts")
    return mode_counts_
  
  # The HMM graph
  global hmm_graph_
  hmm_graph_ = None
  hmm_graph_fname = "%s/hmm_graph.pkl"%experiment_directory(experiment_name)
  def hmm_graph():
    global hmm_graph_
    if hmm_graph_ is None:
      if not os.path.exists(hmm_graph_fname):        
        if graph_type == 'simple':
          hmm_graph_ = model.createHMMGraphFromNetwork(net, mode_counts=mode_counts())
        else:
          # Complex model not implemented
          assert False
      else:
        tic("Reading completed hmm graph from %s"%hmm_graph_fname)
        hmm_graph_ = pickle.load(open(hmm_graph_fname,'r'))
    return hmm_graph_    
  
  # The TT graph
  # Also stored on disk as pickle by save_ttg_values (when it is filled).
  global tt_graph_
  tt_graph_ = None
  tt_graph_fname = "%s/tt_graph.pkl"%experiment_directory(experiment_name)
  def tt_graph():
    global tt_graph_
    if not tt_graph_:
      if not os.path.exists(tt_graph_fname):
        tic("creating empty tt graph", experiment_name)
        tt_graph_ = createTravelTimeGraph(hmm_graph(), radius=2e-4)
        tt_graph_.checkInvariants()
        save_ttg_structure(tt_graph_, experiment_name=experiment_name)
      else:
        tic("reading tt graph from %s"%tt_graph_fname, experiment_name)
        tt_graph_ = pickle.load(open(tt_graph_fname,'r'))
    return tt_graph_
  
  # The GMRF
  # Also stored on disk as pickle by save_gmrf_values (when it is filled).
  global gmrf_
  gmrf_ = None
  gmrf_fname = "%s/gmrf.pkl"%experiment_directory(experiment_name)
  def gmrf():
    global gmrf_
    if not gmrf_:
      if not os.path.exists(gmrf_fname):
        tic("creating empty gmrf", experiment_name)
        gmrf_ = emptyValues(tt_graph())
      else:
        tic("reading gmrf from %s"%gmrf_fname, experiment_name)
        gmrf_ = pickle.load(open(gmrf_fname,'r'))
    return gmrf_

  # The experiments data:
  data_source = experiment_design['data_source']
  dates = data_source['dates']
  basic_geometry = experiment_design['basic_geometry']
  
  # All this is lazy. Calling these functions does not create data.
  def tspots_seqs():
    return (ttob_seq for date in dates
               for ttob_seq in getDayTSpots(data_source['feed'],
                                                    basic_geometry['nid'],
                                                    date,
                                                    basic_geometry['net_type'],
                                                    basic_geometry['box'],
                                                    net))
  
  def traj_obs(print_num=1000):
    """ Returns the trajectory observations.
    
    If the obs have never been computed before, also stores them in a file.
    Otherwise reads the cached copy from the disk.
    """
    fname = "%s/traj_obs.pkl"%experiment_directory(experiment_name)
    fname_test = "%s/traj_obs_test.pkl"%experiment_directory(experiment_name)
    if not os.path.exists(fname):
      tic("traj_obs: Saving trajectory obs in %s"%fname, experiment_name)
      if num_jobs == 1:
        seq = (traj_ob for date in dates
                       for traj_ob in getDayTrajs(data_source['feed'],
                                                      basic_geometry['nid'],
                                                      date,
                                                      basic_geometry['net_type'],
                                                      basic_geometry['box'],
                                                      experiment_design['trajectory_conversion'],
                                                      traj_conv(), net))
      else:
        from joblib import Parallel, delayed
        tic("Using concurrent job code with {0} jobs".format(num_jobs),"learn_procedure")
        ls = Parallel(n_jobs=num_jobs)(delayed(wrapper)(data_source['feed'],
                      basic_geometry['nid'],
                      date,
                      basic_geometry['net_type'],
                      basic_geometry['box'],
                      experiment_design['trajectory_conversion'],
                      traj_conv(), net) for date in dates)
        seq = [traj_ob for l in ls
                       for traj_ob in l]

#      seq = (traj_ob for tspots_seq in tspots_seqs()
#                      for traj_ob in traj_conv().mapTrajectory(tspots_seq))
      kfold_cross_validation = data_source['kfold_cross_validation']
      test_k = data_source['test_k']
      assert kfold_cross_validation == 0 or test_k < kfold_cross_validation
      f = open(fname, 'w')
      if kfold_cross_validation > 0:
        tic("traj_obs: Saving test trajectory obs in %s"%fname_test, experiment_name)
        f_test = open(fname_test, 'w')
      idx = 0
      for traj_ob in seq:
        idx += 1
        if print_num > 0 and idx % print_num == 0:
          tic("traj_obs: Converted so far {0} observations".format(idx), experiment_name)
        if kfold_cross_validation > 0 and idx % kfold_cross_validation == test_k:
          s_dump_elt(traj_ob, f_test)
        else:
          s_dump_elt(traj_ob, f)
        yield traj_ob
    else:
      tic("traj_obs: opening trajectory obs in %s"%fname, experiment_name)
      f = open(fname, 'r')
      for traj_ob in s_load(f):
        yield traj_ob

  def var_seqs():
    return ([obs.varId for obs in traj_ob.observations] for traj_ob in traj_obs())

  # Starting learning here
  
  tic("HMM learning",experiment_name)
  tic("Loaded HMM = {0} nodes, {1} transitions".format(len(hmm_graph().allNodes()),
                                                       len(hmm_graph().allTransitions())), experiment_name)
  fillProbabilitiesObservations(hmm_graph(), var_seqs(), **experiment_design['hmm_learning']['parameters'])
  # Save to disk as well
  pickle.dump(hmm_graph(),open(hmm_graph_fname,'w'))
  save_hmm(hmm_graph(),experiment_name)
  
  tic("TT graph building", experiment_name)
  tic("Loaded TT graph = {0} edges, {1} variables".format(tt_graph().n,
                                                       tt_graph().m), experiment_name)
  gmrf_learning = experiment_design['gmrf_learning']
  fillTTGraph(tt_graph(), traj_obs(),traj_obs_copy=traj_obs(),**gmrf_learning['tt_graph_parameters'])
  pickle.dump(tt_graph(),open(tt_graph_fname,'w'))
  
  tic("GMRF learning", experiment_name)
  gmrf_learning = experiment_design['gmrf_learning']
  gmrf_learning_params = gmrf_learning['parameters']
  gmrf_ = gmrf_learn(tt_graph(), gmrf_learning['process'],
                    experiment_name, gmrf_learning_params)
  pickle.dump(gmrf_,open(gmrf_fname,'w'))
  save_gmrf_values(gmrf(), experiment_name=experiment_name)

  tic("GMRF estimation",experiment_name)
  gmrf_estimation = experiment_design['gmrf_estimation']
  gmrf_estimation_parameters = gmrf_estimation['parameters']
  # Saves all the GMRF estimators in the different formats
  # Will be reloaded when we do the estimation
  gmrf_est(gmrf(), gmrf_estimation['process'], experiment_name, gmrf_estimation_parameters)
  
  tic("End of learning", experiment_name)
def read_hmm_pickle(experiment_name):
  hmm_graph_fname = "%s/hmm_graph.pkl"%experiment_directory(experiment_name)
  tic("Reading hmm from %s"%hmm_graph_fname, experiment_name)
  return pickle.load(open(hmm_graph_fname,'r'))
def fillTTGraph(tt_graph, traj_obs, min_variance=1e-2,
                variance_prior=0.0, variance_prior_count=0.0, traj_obs_copy=None):
  """ Computes the sufficient statistics of the travel time graph, and fills the
  corresponding TT graph.
  
  Arguments:
  tt_graph -- a travel time graph
  traj_obs -- an iterable of trajectory observations
  
  TODO: returns some stats about the number of elements seen
  """
  # This function is one of the most complicated
  # Compute the sufficient stats
  # First moments for means
  # The prior is on white noise
  sstats0 = dict([(var_id, 0.0) for var_id in tt_graph.variable_keys])
  sstats0_counts = dict([(var_id, variance_prior_count) for var_id in tt_graph.variable_keys])
  sstats0_true_counts = dict([(var_id, 0) for var_id in tt_graph.variable_keys])
  sstats1_var = dict([(var_id, variance_prior_count*variance_prior) for var_id in tt_graph.variable_keys])
  # Compute the sufficient statistics first for central elements
  tic_n_obs = 1000
  count_traj_obs = 0
  for traj_ob in traj_obs:
    count_traj_obs += 1
    if count_traj_obs % tic_n_obs == 0:
      tic("fillTTGraph: processed %d observations in pass #1"%count_traj_obs)
    obs = traj_ob.observations
    for ob in obs:
      sstats0[ob.varId] += ob.value
      sstats0_counts[ob.varId] += 1.0
      sstats0_true_counts[ob.varId] += 1
      sstats1_var[ob.varId] += ob.value * ob.value
    del obs,ob
  tic("fillTTGraph: processed %d observations in pass #1"%count_traj_obs)
  # Compute the means
  means = {}
  for var in tt_graph.allVariables():
    var_id = var.varID
    count0 = sstats0_counts[var_id]
    mean = sstats0[var_id] / float(count0) if count0 > 0 else 0.0
    assert mean >= -EPSI # Specific to traffic problem
    means[var_id] = max(mean, 0.0)
    del mean, count0, var, var_id
  # Compute the variances
  variances = {}
  for var in tt_graph.allVariables():
    var_id = var.varID
    count0 = sstats0_counts[var_id]
    sstat1 = sstats1_var[var_id] / float(count0) if count0 > 0 else 0.0
    mean = means[var_id]
    variance_ = sstat1 - mean * mean
    assert variance_ >= -EPSI, (sstat1 - mean * mean, sstat1, mean)
    variance = max(variance_, min_variance)
    variances[var_id] = variance
    del var,count0,sstat1,variance_,variance,var_id
  # Update the gaussian parameters
  for var in tt_graph.allVariables():
    var_id = var.varID
    mean = means[var_id]
    variance = variances[var_id]
    var.parameters = GaussianParameters(mean, variance)
    tt_graph.variable_counts[var_id] = sstats0_true_counts[var_id]
   
  # Second moments for outer terms
  all_edge_keys = tt_graph.edges.keys()
  # Covariance term
  # pvid = pair of var_ids
  sstats1 = dict([(pvid, 0.0) for pvid in all_edge_keys])
  # First order stat for the start
  sstats1_from = dict([(pvid, variance_prior * variance_prior_count) 
                   for pvid in all_edge_keys])
  sstats1_to = dict([(pvid, variance_prior * variance_prior_count) 
                   for pvid in all_edge_keys])
  sstats0_from = dict([(pvid, 0.0) for pvid in all_edge_keys])
  sstats0_to = dict([(pvid, 0.0) for pvid in all_edge_keys])
  sstats1_counts = dict([(pvid, variance_prior_count) for pvid in all_edge_keys])
  sstats1_true_counts = dict([(pvid, 0) for pvid in all_edge_keys])
  
  # Fill the sufficient stats for the variance and the mean:
  # Updates the sufficient stats
  # hack to make sure we can run twice over the data
  if traj_obs_copy is None:
    traj_obs_copy=traj_obs
  count_traj_obs = 0
  for traj_ob in traj_obs_copy:
    count_traj_obs += 1
    if count_traj_obs % tic_n_obs == 0:
      tic("fillTTGraph: processed %d observations in pass #2"%count_traj_obs)
    obs = traj_ob.observations
    l = len(obs)
    for i in range(l):
      for j in range(i + 1, l):
        from_ob = obs[i]
        to_ob = obs[j]
        from_vid = from_ob.varId
        to_vid = to_ob.varId
        assert not ((from_vid, to_vid) in sstats1 and (to_vid, from_vid) in sstats1)
        if (from_vid, to_vid) not in sstats1 and (to_vid, from_vid) not in sstats1:
          continue
        # We may need to flip around the vids and values because of the ordering in
        # the variables
        if (from_vid, to_vid) in sstats1:
          from_vid_c = from_vid
          to_vid_c = to_vid
          from_val_c = from_ob.value
          to_val_c = to_ob.value
        else:
          assert (to_vid, from_vid) in sstats1
          from_vid_c = to_vid
          to_vid_c = from_vid
          from_val_c = to_ob.value
          to_val_c = from_ob.value
        # Update the stats
        key = (from_vid_c, to_vid_c)
        sstats1[key] += from_val_c * to_val_c
        sstats1_from[key] += from_val_c * from_val_c
        sstats1_to[key] += to_val_c * to_val_c
        sstats0_from[key] += from_val_c
        sstats0_to[key] += to_val_c
        sstats1_counts[key] += 1.0
        sstats1_true_counts[key] += 1
  tic("fillTTGraph: processed %d observations in pass #2"%count_traj_obs)
  
  # Compute the new covariance terms
  for key in all_edge_keys:
    count = sstats1_counts[key]
    # The local means
    local_mean_from = sstats0_from[key] / float(count) if count > 0 else 0.0
    local_mean_to = sstats0_to[key] / float(count) if count > 0 else 0.0
    # The local variances
    sstat1_from = sstats1_from[key] / float(count) if count > 0 else 0.0
    sstat1_to = sstats1_to[key] / float(count) if count > 0 else 0.0
    local_var_from_ = sstat1_from - (local_mean_from ** 2)
    local_var_to_ = sstat1_to - (local_mean_to ** 2)
    # This epsilon should prevent the assertion from failing due to rounding
    # errors and a low number of samples.
    local_var_from = max(local_var_from_, min_variance)+1e-7
    local_var_to = max(local_var_to_, min_variance)+1e-7
    # The local covariance term
    sstat1 = sstats1[key] / float(count) if count > 0 else 0.0
    local_cov = sstat1 - local_mean_from * local_mean_to
    assert abs(local_cov) <= np.sqrt(local_var_from * local_var_to)
    # The global variance terms
    (from_vid,to_vid) = key
    variance_from = variances[from_vid]
    variance_to = variances[to_vid]
    scale = np.sqrt((variance_from * variance_to) / (local_var_from * local_var_to))
    cov = scale * local_cov
    assert np.abs(cov) <= np.sqrt(variance_from * variance_to)+EPSI, \
      (np.abs(cov), np.sqrt(variance_from * variance_to))
    tt_graph.edges[key] = cov
    tt_graph.edge_counts[key] = sstats1_true_counts[key]
    del from_vid,to_vid,count,sstat1,local_mean_from,local_mean_to,sstat1_from,sstat1_to
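# fillTTGraph reduces each variable to streamed sufficient statistics (sum,
# sum of squares, count) before turning them into Gaussian parameters. A
# compact sketch of that reduction for a single variable:
def mean_var_from_sstats_sketch(s0, s1, count, min_variance=1e-2):
  # s0 = sum(x), s1 = sum(x * x), count = number of samples.
  mean = s0 / float(count) if count > 0 else 0.0
  var = (s1 / float(count) - mean * mean) if count > 0 else 0.0
  return (mean, max(var, min_variance))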
def learnMixtureAuto(tts, max_nb_mixtures):
  mixs = [learnMixture(tts, n) for n in range(1, min(len(tts) + 1, max_nb_mixtures + 1))]
  bics = [mix.bic(tts) for mix in mixs]
  best_idx = np.argmin(bics)
  tic("{0}:{1}".format(len(tts),best_idx+1),"learnMixtureAuto")
  return mixs[best_idx]
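# learnMixture and bic are project helpers. A hedged sketch of the same
# BIC-driven model selection with scikit-learn, assuming 1-D travel times
# (an illustration, not the original implementation):
def learn_mixture_auto_sketch(tts, max_nb_mixtures):
  import numpy as np
  from sklearn.mixture import GaussianMixture
  X = np.asarray(tts, dtype=float).reshape(-1, 1)
  models = [GaussianMixture(n_components=n).fit(X)
            for n in range(1, min(len(X), max_nb_mixtures) + 1)]
  # Lower BIC wins, matching the np.argmin above.
  return models[int(np.argmin([m.bic(X) for m in models]))]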