# Imports required by the parametrisation functions below; get_mid_data,
# save_params and calc_zustandsverteilung are assumed to be project-local helpers.
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit


def get_distances(type_day):
    data = get_mid_data(type_day)
    step = 96
    # store trip distances in a nested list;
    # entry ij holds the trips from state i to state j
    # initialise the 5x5x96 list
    wege_ij = [[[[] for t in range(step)] for j in range(5)] for i in range(5)]
    for i in range(5):
        for j in range(5):
            for t in range(step):
                # filter the dataframe for the origin/destination state combination
                filt = (data["Whyfrom"] == i) & (data["Whyto"] == j) & (data["Departure_t"] == t)
                # store the list of distances between the states in the matching cell
                wege_ij[i][j][t] = list(data[filt]["Distance"])

    # determine the absolute frequencies of the trip distances
    wege_ij_count = [[[[] for t in range(step)] for j in range(5)] for i in range(5)]
    wege_ij_prob_dict = [[[{} for t in range(step)] for j in range(5)] for i in range(5)]
    for i in range(5):
        for j in range(5):
            for t in range(step):
                wege_ij_count[i][j][t] = Counter(wege_ij[i][j][t])

    # convert to relative frequencies and store in a dictionary (value : rel. frequency)
    for i in range(5):
        for j in range(5):
            for t in range(step):
                total = sum(wege_ij_count[i][j][t].values())
                for key in wege_ij_count[i][j][t]:
                    wege_ij_prob_dict[i][j][t][key] = wege_ij_count[i][j][t][key] / total

    # replace empty dictionaries with the average of the two surrounding dictionaries
    for i in range(5):
        for j in range(5):
            for t in range(step):
                if not wege_ij_prob_dict[i][j][t]:
                    new = {}
                    t_prior, t_next = t, t
                    # if the preceding or succeeding dictionary is empty as well, keep
                    # moving outwards (wrapping around midnight); this assumes at least
                    # one non-empty time step exists for the (i, j) pair, otherwise the
                    # search would not terminate
                    while True:
                        t_prior = t_prior - 1 if t_prior != 0 else step - 1
                        preceding = wege_ij_prob_dict[i][j][t_prior]
                        if preceding:
                            break
                    while True:
                        t_next = t_next + 1 if t_next != step - 1 else 0
                        succeeding = wege_ij_prob_dict[i][j][t_next]
                        if succeeding:
                            break
                    for key in preceding:
                        new[key] = preceding[key] / 2
                    for key in succeeding:
                        if key in new:
                            new[key] = new[key] + succeeding[key] / 2
                        else:
                            new[key] = succeeding[key] / 2
                    wege_ij_prob_dict[i][j][t] = new

    save_params(type_day, "Zeitabhängige Wegstrecken", wege_ij_prob_dict)
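
# A minimal usage sketch, not part of the original pipeline: drawing a random
# trip distance from the distribution built above. `sample_distance` is a
# hypothetical helper; i, j and t stand for an origin state, destination state
# and departure time step.
def sample_distance(wege_ij_prob_dict, i, j, t):
    import random
    dist = wege_ij_prob_dict[i][j][t]
    # random.choices draws one value proportionally to its relative frequency
    return random.choices(list(dist), weights=list(dist.values()))[0]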
def get_speed(type_day, zeitabhängig=True):
    if zeitabhängig:
        data = get_mid_data(type_day)
        # show all columns when inspecting the dataframe
        pd.set_option('display.max_columns', None)
        # steps of 8 periods -> parametrise the function in two-hour intervals
        steps = np.arange(0, 97, 8)
        data_grpd = [[] for i in range(len(steps) - 1)]
        for i in range(len(steps) - 1):
            filt = (data["Departure_t"] < steps[i + 1]) & (data["Departure_t"] >= steps[i])
            data_grpd[i] = data[filt]

        def func(x, a, b):
            return a + b * np.log(x)

        def fit_plot_curve(data, start, end):
            av_speeds = []
            for i in range(1, 150):
                # filter for all distances of the form i.xx
                filt = (data["Distance"] - i > 0) & (data["Distance"] - i < 1)
                # take the median of all speeds within the distance interval
                av_speed = data[filt]["Av_speed"].median()
                if not np.isnan(av_speed):
                    av_speeds.append((i, av_speed))
            # x = distance, y = speed
            x, y = zip(*av_speeds)
            # fit the curve to the measured values
            popt, pcov = curve_fit(func, x, y)
            x_func = np.linspace(1, 150, 149)
            fitted_curve = [func(x_val, *popt) for x_val in x_func]
            # plot the curve
            plt.plot(x_func, fitted_curve, label="Intervall von {} bis {} Uhr".format(start, end))
            plt.xlabel("Distanz in km")
            plt.ylabel("Geschwindigkeit in km/h")
            # determine the lower bound of the speeds
            lower_bound = data["Av_speed"].quantile(0.05)
            popt = np.append(popt, lower_bound)
            return popt

        plt.figure(figsize=(30, 20))
        params = [[] for i in range(len(data_grpd))]
        for i, group in enumerate(data_grpd):
            params[i] = fit_plot_curve(group, i * 2, i * 2 + 2)
        plt.legend()
        # save the results
        save_params(type_day, "Zeitabhängige Geschwindigkeit", params)
    else:
        print("Noch nicht ergänzt")
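
# A small sketch (an assumption, not original code) of how the saved speed
# parameters could be evaluated: each entry of `params` holds (a, b, lower_bound)
# for one two-hour interval k, so the expected speed for a trip of `distance` km
# is a + b*ln(distance), floored at the 5%-quantile lower bound.
def expected_speed(params, k, distance):
    a, b, lower_bound = params[k]
    return max(a + b * np.log(distance), lower_bound)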
def get_departure(type_day):
    data = get_mid_data(type_day)
    filt = data["Trip_no"] == 1
    # all departure times of the first trips of the day
    first_trip = data[filt]["Departure"]
    # dictionary of "time : absolute frequency"
    first_trip_count = Counter(first_trip)
    # new dictionary of "time : relative frequency"
    time_prob_dict = {}
    total = sum(first_trip_count.values())
    for key in first_trip_count:
        time_prob_dict[key] = first_trip_count[key] / total
    # save the result
    save_params(type_day, "Abfahrtszeit", time_prob_dict)
def get_stopoverprobs(type_day):
    df = get_mid_data(type_day)
    filt = df["Whyto"] == 0
    df_filt = df[filt]
    index_stopover = []
    index_final = []
    # split home trips into final stops and stopovers
    for i in df_filt.index:
        if (i + 1 not in df.index) or (df.at[i + 1, "ID"] != df.at[i, "ID"]):
            index_final.append(i)
        else:
            index_stopover.append(i)
    df_final = df.loc[index_final]
    df_stopover = df.loc[index_stopover]
    # count the trips per departure time step
    trips_t_final = [0 for i in range(96)]
    trips_t_stopover = [0 for i in range(96)]
    # trips whose home stay ends the travel day
    for i in df_final.index:
        # departure time interval of the trip
        t = df_final.at[i, "Departure_t"]
        # increment the final-stay counter of the matching time interval
        trips_t_final[t] += 1
    # trips whose home stay is a stopover
    for i in df_stopover.index:
        t = df_stopover.at[i, "Departure_t"]
        # increment the stopover counter of the matching time interval
        trips_t_stopover[t] += 1
    # convert counts to probabilities; keep the raw counts so that empty
    # periods can fall back to the counts of the neighbouring periods
    final_counts = trips_t_final[:]
    stopover_counts = trips_t_stopover[:]
    for t in range(96):
        total = final_counts[t] + stopover_counts[t]
        if total:
            trips_t_final[t] = final_counts[t] / total
            trips_t_stopover[t] = stopover_counts[t] / total
        else:
            # no trips in this period: estimate the probabilities from the pooled
            # counts of the previous and next period (wrapping around midnight)
            t_prev, t_next = (t - 1) % 96, (t + 1) % 96
            total = (final_counts[t_prev] + stopover_counts[t_prev]
                     + final_counts[t_next] + stopover_counts[t_next])
            trips_t_final[t] = (final_counts[t_prev] + final_counts[t_next]) / total
            trips_t_stopover[t] = (stopover_counts[t_prev] + stopover_counts[t_next]) / total
    # save the results
    save_params(type_day, "Zwischenstoppwk", trips_t_final)
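
# Sketch (an assumption): `trips_t_final[t]` is the probability that a home trip
# departing in time step t ends the person's travel day, so a simulation can
# decide between "final stay" and "stopover" with a single Bernoulli draw.
def is_final_stay(trips_t_final, t):
    import random
    return random.random() < trips_t_final[t]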
def update_all():
    for type_day in range(1, 4):
        data = get_mid_data(type_day)
        get_stayduration(type_day)
        get_speed(type_day)
        get_departure(type_day)
        get_transition_probs(type_day)
        get_distances(type_day)
        states = calc_zustandsverteilung(data)
        save_params(type_day, "Zustandsverteilung", states)
        get_stopoverprobs(type_day)
def get_transition_probs(type_day):
    data = get_mid_data(type_day)
    states_grpd = [None for i in range(5)]
    for i in range(5):
        # filter by origin state
        state = data[data["Whyfrom"] == i]
        # group by departure time step
        states_grpd[i] = state.groupby("Departure_t")
    tp_itj = [[[0 for j in range(5)] for t in range(96)] for i in range(5)]
    for i in range(5):
        for t, group in states_grpd[i]:
            # relative frequencies of the transitions from origin state i
            # to the other states in time step t
            counts = group["Whyto"].value_counts(normalize=True)
            # assign the relative transition frequencies to the matching entries
            for j in range(5):
                tp_itj[i][t][j] = counts.get(j, 0)

    # replace missing transition probabilities with a uniform distribution (0.2)
    def replace_missing_probs(tp_itj):
        for t in range(96):
            for i in range(5):
                total = sum(tp_itj[i][t][j] for j in range(5))
                if total == 0:
                    for j in range(5):
                        tp_itj[i][t][j] = 0.2

    replace_missing_probs(tp_itj)
    # save the data
    save_params(type_day, "Übergangswahrscheinlichkeiten", tp_itj)
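
# Sketch (an assumption): with tp_itj in place, one step of the resulting
# time-inhomogeneous Markov chain picks the next activity state j given the
# current state i and time step t.
def next_state(tp_itj, i, t):
    # the row tp_itj[i][t] sums to 1 by construction
    return int(np.random.choice(5, p=tp_itj[i][t]))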
# Imports assumed by the training script below; du (data utils), util, mb
# (minibatching), evals and tf (the Theano function builder) are project-local modules.
import logging
import time

import numpy as np
import lasagne


def main(args):
    logging.info("loading data...")
    fake_train, fake_dev, fake_test = du.load_fake()
    true_train, true_dev, true_test = du.load_true()
    if args.debug:
        true_train = [true_train[0][:100]]
        fake_train = fake_train[:10]
        true_dev = true_dev[:100]
        fake_dev = fake_dev[:10]
        true_test = true_test[:100]
        fake_test = fake_test[:10]
    if args.rnn_type == 'gru':
        args.rnn = lasagne.layers.GRULayer
    elif args.rnn_type == 'lstm':
        args.rnn = lasagne.layers.LSTMLayer
    else:
        args.rnn = lasagne.layers.RecurrentLayer

    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(
        None, max_words=0, dict_file=["word_dict", "char_dict"])

    logging.info("creating embedding matrix...")
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    char_embed = util.char2embedding(char_dict, 30)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
    (args.char_vocab_size, args.char_embed_size) = char_embed.shape

    logging.info("compiling Theano function...")
    att_fn, eval_fn, train_fn, params = \
        tf.char_hierarchical_linguistic_fn(args, word_embed, char_embed, values=None)

    logging.info("batching examples...")
    dev_examples = mb.vec_minibatch(fake_dev + true_dev, word_dict, char_dict, args, False)
    test_examples = mb.vec_minibatch(fake_test + true_test, word_dict, char_dict, args, False)
    train_examples = mb.train_doc_minibatch(fake_train, true_train, args, over_sample=True)

    logging.info("checking network...")
    dev_acc = evals.eval_vec_batch(eval_fn, dev_examples)
    print('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
    test_acc = evals.eval_vec_batch(eval_fn, test_examples)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)

    prev_fsc = 0
    stop_count = 0
    best_fsc = 0
    best_acc = 0
    record = ''
    logging.info("training %d examples" % len(train_examples))
    start_time = time.time()
    n_updates = 0

    for epoch in range(args.epoches):
        np.random.shuffle(train_examples)
        # from epoch 4 on, recompile with a decayed learning rate, reusing the
        # current parameter values
        if epoch > 3:
            logging.info("compiling Theano function again...")
            args.learning_rate *= 0.9
            att_fn, eval_fn, train_fn, params = \
                tf.char_hierarchical_linguistic_fn(
                    args, word_embed, char_embed,
                    values=[x.get_value() for x in params])
        for batch_x, _ in train_examples:
            batch_x, batch_sent, batch_doc, batch_y = zip(*batch_x)
            batch_x = util.vectorization(
                list(batch_x), word_dict, char_dict, max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_sent = util.sent_ling_padding(
                list(batch_sent), args.max_sent, args.max_ling)
            batch_doc = util.doc_ling_padding(list(batch_doc), args.max_ling)
            batch_y = np.array(list(batch_y))
            train_loss = train_fn(
                batch_rnn, batch_cnn, batch_word_mask, batch_sent_mask,
                batch_sent, batch_doc, batch_y)
            n_updates += 1
            if n_updates % 100 == 0 and epoch > 6:
                logging.info(
                    'Epoch = %d, loss = %.2f, elapsed time = %.2f (s)'
                    % (epoch, train_loss, time.time() - start_time))
                dev_acc = evals.eval_vec_batch(eval_fn, dev_examples)
                logging.info('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
                if dev_acc[3] >= best_fsc and dev_acc[0] > best_acc:
                    best_fsc = dev_acc[3]
                    best_acc = dev_acc[0]
                    logging.info(
                        'Best dev f1: epoch = %d, n_updates = %d, f1 = %.2f %%'
                        % (epoch, n_updates, dev_acc[3]))
                    record = 'Best dev accuracy: epoch = %d, n_updates = %d ' \
                        % (epoch, n_updates) \
                        + ' Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc
                    test_acc = evals.eval_vec_batch(eval_fn, test_examples)
                    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
                    if test_acc[3] > 91.4:
                        util.save_params(
                            'char_hierarchical_rnn_params_%.2f_%.2f'
                            % (dev_acc[3], test_acc[3]),
                            params, epoch=epoch, n_updates=n_updates)
                # early-stopping counter; note that as written it only reports
                # and does not interrupt training
                if prev_fsc > dev_acc[3]:
                    stop_count += 1
                else:
                    stop_count = 0
                if stop_count == 6:
                    print("stopped")
                prev_fsc = dev_acc[3]

    print(record)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
def get_stayduration(type_day):
    data = get_mid_data(type_day)
    aufenthalt_it = [[[] for t in range(96)] for i in range(5)]
    # store the stay durations of the individual states per time step
    for i in range(1, 5):
        for t in range(96):
            filt = (data["Whyto"] == i) & (data["Arrival_t"] == t)
            aufenthalt_it[i][t] = data[filt]["Stay_duration"]
    # for the home state, only keep the stay durations of stopovers
    index_stopover = []
    for i in data[data["Whyto"] == 0].index:
        # keep only trips ending at home that are followed by further trips
        # of the same person (stopovers)
        if (i + 1 in data.index) and (data.at[i + 1, "ID"] == data.at[i, "ID"]):
            index_stopover.append(i)
    trips_stopover = data.loc[index_stopover]
    for t in range(96):
        filt = trips_stopover["Arrival_t"] == t
        aufenthalt_it[0][t] = trips_stopover[filt]["Stay_duration"]
    # determine the distinct stay durations and their absolute frequencies
    aufenthalt_counts = [[{} for t in range(96)] for i in range(5)]
    aufenthalt_val_prob = [[{} for t in range(96)] for i in range(5)]
    for i in range(5):
        for t in range(96):
            aufenthalt_counts[i][t] = Counter(aufenthalt_it[i][t])
    # convert to relative frequencies
    for i in range(5):
        for t in range(96):
            total = sum(aufenthalt_counts[i][t].values())
            for key in aufenthalt_counts[i][t]:
                aufenthalt_val_prob[i][t][key] = aufenthalt_counts[i][t][key] / total
    # replace empty dictionaries with the average of the two surrounding dictionaries
    for i in range(5):
        for t in range(96):
            if not aufenthalt_val_prob[i][t]:
                new = {}
                t_prior, t_next = t, t
                # if the preceding or succeeding dictionary is empty as well, keep
                # moving outwards (wrapping around midnight); this assumes at least
                # one non-empty time step exists for state i, otherwise the search
                # would not terminate
                while True:
                    t_prior = t_prior - 1 if t_prior != 0 else 95
                    preceding = aufenthalt_val_prob[i][t_prior]
                    if preceding:
                        break
                while True:
                    t_next = t_next + 1 if t_next != 95 else 0
                    succeeding = aufenthalt_val_prob[i][t_next]
                    if succeeding:
                        break
                for key in preceding:
                    new[key] = preceding[key] / 2
                for key in succeeding:
                    if key in new:
                        new[key] = new[key] + succeeding[key] / 2
                    else:
                        new[key] = succeeding[key] / 2
                aufenthalt_val_prob[i][t] = new
    # save the result
    save_params(type_day, "Zeitabhängige Aufenthaltsdauern", aufenthalt_val_prob)