def conditional_lateness_prediction_intervals(rows=None,
                                              degrees_sep=(1, 5, 10, 20, 50),
                                              alpha=0.05):
    """
    Plots lateness prediction-interval bounds against the (bucketed)
    lateness observed a given number of stops earlier.

    Rows are grouped by 'sepdegree' and, within each group, by
    'conditional'. For every (sepdegree, conditional) group a weighted ECDF
    of the ('lateness', 'trip_stop_weight') pairs is computed and handed to
    find_pred_interval; the second pair it returns is plotted against the
    conditional value. A new figure is opened, with one labelled pair of
    curves per sepdegree.

    rows        -- pre-fetched rows supporting lookup of 'sepdegree',
                   'conditional', 'lateness' and 'trip_stop_weight'.
                   If None, they are selected from the database.
    degrees_sep -- stop-number differences to select; only used to build
                   the query when rows is None.
    alpha       -- forwarded to find_pred_interval; also determines the
                   percentage shown in the plot title.
    """
    if rows is None:
        print("Selecting...")
        sep_list = ",".join(map(str, degrees_sep))
        sql = (
            " select 30*(d1.lateness/30.0)::int as conditional,"
            " d2.stop_number-d1.stop_number as sepdegree,"
            " d2.lateness, trip_stop_weight"
            " from datamining_table d2"
            " natural join trip_stop_weights"
            " inner join datamining_table d1"
            " on d1.gps_segment_id = d2.gps_segment_id"
            " and d2.stop_number-d1.stop_number in (" + sep_list + ")"
            " and d2.lateness is not null"
            " and d1.lateness is not null "
        )
        cur = db.get_cursor()
        try:
            db.SQLExec(cur, sql)
            print("Retrieving...")
            rows = cur.fetchall()
        finally:
            # Release the cursor even when the query or the fetch fails.
            cur.close()
        print("%s rows retrieved." % len(rows))

    figure()

    sep_split = DM.split_on_attributes(('sepdegree',), rows)
    sds = array(sorted(k[0] for k in sep_split.keys()))

    # Largest separation is drawn first (and so gets the first color).
    for i, sd in enumerate(reversed(sds)):
        cond_split = DM.split_on_attributes(('conditional',),
                                            sep_split[(sd,)])
        conds = array(sorted(k[0] for k in cond_split.keys()))

        upup_preds = []
        lolo_preds = []
        for cond in conds:
            cond_rows = array([(r['lateness'], r['trip_stop_weight'])
                               for r in cond_split[(cond,)]])
            x, p, a_n = ecdf(cond_rows, weighted=True)
            # Only the second pair returned by find_pred_interval is drawn;
            # the first pair is unpacked to keep the shape check but is
            # otherwise unused.
            (_lower, _upper), (lolo, hihi) = find_pred_interval(
                x, p, a_n, alpha=alpha)
            upup_preds.append(hihi)
            lolo_preds.append(lolo)

        style = pcolors[i] + '+-'
        plot(conds, upup_preds, style, label="D.o.S=" + str(sd))
        plot(conds, lolo_preds, style, label=None)

    legend()
    xlabel("Conditional Lateness")
    ylabel("Lateness Prediction Interval")
    # %g instead of %d: %d truncates, so e.g. alpha=0.07 (which evaluates to
    # 92.99999999999999) was shown as 92%, and fractional levels were lost.
    title("%g%% Prediction Intervals vs. Stop Separation, Prev Lateness"
          % (100 * (1 - alpha),))
def compare_route_portion(rows=None):
    """
    Compares lateness distributions between portions of the route.

    Produces two plots:
      1. via compare_ecdfs, the ECDFs of the rows keyed by stop_number 1,
         by route_portion 50 or 51, and by stops_before_end 1;
      2. the weighted expected lateness (with its margin of error as
         dashed lines) against stop number, on a new figure.

    rows -- pre-fetched rows supporting lookup of 'stop_number',
            'stops_before_end', 'route_portion', 'lateness' and
            'trip_stop_weight'. If None, they are selected from the
            database.
    """
    if rows is None:
        sql = (
            " select stop_number, total_num_stops,"
            " total_num_stops-stop_number as stops_before_end,"
            " (100*stop_number::numeric/total_num_stops)::int as route_portion,"
            " lateness, trip_stop_weight"
            " from datamining_table dm"
            " natural join trip_stop_weights"
            " natural join gps_segments"
            " inner join (select count(*) as total_num_stops, trip_id"
            " from gtf_stop_times"
            " group by trip_id) ns"
            " on ns.trip_id = dm.gtfs_trip_id"
            " where lateness is not null "
        )
        cur = db.get_cursor()
        try:
            print("Selecting...")
            db.SQLExec(cur, sql)
            print("Retrieving...")
            rows = cur.fetchall()
        finally:
            # Release the cursor even when the query or the fetch fails.
            cur.close()
        print("%s rows fetched." % len(rows))

    # Plot ECDF comparisons
    stop_num_split = DM.split_on_attributes(('stop_number',), rows)
    end_num_split = DM.split_on_attributes(('stops_before_end',), rows)
    halfway_split = DM.split_on_attributes(('route_portion',), rows)

    # NOTE(review): which physical stops these keys select depends on
    # whether stop_number is 0- or 1-based -- confirm the labels match.
    cdf_dict = {
        "Second stop": stop_num_split[(1,)],
        "Middle stop": halfway_split[(50,)] + halfway_split[(51,)],
        "Next to last stop": end_num_split[(1,)],
    }
    compare_ecdfs("Stop Position", cdf_dict)

    # Plot E vs stop number
    sns = array(sorted(k[0] for k in stop_num_split.keys()))
    Es = []
    moes = []
    for sn in sns:
        rowdata = array([(r['lateness'], r['trip_stop_weight'])
                         for r in stop_num_split[(sn,)]])
        Eval, moe = E(rowdata, weighted=True)
        Es.append(Eval)
        moes.append(moe)
    Es = array(Es)
    moes = array(moes)

    figure()
    plot(sns, Es, 'k-', label="Estimated expectation")
    plot(sns, Es + moes, 'k--', label=None)
    plot(sns, Es - moes, 'k--', label=None)
    xlabel("Stop Number")
    ylabel("Expected Lateness")
    title("Expected Lateness vs Stop Number")
def measure_slowness_correlation(rows=None):
    """
    Measures the correlation of normalized travel time between sequential
    segments.

    Iterates over rows, collects the ('prev_stds', 'meannorm_ssls') pairs
    in which both values are non-NULL, and computes their correlation with
    R's 'cor'. The row source is closed before returning.

    rows -- iterable row source exposing .rowcount and .close(), whose rows
            support lookup of 'prev_stds' and 'meannorm_ssls'. If None, it
            is obtained from DM.get_joined_rows with degree_of_sep=1.

    Returns (corr, prev_stds, stds): corr is the R result converted with
    array(); prev_stds and stds are buffers of length rows.rowcount whose
    leading entries hold the complete pairs (the remainder stays 0).
    """
    if rows is None:
        rows = DM.get_joined_rows(prev_attrs=("meannorm_ssls as prev_stds",),
                                  degree_of_sep=1)

    try:
        total = rows.rowcount
        # Report progress roughly every 1% of the rows. The step is forced
        # to at least 1 so result sets with fewer than 100 rows do not end
        # up taking a modulus by zero.
        step = (total // 100) or 1

        prev_stds = zeros(total)
        stds = zeros(total)
        n = 0
        print("running manually...")
        for k, row in enumerate(rows):
            if k % step == 0:
                print("%5.2f%% (%d/%d) Done (n=%d)..."
                      % (100 * float(k) / total, k, total, n))
            prev_val = row['prev_stds']
            cur_val = row['meannorm_ssls']
            # Store only complete pairs, so a row with a NULL never leaves
            # a stray value in the slot just past the valid data.
            if prev_val is not None and cur_val is not None:
                prev_stds[n] = prev_val
                stds[n] = cur_val
                n += 1

        print("Calculating correlation...")
        Rpstds = R.FloatVector(prev_stds[:n])
        Rstds = R.FloatVector(stds[:n])
        corr = R.r['cor'](Rpstds, Rstds)
    finally:
        # Close the row source even if iteration or the R call fails.
        rows.close()

    return array(corr), prev_stds, stds
def compare_ecdfs(attrsplit, rows,
                  plot_CIs=False, plot_Es=False, plot_E_CIs=False,
                  col_name='lateness', alpha=0.05):
    """
    Draws, on a new figure, one weighted ECDF of col_name per partition of
    the rows, and prints each partition's ECDF margin of error, expectation
    and expectation margin of error.

    attrsplit  -- attributes to split rows on; when rows is already a dict
                  of partitions this is only used in the plot title.
    rows       -- rows to split, or a dict mapping partition key to rows.
                  Each row must support lookup of col_name and
                  'trip_stop_weight'.
    plot_CIs   -- also draw the ECDF -/+ its margin of error (dashed).
    plot_Es    -- also draw a vertical line at each expectation.
    plot_E_CIs -- also draw dashed vertical lines at the expectation -/+
                  its margin of error.
    col_name   -- column whose distribution is compared.
    alpha      -- overall level; each partition is evaluated at
                  alpha/len(partitions) (Bonferroni).
    """
    if not isinstance(rows, dict):
        print("Splitting...")
        split = DM.split_on_attributes(attrsplit, rows)
        print("OK.")
    else:
        split = rows

    figure()

    def draw_vertical(xpos, fmt):
        # Vertical line spanning the full CDF range at xpos.
        plot((xpos, xpos), (0, 1), fmt, label=None)

    for i, key in enumerate(split.keys()):
        weighted_vals = array([(r[col_name], r['trip_stop_weight'])
                               for r in split[key]])
        color = pcolors[i]

        # Bonferroni-corrected level for this partition's ECDF.
        x, p, a_n = ecdf(weighted_vals, weighted=True,
                         alpha=alpha / len(split))
        plot(x, p, color, label=str(key))
        print("%s ECDF MOE: %s" % (key, a_n))
        if plot_CIs:
            for band in (p - a_n, p + a_n):
                plot(x, band, color + '--', label=None)

        # Same correction for the expectation estimate.
        E_bar, moe = E(weighted_vals, weighted=True,
                       alpha=alpha / len(split))
        print("%s E: %s , E MOE: %s" % (key, E_bar, moe))
        if plot_Es:
            draw_vertical(E_bar, color)
        if plot_E_CIs:
            draw_vertical(E_bar - moe, color + '--')
            draw_vertical(E_bar + moe, color + '--')

    xlabel(col_name)
    ylabel("CDF(" + col_name + ")")
    title("ECDF of " + col_name + " partitioned by " + str(attrsplit))