def get_rows(rsubset=0.1):
    print "Selecting..."
    cur = db.get_cursor()
    if rsubset is not None:
        db.SQLExec(
            cur, """select * from datamining_table dm
               where random() < %f""" % (rsubset, ))
    else:
        db.SQLExec(cur, "select * from datamining_table")

    return cur
Beispiel #2
0
def criticalmass_compare_rows():
  """
  Selects and pairs rows from the same trip where one is from
  March 27 and the others are not (this is a many-to-one pairing),
  and subtracts their latenesses as the column 'ldiff'.

  Also selects all latenesses with a column 'is_cmass' indicating
  0 for not-cmass days and 1 for cmass days, for an ecdf comparison
  split.
  """

  print "Selecting..."
  cur = db.get_cursor()
  db.SQLExec(cur,
             """
select (dm_cm.lateness - dm.lateness) as ldiff, trip_stop_weight
from 
datamining_table dm_cm natural join trip_stop_weights
inner join gps_segments gs_cm on dm_cm.gps_segment_id=gs_cm.gps_segment_id
  and gs_cm.trip_date = '2009-03-27'
inner join datamining_table dm on dm.gtfs_trip_id=dm_cm.gtfs_trip_id
  and dm.stop_sequence=dm_cm.stop_sequence
inner join gps_segments gs on dm.gps_segment_id=gs.gps_segment_id
  and gs.trip_date != '2009-03-27'
where dm_cm.lateness is not null and dm.lateness is not null
and dm_cm.route_name in ('1','9','19')
             """);
  print "Retrieving..."
  diffrows=cur.fetchall();
  print len(diffrows),"rows retrieved."

  print "Selecting..."
  db.SQLExec(cur,
             """
select lateness, trip_stop_weight,
case when trip_date='2009-03-27' then 1
     else                             0
end as is_cmass
from datamining_table natural join gps_segments 
  natural join trip_stop_weights
where lateness is not null and service_id='1'
""");
  print "Retrieving..."
  rows = cur.fetchall();
  print len(rows),'rows retrieved.'
  cur.close();

  return diffrows,rows
Beispiel #3
0
def conditional_lateness_plots(rows=None,
                               degrees_sep=1,
                               conds=(0,60,300,600,1200)):
  """
  Plots the (weighted) conditional lateness distribution as
  
    F( lateness at stop | lateness at Dth stop previous )
    
  where D = degrees_sep. This is plotted as conditioned on each of
  the latenesses provided in conds.
  """

  if rows is None:
    cur = db.get_cursor();
    print "Selecting..."
    db.SQLExec(cur,"""select d1.lateness as cond,d2.lateness,trip_stop_weight 
from datamining_table d1 inner join datamining_table d2
  on d1.gps_segment_id=d2.gps_segment_id
  and d1.stop_number+%(deg_sep)s=d2.stop_number
  and d1.lateness in (""" + ",".join(map(str,conds)) + """)
  and d2.lateness is not null
inner join trip_stop_weights tsw on d2.gtfs_trip_id = tsw.gtfs_trip_id
  and d2.stop_id = tsw.stop_id
""",
               {'deg_sep':degrees_sep});
    print "Retrieving..."
    rows = cur.fetchall()
    cur.close()
    print len(rows),"rows retrieved."

  try:
    compare_ecdfs(('cond',),rows)
  except e:
    print e
  return rows
Beispiel #4
0
def conditional_lateness_gained(rows=None,
                               degrees_sep=1,
                               conds=(-6,0,6,60,120)):
  
  if rows is None:
    print "Selecting..."
    cur = db.get_cursor()
    db.SQLExec(cur,"""select d1.lateness_gained as cond,d2.lateness_gained,
trip_stop_weight 
from datamining_table d1 inner join datamining_table d2
  on d1.gps_segment_id=d2.gps_segment_id
  and d1.stop_number+%(deg_sep)s=d2.stop_number
  and d1.lateness_gained in (""" + ",".join(map(str,conds)) + """)
  and d2.lateness_gained is not null
inner join trip_stop_weights tsw on d2.gtfs_trip_id = tsw.gtfs_trip_id
  and d2.stop_id = tsw.stop_id
""",
               {'deg_sep':degrees_sep});
    print "Retrieving..."
    rows = cur.fetchall()
    cur.close()
    print len(rows),"rows retrieved."

  try:
    compare_ecdfs(('cond',),rows,col_name='lateness_gained',
                  plot_CIs=True,plot_Es=False,plot_E_CIs=False)
  except e:
    print e
  return rows
Beispiel #5
0
def conditional_lateness_prediction_intervals(rows=None,
                                              degrees_sep=(1,5,10,20,50),
                                              alpha=0.05):

  if rows is None:
    print "Selecting..."
    cur = db.get_cursor()
    db.SQLExec(cur,"""
select 30*(d1.lateness/30.0)::int as conditional, d2.stop_number-d1.stop_number as sepdegree,
  d2.lateness, trip_stop_weight
from datamining_table d2
natural join trip_stop_weights
inner join datamining_table d1
  on d1.gps_segment_id = d2.gps_segment_id
  and d2.stop_number-d1.stop_number in (""" + \
                 ",".join(map(str,degrees_sep)) + """)
  and d2.lateness is not null and d1.lateness is not null
""")
    print "Retrieving..."
    rows = cur.fetchall()
    cur.close()
    print len(rows),"rows retrieved."

  figure()

  sep_split = DM.split_on_attributes(('sepdegree',),rows)
  sds = array([k[0] for k in sep_split.keys()])
  sds.sort()
  for i,sd in enumerate(reversed(sds)):
    sdrows = sep_split[(sd,)]
    cond_split = DM.split_on_attributes(('conditional',),sdrows)
    conds = array([k[0] for k in cond_split.keys()])
    conds.sort()
    
    upper_preds = []
    lower_preds = []
    upup_preds = []
    lolo_preds = []
    for cond in conds:
      cond_rows = array([(r['lateness'],r['trip_stop_weight']) 
                         for r in cond_split[(cond,)]])
      x,p,a_n = ecdf(cond_rows,weighted=True)
      (lower,upper),(lolo,hihi) = find_pred_interval(x,p,a_n,alpha=alpha)
      upper_preds.append(upper)
      lower_preds.append(lower)
      

      upup_preds.append(hihi)
      lolo_preds.append(lolo)

    #plot(conds,upper_preds,pcolors[i],label="D.o.S="+str(sd))
    #plot(conds,lower_preds,pcolors[i],label=None)
    plot(conds,upup_preds,pcolors[i]+'+-',label="D.o.S="+str(sd))
    plot(conds,lolo_preds,pcolors[i]+'+-',label=None)
      
  legend()
  xlabel("Conditional Lateness")
  ylabel("Lateness Prediction Interval")
  title("%d%% Prediction Intervals vs. Stop Separation, Prev Lateness"%(100*(1-alpha),))
Beispiel #6
0
    def __init__(self, dbconn, autoFill=False, autoCommit=False):
        """
    Creates a ServiceDateHandler using the database from dbconn.
    If autoFill is True, then any missing service combinations
    are added to the database. If autoCommit is True, these
    changes will be committed immediately.
    """
        cur = db.get_cursor()

        ## Prepare calendar data

        db.SQLExec(
            cur, """select monday,tuesday,wednesday,thursday,friday,saturday,
                 sunday, service_id, start_date, end_date from gtf_calendar""")
        self.calendar_rows = cur.fetchall()
        db.SQLExec(cur, """select * from gtf_calendar_dates""")
        self.calendar_date_rows = cur.fetchall()

        ## Load existing combos

        db.SQLExec(
            cur, """select * from service_combinations 
                   order by combination_id, service_id""")
        service_combo_rows = cur.fetchall()

        self.combos = {}
        # map from combo_id to combo
        # note that combo lists are sorted by service_id
        for row in service_combo_rows:
            service_id, combo_id = row['service_id'], int(
                row['combination_id'])
            if not self.combos.has_key(combo_id):
                self.combos[combo_id] = []
            self.combos[combo_id].append(service_id)

        # map from combo to combo_id (reverse of self.combos)
        self.existing_combos = {}
        for combo_id in self.combos:
            self.existing_combos[tuple(self.combos[combo_id])] = int(combo_id)

        cur.close()

        ## Fill in missing combos

        if autoFill:
            self.fill_unique_service_combinations(dbconn, autoCommit)
Beispiel #7
0
def compare_route_portion(rows=None):
  """Compares lateness distributions between portions of the route"""

  if rows is None:
    cur = db.get_cursor()
    print "Selecting..."
    sql = """
select stop_number, total_num_stops, total_num_stops-stop_number as stops_before_end, (100*stop_number::numeric/total_num_stops)::int as route_portion, lateness, trip_stop_weight 
from datamining_table dm 
  natural join trip_stop_weights 
  natural join gps_segments 
  inner join (select count(*) as total_num_stops, trip_id 
              from gtf_stop_times 
              group by trip_id) ns 
    on ns.trip_id = dm.gtfs_trip_id 
where lateness is not null
"""
    db.SQLExec(cur,sql)
    print "Retrieving..."
    rows = cur.fetchall()
    cur.close()
    print len(rows),'rows fetched.'

  # Plot ECDF comparisons
  stop_num_split = DM.split_on_attributes(('stop_number',),rows)
  end_num_split = DM.split_on_attributes(('stops_before_end',),rows)
  halfway_split = DM.split_on_attributes(('route_portion',),rows)

  cdf_dict = { "Second stop" : stop_num_split[(1,)],
               "Middle stop" : halfway_split[(50,)]+halfway_split[(51,)],
               "Next to last stop" : end_num_split[(1,)] }
  compare_ecdfs("Stop Position",cdf_dict);

  # Plot E vs stop number
  Es = []
  moes = []
  sns = array([k[0] for k in stop_num_split.keys()])
  sns.sort()
  for sn in sns:
    rowdata = array([(r['lateness'],r['trip_stop_weight']) for r in stop_num_split[(sn,)]])
    Eval,moe = E(rowdata,weighted=True)
    Es.append(Eval)
    moes.append(moe)
  Es = array(Es)
  moes = array(moes)
  
  figure()
  plot(sns,Es,'k-',label="Estimated expectation")
  plot(sns,Es+moes,'k--',label=None)
  plot(sns,Es-moes,'k--',label=None)
  #legend()
  xlabel("Stop Number")
  ylabel("Expected Latenes")
  title("Expected Lateness vs Stop Number")
Beispiel #8
0
def depict_predinterval_calculation(rows=None,degsep=1,cond=60,alpha=0.05):
  """
  Creates a plot explaining how the prediction interval calculations
  work.
  """
  
  if rows is None:
    print "Selecting..."
    cur = db.get_cursor()
    db.SQLExec(cur,"""
select d2.lateness, trip_stop_weight
from datamining_table d2
natural join trip_stop_weights
inner join datamining_table d1
  on d1.gps_segment_id = d2.gps_segment_id
  and d2.stop_number-d1.stop_number=%(degsep)s
  and d1.lateness = %(cond)s
  and d2.lateness is not null and d1.lateness is not null
""",  {'degsep':degsep,'cond':cond});

    print "Retrieving..."
    rows = cur.fetchall();
    cur.close()
    print len(rows),"rows retrieved."


  figure()

  rowdata = array([(r['lateness'],r['trip_stop_weight']) for r in rows])
  x,p,a_n = ecdf(rowdata,weighted=True)
  
  plot(x,p,'k-',label="Conditional ECDF")
  plot(x,p+a_n,'k--',label="ECDF 95% CI")
  plot(x,p-a_n,'k--',label=None)
  
  (lower,upper),(lolo,upup) = find_pred_interval(x,p,a_n,alpha=alpha)

  plot( (lower,lower),(0,alpha/2), 'r-',label="Lower interval bound")
  plot( (-2000,lower),(alpha/2,alpha/2), 'r-',label=None)

  plot( (upper,upper),(0,1-alpha/2),'g-',label="Upper interval bound")
  plot( (-2000,upper),(1-alpha/2,1-alpha/2),'g-',label=None)

  plot( (lolo,lolo),(0,alpha/2), 'c-',label="Lower bound CI")
  #plot( (-2000,lolo),(alpha/2,alpha/2), 'c-',label=None)

  plot( (upup,upup),(0,1-alpha/2),'m-',label="Upper bound CI")
  #plot( (-2000,upup),(1-alpha/2,1-alpha/2),'m-',label=None)

  legend()
  xlabel("Lateness")
  ylabel("ECDF(Lateness)")
  title("Prediction Interval Calculation")
def get_joined_rows(prev_attrs=(), degree_of_sep=1):
    print "Selecting..."
    if prev_attrs:
        sql = """select d2.*,""" + ",".join(
            map(lambda a: "d1." + a, prev_attrs))
    else:
        sql = """select d2.* """
    sql += """
from datamining_table d1 inner join datamining_table d2
on d1.gps_segment_id = d2.gps_segment_id
and d1.stop_number+""" + str(degree_of_sep) + """=d2.stop_number"""

    cur = db.get_cursor()
    db.SQLExec(cur, sql)
    return cur
def get_routehops():
    print "Selecting..."
    cur = db.get_cursor()
    db.SQLExec(
        cur, """select *
     from datamining_table dm 
     natural join random_gps_segments rgs
     inner join gps_segments gs on dm.gps_segment_id=gs.gps_segment_id
     where lateness is not null
     --and rgs.rand < 0.1
     and gs.trip_date = '2009-03-27'
     and dm.route_name in ('1','9','19')
     and dm.service_id='1'
       --and random() < 0.001""")
    print "Retrieving..."
    rows = cur.fetchall()
    cur.close()
    return rows
Beispiel #11
0
def compare_hour_of_weekday(rows=None):
  """Compares lateness distributions between hours of the weekday"""
  
  if rows is None:
    cur = db.get_cursor()
    sql = """
select scheduled_hour_of_arrival as hoa, lateness, trip_stop_weight
from datamining_table natural join trip_stop_weights
where lateness is not null and service_id='1'
"""
    print "Selecting..."
    db.SQLExec(cur,sql)
    print "Retrieving..."
    rows = cur.fetchall()
    cur.close()
    print len(rows),"rows retrieved."

  compare_ecdfs(('hoa',),rows)
Beispiel #12
0
def compare_routenames(rows=None):
  """Compare lateness distributions between route ID's"""
  
  if rows is None:
    cur = db.get_cursor()
    sql = """
select route_name, lateness, trip_stop_weight
from datamining_table natural join trip_stop_weights
where lateness is not null
"""
    print "Selecting..."
    db.SQLExec(cur,sql)
    print "Retrieving..."
    rows = cur.fetchall()
    cur.close()
    print len(rows),"rows retrieved."

  compare_ecdfs(('route_name',),rows)
Beispiel #13
0
def compare_dows(rows=None):
  """Compares lateness distributions between days of the week"""

  if rows is None:
    cur = db.get_cursor()
    sql = """
select date_part('dow',trip_date) as wday, lateness, trip_stop_weight 
from datamining_table 
  natural join trip_stop_weights
  natural join gps_segments
where lateness is not null
"""
    print "Selecting..."
    db.SQLExec(cur,sql);
    print "Retrieving..."
    rows = cur.fetchall()
    cur.close();
    print len(rows),"rows retrieved."

  compare_ecdfs(('wday',),rows)
  return rows
Beispiel #14
0
def independent_sampling_cdf(rows=None):
  
  if rows is None:
    cur = db.get_cursor();
    rows = []
    print "Selecting..."
    # For each gps_segment we want to randomly select a sample
    for i in range(58903):
      if (i+1)%(58903/100)==0:
        print " %d/%d"%(i+1,58903)
      if random() < 0.5: 
        continue
      db.SQLExec(cur,
"""select lateness,trip_stop_weight 
   from datamining_table dm natural join trip_stop_weights tsw
     where gps_segment_id=%(gseg)s and lateness is not null 
     order by random() limit 1""",
                 {'gseg':i});
      if cur.rowcount > 0:
        srow = cur.fetchall()[0]; #should be just one
        rows.append(srow)
    cur.close()
    print len(rows),"rows retrieved."

  try:
    data = array([(r['lateness'],r['trip_stop_weight']) for r in rows])
    x,p,a_n = ecdf(data,weighted=True)
  
    figure()
    plot(x,p,'k',label="Independent")
    plot(x,p+a_n,'k--',label=None)
    plot(x,p-a_n,'k--',label=None)
    xlabel("lateness")
    ylabel("CDF(lateness)")
    title("CDF of Lateness for Independent Samples")
  except e:
    print e

  return rows
Beispiel #15
0
    def fill_unique_service_combinations(self, dbconn, commit):
        """
    Throughout all dates which have service IDs in effect, this method
    finds every unique combination of service IDs that are in effect
    simultaneously. The service_combinations table is then populated with
    a 1-to-many map from combination_id to service_id where each combination_id
    represents a unique combination of service IDs. If a matching combination_id
    already exists, then it is left alone.
    """

        cur = db.get_cursor()

        ## Find all unique combos in effective service dates, put new
        ## ones in db

        db.SQLExec(
            cur, """select min(start_date), max(end_date) from 
                    (select start_date, end_date from gtf_calendar 
                    union select date as start_date, date as end_date 
                    from gtf_calendar_dates) as t""")

        min_date, max_date = cur.fetchone()[0:2]
        one_day = timedelta(days=1)

        day = min_date
        if isinstance(day, basestring):
            day = ServiceDateHandler.parseDate(day)
            max_date = ServiceDateHandler.parseDate(max_date)

        while day <= max_date:
            service_ids = tuple(self.effective_service_ids(day))

            # If it already exists, don't do anything
            if self.existing_combos.has_key(service_ids):
                pass
            # Otherwise, put it in the db
            else:
                db.SQLExec(
                    cur, """insert into service_combo_ids values (DEFAULT)""")
                db.SQLExec(
                    cur,
                    """select currval('service_combo_ids_combination_id_seq')"""
                )
                combo_id = int(cur.fetchone()[0])

                self.existing_combos[service_ids] = combo_id
                self.combos[combo_id] = service_ids

                insert_sql = """insert into service_combinations (combination_id,
                        service_id) values (%(combo)s,%(service)s)"""

                if __debug__:
                    db.SQLExec(
                        cur, """select count(*) from gtf_trips 
                         where service_id in ('%(sids)s')""" %
                        {'sids': "','".join(service_ids)})
                    print "======== Creating Combo ========="
                    print "ID:", combo_id
                    print "Service IDs:", service_ids
                    print "Trips with ID:", cur.fetchone()[0]
                    print "================================="
                    print

                for service_id in service_ids:
                    db.SQLExec(cur, insert_sql, {
                        'combo': str(combo_id),
                        'service': service_id
                    })

            day = day + one_day

        cur.close()
        if commit:
            dbconn.commit()
Beispiel #16
0
def rows_to_data_arrive(rows):
    return array([(r['lateness_arrive'], r['trip_stop_weight_arrive'])
                  for r in rows if r['lateness_arrive'] is not None])


def rows_to_data_depart(rows):
    return array([(r['lateness_depart'], r['trip_stop_weight_depart'])
                  for r in rows if r['lateness_depart'] is not None])


## Initial retrieval and sorting of data

# Grab data needed
cur = db.get_cursor()
db.SQLExec(cur, Stats.comparison_sql)
rows = cur.fetchall()
cur.close()

# Service ID split
sids = split(('service_id', ), rows)
weekday_rows = sids[('1', )]
saturday_rows = sids[('2', )]
sunday_rows = sids[('3', )]
weekend_rows = saturday_rows + sunday_rows

# DoW split
dows = split(('wday', ), rows)

# Hour of day splits
weekday_hoas = split(('hoa', ), weekday_rows)