import csv
from collections import defaultdict
from datetime import datetime, timedelta
from functools import partial

# Project-specific helpers (keyFromDatetime, dateRange, logMsg, getQuantile, getExpectedPace,
# computeEventProperties, load_consistent_link_set, load_pace_vectors, splitList, DefaultPool)
# and the database modules (db_main, db_travel_times) are assumed to be imported from
# elsewhere in this repository.


def crossesThreshold(start_date, end_date, mahal_timeseries, threshold):
    # Returns True if any hourly value of R(t) within [start_date, end_date)
    # exceeds the threshold, and False otherwise.
    for key in [keyFromDatetime(d) for d in dateRange(start_date, end_date, timedelta(hours=1))]:
        if(key in mahal_timeseries and mahal_timeseries[key] > threshold):
            return True
    return False
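# --- Illustrative sketch (not part of the original source) -----------------------------
# crossesThreshold relies on two helpers, dateRange and keyFromDatetime, defined elsewhere
# in this repository. The stand-ins below show the behavior the function appears to assume;
# the key format in particular is a guess and the real implementation may differ.
def dateRange(start_date, end_date, step):
    # Yield datetimes from start_date (inclusive) up to end_date (exclusive),
    # advancing by the given timedelta.
    d = start_date
    while d < end_date:
        yield d
        d += step

def keyFromDatetime(d):
    # Hypothetical key format - the project may key its timeseries dictionaries
    # with a different string format or a tuple.
    return d.strftime("%Y-%m-%d %H:%M:%S")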
def detectWindowedEvents(mahal_timeseries, zscore_timeseries, global_pace_timeseries,
                         out_file, window_size=6, threshold_quant=.95):
    logMsg("Detecting events at %d%% bound" % int(threshold_quant * 100))

    # Sort the keys of the timeseries chronologically
    sorted_dates = sorted(mahal_timeseries)

    # Generate the list of values of R(t)
    mahal_list = [mahal_timeseries[d] for d in sorted_dates]

    # Use the quantile to determine the threshold
    sorted_mahal = sorted(mahal_list)
    threshold = getQuantile(sorted_mahal, threshold_quant)

    # Get the expected global pace
    (expected_pace_timeseries, sd_pace_timeseries) = getExpectedPace(global_pace_timeseries)

    start_date = datetime(2010, 1, 1)
    end_date = datetime(2014, 1, 1)
    shift = timedelta(hours=window_size)

    prev_above_threshold = False
    current_event_start = None
    current_event_end = None
    eventList = []
    for date in dateRange(start_date, end_date, shift):
        if(crossesThreshold(date, date + shift, mahal_timeseries, threshold)):
            if(not prev_above_threshold):
                # A new event begins at the start of this window
                current_event_start = date
            # Extend the current event through the end of this window
            current_event_end = date + shift
            prev_above_threshold = True
        else:
            if(prev_above_threshold):
                # The event has just ended - compute its properties and record it
                start_key = keyFromDatetime(current_event_start)
                end_key = keyFromDatetime(current_event_end)
                event = computeEventProperties(start_key, end_key, mahal_timeseries,
                                               global_pace_timeseries, expected_pace_timeseries,
                                               zscore_timeseries, sorted_mahal=sorted_mahal,
                                               mahal_threshold=threshold)
                # Add to list
                eventList.append(event)
            prev_above_threshold = False

    # Sort events by hours above threshold (index 5), in descending order
    eventList.sort(key=lambda x: x[5], reverse=True)

    # Write events to a CSV file
    with open(out_file, "wb") as f:
        w = csv.writer(f)
        w.writerow(["start_date", "end_date", "max_mahal", "mahal_quant", "duration",
                    "hours_above_thresh", "max_pace_dev", "min_pace_dev", "worst_trip"])
        for event in eventList:
            [start_date, end_date, max_mahal, mahal_quant, duration, hours_above_thresh,
             max_pace_dev, min_pace_dev, worst_trip] = event
            formattedEvent = [start_date, end_date, "%.2f" % max_mahal, "%.3f" % mahal_quant,
                              duration, hours_above_thresh, "%.2f" % max_pace_dev,
                              "%.2f" % min_pace_dev, worst_trip]
            w.writerow(formattedEvent)

    return eventList
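# --- Illustrative sketch (not part of the original source) -----------------------------
# getQuantile is another project helper, used above to turn the sorted R(t) values into a
# detection threshold. A minimal nearest-rank version is sketched below; the repository's
# own implementation may interpolate differently.
def getQuantile(sorted_vals, quant):
    # Nearest-rank empirical quantile over an already-sorted (ascending) non-empty list.
    # quant is expected to lie in [0, 1].
    idx = int(round(quant * (len(sorted_vals) - 1)))
    return sorted_vals[idx]

# With the helpers in place, the detector can be driven end to end, e.g.:
#   events = detectWindowedEvents(mahal_ts, zscore_ts, pace_ts,
#                                 "events_windowed.csv",  # hypothetical output path
#                                 window_size=6, threshold_quant=.95)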
def load_pace_data(perc_data_threshold, pool=DefaultPool()):
    weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday",
                     "Friday", "Saturday", "Sunday"]

    # Connect to the database and get the available dates
    logMsg("Getting relevant dates.")
    db_main.connect("db_functions/database.conf")
    # dates = db_travel_times.get_available_dates()
    dates = list(dateRange(datetime(2014, 6, 1), datetime(2014, 7, 1)))

    # Only do once for the whole dataset and store in the link_counts_chicago table:
    # logMsg("Computing consistent link set")
    # compute_all_link_counts(dates, pool=pool)

    logMsg("Loading consistent link set")
    consistent_link_set = load_consistent_link_set(dates, perc_data_threshold)
    if len(consistent_link_set) == 0:
        logMsg("Found 0 consistent links. Returning.")
        return
    else:
        print ("len of consistent_link_set", len(consistent_link_set))

    db_main.close()

    logMsg("Generating vectors")

    # Initialize dictionaries
    pace_timeseries = {}
    pace_grouped = defaultdict(list)
    dates_grouped = defaultdict(list)
    weights_grouped = defaultdict(list)

    # Split the dates into several pieces and use parallel processing to load the
    # vectors for each of these dates. We will use a partial function to hold the
    # consistent_link_set constant across all dates
    it = splitList(dates, pool._processes)
    load_pace_vectors_consistent = partial(load_pace_vectors, consistent_link_set=consistent_link_set)
    list_of_lists = pool.map(load_pace_vectors_consistent, it)

    logMsg("Merging outputs.")
    # Flatten the vectors into one big list
    vects = [vect for vect_lst, weight_lst in list_of_lists for vect in vect_lst]
    weights = [weight for vect_lst, weight_lst in list_of_lists for weight in weight_lst]

    # Loop through all dates - one vector will be created for each one
    for i in xrange(len(dates)):
        date = dates[i]
        vect = vects[i]
        weight = weights[i]

        # Extract the date, hour of day, and day of week
        just_date = str(date.date())
        hour = date.hour
        weekday = weekday_names[date.weekday()]

        # Save the vector into the timeseries
        pace_timeseries[(just_date, hour, weekday)] = vect

        # Save the vector into the (weekday, hour) group
        pace_grouped[(weekday, hour)].append(vect)
        weights_grouped[(weekday, hour)].append(weight)
        dates_grouped[(weekday, hour)].append(just_date)

    # print len(pace_grouped[(0, 0)]), len(pace_grouped[(0, 0)][0])
    # (stale debug print from the constant-key experiment; would raise an IndexError
    # now that the groups are keyed by (weekday, hour))

    # Assign trip names based on link ids
    trip_names = ["%d" % link_id for link_id in consistent_link_set]

    return (pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names,
            consistent_link_set)
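# --- Illustrative sketch (not part of the original source) -----------------------------
# splitList and DefaultPool are project utilities not shown in this excerpt. The stand-ins
# below capture only the interface load_pace_data appears to rely on (a _processes count
# and a map() method); the names and details are assumptions, not the real implementations.
def splitList(items, num_chunks):
    # Split items into at most num_chunks roughly equal contiguous chunks,
    # suitable for handing to pool.map().
    chunk_size = (len(items) + num_chunks - 1) // num_chunks
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

class DefaultPool(object):
    # Serial fallback exposing the same interface as a multiprocessing pool:
    # a _processes attribute and a map() method.
    _processes = 1

    def map(self, func, iterable):
        return list(map(func, iterable))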