def multiple_op(M, O=None, gamma=None, tol_perc=1e-06):
    # Try several values of eps_ratio and keep the solution with the lowest
    # objective value that also satisfies the constraint.
    best_eps = None
    best_L = None
    best_C = None
    best_term_crit = None
    best_k = None
    best_obj = float("inf")

    for eps_ratio in [2, 5, 10, 20, 30, 50]:
        try:
            logMsg("Trying eps=%d" % eps_ratio)
            (L, C, term_crit, k) = opursuit(M, O=O, gamma=gamma,
                                            tol_perc=tol_perc,
                                            eps_ratio=eps_ratio)
            obj = obj_func(L, C, gamma)
            logMsg("%f after %d" % (obj, k))

            if constraint(L, C, M, O, tol_perc):
                if obj < best_obj:
                    best_obj = obj
                    best_eps = eps_ratio
                    best_L = L
                    best_C = C
                    best_term_crit = term_crit
                    best_k = k
            else:
                logMsg("$$$$$$ Not satisfied at %d" % eps_ratio)
        except Exception as e:
            # Don't let a failure at one eps_ratio abort the whole search
            logMsg("opursuit failed at eps=%d: %s" % (eps_ratio, e))
        sys.stdout.flush()

    if best_eps is None:
        logMsg("$$$$$$ No eps_ratio satisfied the constraint")
    else:
        logMsg("$$$$$$ Best eps: %d" % best_eps)
    sys.stdout.flush()

    return best_L, best_C, best_term_crit, best_k
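# Hedged usage sketch (illustration only, never called): multiple_op is assumed
# to take a numpy matrix M of pace values, an observation-indicator matrix O of
# the same shape, and the outlier-pursuit weight gamma. The shapes and the gamma
# heuristic below are assumptions for the example, not values from this project.
def _example_multiple_op_usage():
    import numpy as np
    M = np.matrix(np.random.rand(50, 30))   # e.g. links x time-slices, made-up sizes
    O = np.matrix(np.ones((50, 30)))        # 1 = observed entry, 0 = missing
    gamma = 1.0 / np.sqrt(max(M.shape))     # common heuristic weight (assumption)
    L, C, term_crit, k = multiple_op(M, O=O, gamma=gamma)
    return L, C, term_crit, k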
def drawFigure(filename, road_map, num_obs):
    logMsg("Writing " + filename)
    with open(filename, 'w') as f:
        csvw = csv.writer(f)
        csvw.writerow(['begin_node', 'end_node', 'begin_lat', 'begin_lon',
                       'end_lat', 'end_lon', 'avg_num_trips'])
        for begin_node_id, end_node_id in sorted(num_obs,
                                                 key=lambda x: num_obs[x],
                                                 reverse=True):
            if (begin_node_id, end_node_id) in road_map.links_by_node_id:
                begin_node = road_map.nodes_by_id[begin_node_id]
                end_node = road_map.nodes_by_id[end_node_id]
                csvw.writerow([begin_node_id, end_node_id,
                               begin_node.lat, begin_node.long,
                               end_node.lat, end_node.long,
                               num_obs[begin_node_id, end_node_id]])
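# Hedged usage sketch (illustration only): drawFigure expects a road_map object
# with nodes_by_id / links_by_node_id lookups and a dict num_obs keyed by
# (begin_node_id, end_node_id) whose values are average trip counts. The node
# ids, counts, and output path below are hypothetical.
def _example_draw_figure_usage(road_map):
    num_obs = {(101, 102): 5.4, (102, 103): 2.0}   # hypothetical link counts
    drawFigure('results/example_link_counts.csv', road_map, num_obs)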
def compute_all_link_counts(dates, pool=DefaultPool()):
    # Split the list and compute the link counts of all slices in parallel
    it = splitList(dates, pool._processes)
    num_obs_list = pool.map(compute_link_counts, it)

    # Merge the outputs by summing each link count
    merged_count_obs = defaultdict(float)
    for num_appearances in num_obs_list:
        for key in num_appearances:
            merged_count_obs[key] += num_appearances[key]

    # Divide the sums by the total number of dates, in order to get the average
    for key in merged_count_obs:
        merged_count_obs[key] /= len(dates)

    print "keys", len(merged_count_obs.keys())

    # Save the averaged link counts into the database
    db_main.connect("db_functions/database.conf")
    logMsg("Creating")
    db_travel_times.create_link_counts_table_new()
    logMsg("Saving")
    # TODO: check the number of arguments expected by save_link_counts_new
    db_travel_times.save_link_counts_new(merged_count_obs)
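# Hedged usage sketch (illustration only): compute_all_link_counts is assumed to
# be run once over the set of dates of interest so the averaged counts end up in
# the link-counts table. The date range and pool size below are assumptions for
# the example, not project settings.
def _example_compute_all_link_counts():
    from multiprocessing import Pool
    dates = list(dateRange(datetime(2012, 10, 21), datetime(2012, 11, 11)))
    compute_all_link_counts(dates, pool=Pool(4))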
def detectWindowedEvents(mahal_timeseries, zscore_timeseries, global_pace_timeseries,
                         out_file, window_size=6, threshold_quant=.95):
    logMsg("Detecting events at %d%% bound" % int(threshold_quant * 100))

    # Sort the keys of the timeseries chronologically
    sorted_dates = sorted(mahal_timeseries)

    # Generate the list of values of R(t)
    mahal_list = [mahal_timeseries[d] for d in sorted_dates]

    # Use the quantile to determine the threshold
    sorted_mahal = sorted(mahal_list)
    threshold = getQuantile(sorted_mahal, threshold_quant)

    # Get the expected global pace
    (expected_pace_timeseries, sd_pace_timeseries) = getExpectedPace(global_pace_timeseries)

    start_date = datetime(2010, 1, 1)
    end_date = datetime(2014, 1, 1)
    shift = timedelta(hours=window_size)

    # Scan the timeline in window-sized steps, merging consecutive windows that
    # cross the threshold into a single event
    prev_above_threshold = False
    current_event_start = None
    current_event_end = None
    eventList = []
    for date in dateRange(start_date, end_date, shift):
        if crossesThreshold(date, date + shift, mahal_timeseries, threshold):
            if not prev_above_threshold:
                current_event_start = date
            current_event_end = date + shift
            prev_above_threshold = True
        else:
            if prev_above_threshold:
                # The event just ended - compute its properties and record it
                start_key = keyFromDatetime(current_event_start)
                end_key = keyFromDatetime(current_event_end)
                event = computeEventProperties(start_key, end_key, mahal_timeseries,
                                               global_pace_timeseries,
                                               expected_pace_timeseries,
                                               zscore_timeseries,
                                               sorted_mahal=sorted_mahal,
                                               mahal_threshold=threshold)
                eventList.append(event)
            prev_above_threshold = False

    # Sort events by hours above threshold (column 5), in descending order
    eventList.sort(key=lambda x: x[5], reverse=True)

    # Write events to a CSV file
    with open(out_file, "w") as f:
        w = csv.writer(f)
        w.writerow(["start_date", "end_date", "max_mahal", "mahal_quant", "duration",
                    "hours_above_thresh", "max_pace_dev", "min_pace_dev", "worst_trip"])
        for event in eventList:
            [start_date, end_date, max_mahal, mahal_quant, duration, hours_above_thresh,
             max_pace_dev, min_pace_dev, worst_trip] = event
            formattedEvent = [start_date, end_date, "%.2f" % max_mahal,
                              "%.3f" % mahal_quant, duration, hours_above_thresh,
                              "%.2f" % max_pace_dev, "%.2f" % min_pace_dev, worst_trip]
            w.writerow(formattedEvent)

    return eventList
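# Hedged sketches of two helpers detectWindowedEvents relies on, which are
# defined elsewhere in this project. These are illustrative stand-ins only;
# the real getQuantile() and crossesThreshold() may be implemented differently.
def _sketch_getQuantile(sorted_vals, quant):
    # Value at the given quantile of an already-sorted list (nearest-rank)
    idx = min(int(quant * len(sorted_vals)), len(sorted_vals) - 1)
    return sorted_vals[idx]

def _sketch_crossesThreshold(start, end, mahal_timeseries, threshold):
    # True if any observation whose key falls in [start, end) exceeds threshold.
    # Assumes the timeseries keys compare consistently with keyFromDatetime output.
    start_key, end_key = keyFromDatetime(start), keyFromDatetime(end)
    return any(mahal_timeseries[k] > threshold
               for k in mahal_timeseries if start_key <= k < end_key)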
# Fragment (apparently from performEventDurationTest): score the fine-grained
# timeseries at one (window_size, threshold) setting and record the duration
# of the 2012-10-31 event.
events = detectWindowedEvents(mahal_timeseries_fine, zscore_timeseries,
                              global_pace_timeseries,
                              "results/link_20_normalize_events_windowed.csv",
                              window_size=window_size, threshold_quant=threshold)
duration = getEventDuration(events, "2012-10-31")
w.writerow(["fine", window_size, threshold, duration])


if __name__ == "__main__":
    #performEventDurationTest()

    # Coarse (OD-level) scores
    mahal_timeseries = readOutlierScores("results/outlier_scores.csv")
    global_pace_timeseries = readGlobalPace("4year_features")
    zscore_timeseries = readZScoresTimeseries("results/zscore.csv")
    detectWindowedEvents(mahal_timeseries, zscore_timeseries, global_pace_timeseries,
                         "results/events_windowed.csv",
                         window_size=8, threshold_quant=.95)

    # Fine (link-level, normalized) scores
    mahal_timeseries = readOutlierScores("results/link_20_normalize_outlier_scores.csv")
    global_pace_timeseries = readGlobalPace("4year_features")
    zscore_timeseries = readZScoresTimeseries("results/zscore.csv")
    detectWindowedEvents(mahal_timeseries, zscore_timeseries, global_pace_timeseries,
                         "results/link_20_normalize_events_windowed.csv",
                         window_size=8, threshold_quant=.95)

    logMsg("done")
w.writerows(events)


if __name__ == "__main__":
    find_most_common_event_durations('results/coarse_montecarlo.csv',
                                     'results/coarse_events_consensus.csv')
    find_most_common_event_durations('results/fine_montecarlo.csv',
                                     'results/fine_events_consensus.csv')

    if False:
        run_random_sims('results/coarse_features_imb20_k10_RPCAtune_10000000pcs_5percmiss_robust_outlier_scores.csv',
                        '4year_features')
        test_median()
        print beta_method_of_moments(.99, .03)
        print beta_method_of_moments(.95, .05)
        print beta_method_of_moments(.6, .05)

    if False:
        logMsg("Starting")
        run_sims_in_parallel('results/coarse_features_imb20_k10_RPCAtune_10000000pcs_5percmiss_robust_outlier_scores.csv',
                             '4year_features', 'results/coarse_montecarlo.csv')
        logMsg("Finished OD method")
        run_sims_in_parallel('results/link_features_imb20_k10_RPCAtune_10000000pcs_5percmiss_robust_outlier_scores.csv',
                             '4year_features', 'results/fine_montecarlo.csv')
        logMsg("Finished link-level method")
def load_pace_data(perc_data_threshold, pool=DefaultPool()):
    weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                     "Saturday", "Sunday"]

    # Connect to the database and get the available dates
    logMsg("Getting relevant dates.")
    db_main.connect("db_functions/database.conf")
    # dates = db_travel_times.get_available_dates()
    dates = list(dateRange(datetime(2014, 6, 1), datetime(2014, 7, 1)))

    # Only do this once for the whole dataset and store it in the
    # link_counts_chicago table:
    # logMsg("Computing consistent link set")
    # compute_all_link_counts(dates, pool=pool)

    logMsg("Loading consistent link set")
    consistent_link_set = load_consistent_link_set(dates, perc_data_threshold)
    if len(consistent_link_set) == 0:
        logMsg("Found 0 consistent links. Returning.")
        return
    else:
        print "len of consistent_link_set", len(consistent_link_set)
    db_main.close()

    logMsg("Generating vectors")

    # Initialize dictionaries
    pace_timeseries = {}
    pace_grouped = defaultdict(list)
    dates_grouped = defaultdict(list)
    weights_grouped = defaultdict(list)

    # Split the dates into several pieces and use parallel processing to load the
    # vectors for each of these dates. We will use a partial function to hold the
    # consistent_link_set constant across all dates
    it = splitList(dates, pool._processes)
    load_pace_vectors_consistent = partial(load_pace_vectors,
                                           consistent_link_set=consistent_link_set)
    list_of_lists = pool.map(load_pace_vectors_consistent, it)

    logMsg("Merging outputs.")
    # Flatten the vectors into one big list
    vects = [vect for vect_lst, weight_lst in list_of_lists for vect in vect_lst]
    weights = [weight for vect_lst, weight_lst in list_of_lists for weight in weight_lst]

    # Loop through all dates - one vector will be created for each one
    for i in xrange(len(dates)):
        date = dates[i]
        vect = vects[i]
        weight = weights[i]

        # Extract the date, hour of day, and day of week
        just_date = str(date.date())
        hour = date.hour
        weekday = weekday_names[date.weekday()]

        # Save the vector in the timeseries and into the (weekday, hour) groups
        pace_timeseries[(just_date, hour, weekday)] = vect
        pace_grouped[(weekday, hour)].append(vect)
        weights_grouped[(weekday, hour)].append(weight)
        dates_grouped[(weekday, hour)].append(just_date)

    # Assign trip names based on link ids
    trip_names = ["%d" % link_id for link_id in consistent_link_set]

    return (pace_timeseries, pace_grouped, weights_grouped, dates_grouped,
            trip_names, consistent_link_set)
def load_pace_data(perc_data_threshold=.95, pool=DefaultPool()):
    weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                     'Saturday', 'Sunday']

    # Connect to the database and get the available dates
    logMsg("Getting relevant dates.")
    db_main.connect('db_functions/database.conf')
    dates = db_travel_times.get_available_dates()
    #dates = list(dateRange(datetime(2012,10,21), datetime(2012,11,11)))

    #logMsg("Computing consistent link set")
    #compute_all_link_counts(dates, pool=pool)

    logMsg("Loading consistent link set")
    consistent_link_set = load_consistent_link_set(dates, perc_data_threshold)
    db_main.close()

    logMsg("Generating vectors")

    # Initialize dictionaries
    pace_timeseries = {}
    pace_grouped = defaultdict(list)
    dates_grouped = defaultdict(list)
    weights_grouped = defaultdict(list)

    # Split the dates into several pieces and use parallel processing to load the
    # vectors for each of these dates. We will use a partial function to hold the
    # consistent_link_set constant across all dates
    it = splitList(dates, pool._processes)
    load_pace_vectors_consistent = partial(load_pace_vectors,
                                           consistent_link_set=consistent_link_set)
    list_of_lists = pool.map(load_pace_vectors_consistent, it)

    logMsg("Merging outputs.")
    # Flatten the vectors into one big list
    vects = [vect for vect_lst, weight_lst in list_of_lists for vect in vect_lst]
    weights = [weight for vect_lst, weight_lst in list_of_lists for weight in weight_lst]

    # Loop through all dates - one vector will be created for each one
    for i in xrange(len(dates)):
        date = dates[i]
        vect = vects[i]
        weight = weights[i]

        # Extract the date, hour of day, and day of week
        just_date = str(date.date())
        hour = date.hour
        weekday = weekday_names[date.weekday()]

        # Save the vector in the timeseries
        pace_timeseries[(just_date, hour, weekday)] = vect

        # Save the vector into the (weekday, hour) group
        pace_grouped[(weekday, hour)].append(vect)
        weights_grouped[(weekday, hour)].append(weight)
        dates_grouped[(weekday, hour)].append(just_date)

    # Assign trip names based on node ids
    trip_names = ["%d-->%d" % (start, end) for (start, end) in consistent_link_set]

    return (pace_timeseries, pace_grouped, weights_grouped, dates_grouped,
            trip_names, consistent_link_set)
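# Hedged usage sketch (illustration only): how the outputs of load_pace_data
# might be consumed. The 0.95 threshold, the pool size, and the (weekday, hour)
# key chosen below are assumptions for the example, not project settings.
def _example_load_pace_data_usage():
    from multiprocessing import Pool
    (pace_timeseries, pace_grouped, weights_grouped, dates_grouped,
     trip_names, consistent_link_set) = load_pace_data(perc_data_threshold=.95,
                                                       pool=Pool(4))
    # All pace vectors observed on Wednesdays at 17:00, one per date
    wednesday_rush = pace_grouped[("Wednesday", 17)]
    print len(wednesday_rush), "vectors of dimension", len(trip_names)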
("?", "2013-10-12 01:00:00", 43), ("?", "2013-09-28 08:00:00", 37)] events = [] for event in events_data: (name, date_str, duration) = event start_date = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S") end_date = start_date + timedelta(hours=duration) events.append((name, start_date, end_date, duration)) event_durations = {} #k_vals = [7,8,9,10,15,20,25,30,35,40,45,50] k_vals = range(7, 51) for k in k_vals: event_file = 'results/coarse_events_k%d.csv' % k logMsg('Examining %s' % event_file) for event in events: event_durations[event, k] = 0 for event in events: print(event, k) (ev_name, ev_start, ev_end, ev_duration) = event for [ start_date, end_date, duration, max_mahal, max_pace_dev, min_pace_dev ] in events: #start_date = datetime.strptime(start_date_str, "%Y-%m-%d %H:%M:%S") #end_date = datetime.strptime(end_date_str, "%Y-%m-%d %H:%M:%S") duration = (end_date - start_date).total_seconds() / 3600 + 1