def splitDataByTimespans(datalist, timespan, dateinfoname="eval_date"):
    """Group records into consecutive time windows of length ``timespan``.

    Windows are produced by ``generateDateRange``, starting at the earliest
    date found in the data and stepping by ``timespan`` up to the latest date
    plus the module-level ``_day`` offset (presumably one day, so that the
    latest date itself falls inside a window -- confirm against ``_day``).

    Args:
        datalist: Re-iterable collection of records; each record is indexed
            with ``dateinfoname`` to obtain its date.
        timespan: Window length, passed unchanged to ``generateDateRange``
            (as ``step``) and to ``generateLaterDate``.
        dateinfoname: Key used to read the date from each record.

    Returns:
        A ``defaultdict(list)`` mapping each window start to the records
        whose date satisfies ``start <= date < generateLaterDate(start,
        timespan)``. Windows with no records get no key, and a record that
        matches no window is left out. An empty ``datalist`` yields an empty
        mapping.
    """
    print("Performing splitDataByTimespans")
    data_by_daterange = defaultdict(list)

    dates = [d[dateinfoname] for d in datalist]
    if not dates:
        # No dates means no earliest/latest bound to build windows from.
        print("Ending splitDataByTimespans")
        return data_by_daterange

    earliest_date = min(dates)
    latest_date = max(dates)
    daterange_list = generateDateRange(start=earliest_date,
                                       end=latest_date + _day,
                                       step=timespan)

    # Each window's end depends only on its start, so compute it once per
    # window instead of once per (record, window) pair.
    windows = [(t, generateLaterDate(t, timespan)) for t in daterange_list]

    for d in datalist:
        d_time = d[dateinfoname]
        for window_start, window_end in windows:
            if d_time >= window_start and d_time < window_end:
                data_by_daterange[window_start].append(d)
                # A record belongs to the first matching window only.
                break

    print("Ending splitDataByTimespans")
    return data_by_daterange
#train_len_sweep = ["4W"] #multi-option not fully implemented

# How much data each test set covers
test_len = "1D"
#test_len_sweep = ["1D","3D","7D"] #multi-option not fully implemented

# Spacing between the start dates of successive experiments.
#test_date_step = "1D"
# Current choice: advance by exactly one test length, so that test sets never
# overlap. Rough reasoning: there is little point in evaluating a model on
# 7 days of data if the model is going to be retrained 1 day later.
# This may be revisited if a compelling reason turns up, possibly by adding
# an option that overrides the step.
test_date_step = test_len

# Start date of every experiment
start_test_list = generateDateRange(
    earliest_test_date,
    latest_test_date,
    test_date_step,
)
# How many experiment start dates there are
total_num_exp_dates = len(start_test_list)

coverage_rate_sweep = [0.01, 0.02, 0.05, 0.10]

cell_sampling = 15 #!!! need to find where to use this, for rhs

# Knox statistic parameters (currently disabled)
#knox_space_bin_size = 100
#knox_space_bin_count = 5
#knox_space_bins = [(i*knox_space_bin_size,(i+1)*knox_space_bin_size) \
#                    for i in range(knox_space_bin_count)]
#print(knox_space_bins)
#knox_time_bin_size = 3
#knox_time_bin_count = 7
knox_tbin_num = 10 knox_sbins = makeBins(knox_sbin_size, knox_sbin_num) knox_tbins = makeBins(knox_tbin_size, knox_tbin_num) earliest_time = "2017-05-01" latest_time = "2017-08-02" time_step = "6M" time_len = "12M" date_ref = "".join(earliest_time[2:].split("-")) start_times = generateDateRange(earliest_time, latest_time, time_step) num_exp = len(start_times) print(start_times[0]) print(start_times[-1]) print("Num exp: {}".format(num_exp)) outfilebase = \ "knoxB_ssX_burg_sbin{}-{}_tbin{}-{}_iter{}_{}-{}_{}-{}.txt".format( knox_sbin_num, knox_sbin_size, knox_tbin_num,
# (i.e., not the full rectangular grid, only relevant cells) num_cells_region = len(cellcoordlist_region) obtain_reg_end_time = time.time() print("...obtained region.") print("Time: {}".format(obtain_reg_end_time - obtain_reg_start_time)) train_len = "4W" test_len = "1D" all_exp_results = [] test_data_counts = [] test_data_dates = [] exp_times = [] start_test_list = generateDateRange("2018-01-01", "2018-02-01", "1D") total_num_exp = len(start_test_list) for exp_index, start_test in enumerate(start_test_list): exp_start_time = time.time() if exp_index % 10 == 0: print("Running experiment {}/{}...".format(exp_index, total_num_exp)) # Declare time ranges of training and testing data end_train = start_test start_train = generateEarlierDate(end_train, train_len) end_test = generateLaterDate(start_test, test_len) test_data_dates.append(start_test)
# How much data each test set covers
test_len = "7D"
#test_len_sweep = ["1D","3D","7D"] #multi-option not fully implemented

# Spacing between the start dates of successive experiments.
#test_date_step = "1D"
# Current choice: advance by exactly one test length, so that test sets never
# overlap. Rough reasoning: there is little point in evaluating a model on
# 7 days of data if the model is going to be retrained 1 day later.
# This may be revisited if a compelling reason turns up, possibly by adding
# an option that overrides the step.
test_date_step = test_len

# Start date of every experiment
start_test_list = generateDateRange(
    start=earliest_test_date,
    end=latest_test_date,
    step=test_date_step,
)
# How many experiment start dates there are
total_num_exp_dates = len(start_test_list)

coverage_rate_sweep = [0.01, 0.02, 0.05, 0.10]

cell_sampling = 15 #!!! need to find where to use this, for rhs

# Knox statistic parameters (currently disabled)
#knox_space_bin_size = 100
#knox_space_bin_count = 5
#knox_space_bins = [(i*knox_space_bin_size,(i+1)*knox_space_bin_size) \
#                    for i in range(knox_space_bin_count)]
#print(knox_space_bins)
#knox_time_bin_size = 3
def _knox_table_lines(cell_value, num_tbins, num_sbins):
    """Return one text line per temporal bin, one entry per spatial bin.

    ``cell_value`` is called as ``cell_value(spatial_index, temporal_index)``;
    each result is converted with ``str`` and entries are joined by a single
    space.
    """
    return [
        " ".join(str(cell_value(s_idx, t_idx)) for s_idx in range(num_sbins))
        for t_idx in range(num_tbins)
    ]


def _write_knox_run(fout, start_time, end_time, time_len, num_events,
                    knox_sbins, knox_tbins, knox_sbin_num, knox_tbin_num,
                    knox_result):
    """Write the text record describing a single Knox run to ``fout``."""
    lines = [
        str(start_time),
        str(end_time),
        str(time_len),
        str(num_events),
        "Spatial bins (columns):",
        str(knox_sbins),
        "Temporal bins (rows):",
        str(knox_tbins),
        "Knox Statistics",
    ]
    lines.extend(
        _knox_table_lines(knox_result.statistic, knox_tbin_num, knox_sbin_num))

    lines.append("Monte Carlo Medians")
    lines.extend(
        _knox_table_lines(
            lambda s_idx, t_idx: statistics.median(
                knox_result.distribution(s_idx, t_idx)),
            knox_tbin_num,
            knox_sbin_num))

    lines.append("P Values")
    lines.extend(
        _knox_table_lines(knox_result.pvalue, knox_tbin_num, knox_sbin_num))

    # Every line is newline-terminated, and one extra blank line separates
    # this record from the next run's record.
    fout.write("\n".join(lines) + "\n\n")


def make_knox_info_file(
        datadir,
        in_csv_file_name,
        out_knox_file_name,
        geojson_file_name,
        crime_types,
        num_knox_iterations,
        knox_sbin_size,
        knox_sbin_num,
        knox_tbin_size,
        knox_tbin_num,
        earliest_exp_time,
        num_exp,
        time_step,
        time_len,
        csv_date_format="%m/%d/%Y %I:%M:%S %p",
        csv_longlat=False,
        csv_epsg=None,
        csv_infeet=True,
        csv_has_header=True,
        ):
    """Run a series of Knox computations and write the results to a text file.

    Crime events are loaded from a CSV file, restricted to the region read
    from a geojson file, and then processed once per time window. For each
    window the output file receives the window bounds, the window length, the
    event count, the spatial and temporal bins, and three tables (Knox
    statistics, Monte Carlo medians, p-values) with one row per temporal bin
    and one column per spatial bin. Progress and timings are printed.

    Args:
        datadir: Directory containing the input files; also where the output
            file is written. ``~`` is expanded.
        in_csv_file_name: Name of the input CSV file inside ``datadir``.
        out_knox_file_name: Name of the output file inside ``datadir``;
            overwritten if it exists.
        geojson_file_name: Name of the region geojson file inside ``datadir``.
        crime_types: Crime types to keep; parsed by ``splitCommaArgs``
            (presumably a comma-separated string -- confirm).
        num_knox_iterations: Passed to ``getKnoxResult``.
        knox_sbin_size: Spatial bin size, passed to ``makeBins``.
        knox_sbin_num: Number of spatial bins (table columns).
        knox_tbin_size: Temporal bin size, passed to ``makeBins``.
        knox_tbin_num: Number of temporal bins (table rows).
        earliest_exp_time: End of the first time window; the first window
            starts ``time_len`` earlier.
        num_exp: Number of time windows, passed to ``generateDateRange`` as
            ``num``.
        time_step: Step between successive window starts.
        time_len: Length of each time window.
        csv_date_format: Passed to ``loadGenericData`` as ``date_format_csv``.
        csv_longlat: Passed to ``loadGenericData`` as ``longlat``.
        csv_epsg: Passed to ``loadGenericData`` as ``epsg``.
        csv_infeet: Passed to ``loadGenericData`` as ``infeet``.
        csv_has_header: Passed to ``loadGenericData`` as ``has_header``.

    Returns:
        None. Results are written to the output file.
    """

    # Normalised and derived parameters

    # Normalised data directory
    datadir = os.path.expanduser(os.path.normpath(datadir))

    # Full paths to files
    in_csv_full_path = os.path.join(datadir, in_csv_file_name)
    geojson_full_path = os.path.join(datadir, geojson_file_name)

    # Set of relevant crime types in the data
    crime_type_set = set(splitCommaArgs(crime_types))

    # Spatial and temporal bandwidth bins
    knox_sbins = makeBins(knox_sbin_size, knox_sbin_num)
    knox_tbins = makeBins(knox_tbin_size, knox_tbin_num)

    # The first window ends at earliest_exp_time, so it starts time_len earlier
    earliest_start_time = generateEarlierDate(earliest_exp_time, time_len)
    print(f"First time window is from "
          f"{earliest_start_time} to {earliest_exp_time}")
    start_times = generateDateRange(start=earliest_start_time,
                                    step=time_step,
                                    num=num_exp)

    out_file_path = os.path.join(datadir, out_knox_file_name)
    print(f"outfile: {out_file_path}")

    # Obtain crime data points, and region polygon

    # Obtain all crimes (of relevant types) from input data
    points_crime = loadGenericData(in_csv_full_path,
                                   crime_type_set=crime_type_set,
                                   date_format_csv=csv_date_format,
                                   longlat=csv_longlat,
                                   epsg=csv_epsg,
                                   infeet=csv_infeet,
                                   has_header=csv_has_header)

    # Obtain polygon from geojson file (which should have been pre-processed)
    region_polygon = gpd.read_file(geojson_full_path).unary_union

    # Get subset of input crime that occurred within region
    points_crime_region = open_cp.geometry.intersect_timed_points(
        points_crime, region_polygon)

    total_num_events = len(points_crime_region.timestamps)

    print(f"Successfully obtained data, with {total_num_events} events.")

    # Do Knox runs and store info in file

    print(f"Opening file {out_file_path} for writing.")
    with open(out_file_path, "w") as fout:

        overall_started = time.time()
        for start_time in start_times:

            run_started = time.time()

            end_time = generateLaterDate(start_time, time_len)

            print(f"Time span: {start_time} to {end_time}")

            ### SELECT TRAINING DATA

            subset_started = time.time()
            print("Getting data subset...")
            # Get subset of data for training
            points_crime_region_train = getTimedPointsInTimeRange(
                points_crime_region, start_time, end_time)
            print(f"...Got data subset. ({time.time()-subset_started:.4f})")

            num_events = len(points_crime_region_train.timestamps)

            print(f"Number of events in timespan: {num_events}")

            knox_started = time.time()
            print("Calculating Knox...")
            knox_result = getKnoxResult(points_crime_region_train,
                                        num_knox_iterations,
                                        knox_sbins,
                                        knox_tbins)
            print(f"...Calculated Knox. ({time.time()-knox_started:.4f})")

            write_started = time.time()
            print(f"Writing to file {out_file_path} ...")
            _write_knox_run(fout, start_time, end_time, time_len, num_events,
                            knox_sbins, knox_tbins,
                            knox_sbin_num, knox_tbin_num,
                            knox_result)
            print(f"...Wrote to file. ({time.time()-write_started:.4f})")
            print(f"Time for this run: {time.time()-run_started:.4f}")

    print(f"Number of runs: {len(start_times)}")
    print(f"Number of bins per run: {len(knox_sbins) * len(knox_tbins)}")
    print(f"Overall time: {time.time()-overall_started:.4f}")