Esempio n. 1
0
def splitDataByTimespans(datalist, timespan, dateinfoname="eval_date"):
    """Group records into consecutive time windows, keyed by window start.

    The windows are produced by ``generateDateRange``, starting at the
    earliest date found in ``datalist`` and stepping by ``timespan`` up to
    one ``_day`` past the latest date. Each record is placed in the first
    window ``t`` for which ``t <= date < generateLaterDate(t, timespan)``.
    A record that matches no window is left out of the result.

    Args:
        datalist: Sequence of mapping-like records; it is iterated more
            than once, so it must not be a one-shot iterator.
        timespan: Window length, passed unchanged to ``generateDateRange``
            (as ``step``) and to ``generateLaterDate``.
        dateinfoname: Key under which each record stores its date.

    Returns:
        A ``defaultdict(list)`` mapping window start to the list of records
        in that window, in their original order. Windows without records
        have no key. An empty ``datalist`` yields an empty mapping.
    """
    print("Performing splitDataByTimespans")
    data_by_daterange = defaultdict(list)

    record_dates = [d[dateinfoname] for d in datalist]
    if not record_dates:
        # No dates means no range to build; previously this path raised
        # IndexError when indexing the empty date list.
        print("Ending splitDataByTimespans")
        return data_by_daterange

    earliest_date = min(record_dates)
    latest_date = max(record_dates)
    daterange_list = generateDateRange(start=earliest_date,
                                       end=latest_date + _day,
                                       step=timespan)

    # A window's end depends only on its start, so compute each end once
    # rather than once per (record, window) pair.
    windows = [(t, generateLaterDate(t, timespan)) for t in daterange_list]

    for d in datalist:
        d_time = d[dateinfoname]
        for window_start, window_end in windows:
            if window_start <= d_time < window_end:
                data_by_daterange[window_start].append(d)
                break
    print("Ending splitDataByTimespans")
    return data_by_daterange
Esempio n. 2
0
# Disabled: sweeping over several training lengths is not fully implemented.
#train_len_sweep = ["4W"] #multi-option not fully implemented
# Length of the testing window for each experiment.
# NOTE(review): span strings look like "<count><unit>" (D = days, W = weeks);
#  confirm the accepted format against generateDateRange.
test_len = "1D"
# Disabled: sweeping over several testing lengths is not fully implemented.
#test_len_sweep = ["1D","3D","7D"] #multi-option not fully implemented

# Time step between the start dates of consecutive experiments.
#test_date_step = "1D"
# The step is deliberately tied to the test length so that test sets do not
#  overlap. Rough reasoning: why evaluate a model on 7 days of data if the
#  model is about to be retrained 1 day later? This may change in the future
#  if a compelling reason appears, or an option to override it may be added.
test_date_step = test_len

# Start date of every experiment, from earliest_test_date to latest_test_date
#  in steps of test_date_step.
start_test_list = generateDateRange(earliest_test_date, latest_test_date,
                                    test_date_step)
# Number of distinct experiment start dates.
total_num_exp_dates = len(start_test_list)

# Coverage rates to evaluate.
# NOTE(review): presumably fractions (1%, 2%, 5%, 10%) of the region that a
#  model may flag; confirm against the evaluation code.
coverage_rate_sweep = [0.01, 0.02, 0.05, 0.10]

# TODO: not referenced yet; still need to find where this should be used
#  (intended "for rhs").
cell_sampling = 15  #!!! need to find where to use this, for rhs

# Knox statistic parameters (currently disabled).
#knox_space_bin_size = 100
#knox_space_bin_count = 5
#knox_space_bins = [(i*knox_space_bin_size,(i+1)*knox_space_bin_size) \
#                   for i in range(knox_space_bin_count)]
#print(knox_space_bins)
#knox_time_bin_size = 3
#knox_time_bin_count = 7
Esempio n. 3
0
# Number of temporal bins used for the Knox statistic.
knox_tbin_num = 10

# Spatial and temporal bins, each built by makeBins from a bin size and a
#  bin count.
knox_sbins = makeBins(knox_sbin_size, knox_sbin_num)
knox_tbins = makeBins(knox_tbin_size, knox_tbin_num)




# Range of window start dates, the step between starts, and the window length.
# NOTE(review): span strings look like "<count><unit>" (M presumably months);
#  confirm the accepted format against generateDateRange.
earliest_time = "2017-05-01"
latest_time = "2017-08-02"
time_step = "6M"
time_len = "12M"
# Compact YYMMDD tag derived from earliest_time: drop the century, then
#  remove the dashes ("2017-05-01" -> "170501").
date_ref = "".join(earliest_time[2:].split("-"))


# Start date of every experiment window.
start_times = generateDateRange(earliest_time, latest_time, time_step)

# Number of experiment windows.
num_exp = len(start_times)

# Report the first and last start dates and the number of experiments.
print(start_times[0])
print(start_times[-1])
print("Num exp: {}".format(num_exp))




outfilebase = \
    "knoxB_ssX_burg_sbin{}-{}_tbin{}-{}_iter{}_{}-{}_{}-{}.txt".format(
                                            knox_sbin_num, 
                                            knox_sbin_size, 
                                            knox_tbin_num, 
Esempio n. 4
0
# (i.e., not the full rectangular grid, only relevant cells)
num_cells_region = len(cellcoordlist_region)

# Report how long obtaining the region took (start time is set earlier).
obtain_reg_end_time = time.time()
print("...obtained region.")
print("Time: {}".format(obtain_reg_end_time - obtain_reg_start_time))

# Lengths of the training and testing windows.
# NOTE(review): span strings look like "<count><unit>" (W = weeks, D = days);
#  confirm the accepted format against the date helper functions.
train_len = "4W"
test_len = "1D"

# Accumulators filled in as experiments run.
all_exp_results = []
test_data_counts = []
test_data_dates = []
exp_times = []

# One experiment per start date in this range, stepping by one day.
start_test_list = generateDateRange("2018-01-01", "2018-02-01", "1D")

total_num_exp = len(start_test_list)
for exp_index, start_test in enumerate(start_test_list):

    # Wall-clock start of this experiment.
    exp_start_time = time.time()

    # Progress message on every 10th experiment (exp_index is 0-based).
    if exp_index % 10 == 0:
        print("Running experiment {}/{}...".format(exp_index, total_num_exp))

    # Declare time ranges of training and testing data: training covers the
    #  train_len span ending at start_test, and testing covers the test_len
    #  span beginning at start_test.
    end_train = start_test
    start_train = generateEarlierDate(end_train, train_len)
    end_test = generateLaterDate(start_test, test_len)

    # Record the start date of this experiment's test window.
    test_data_dates.append(start_test)
Esempio n. 5
0
# Length of the testing window for each experiment.
# NOTE(review): span strings look like "<count><unit>" (D = days); confirm
#  the accepted format against generateDateRange.
test_len = "7D"
# Disabled: sweeping over several testing lengths is not fully implemented.
#test_len_sweep = ["1D","3D","7D"] #multi-option not fully implemented

# Time step between the start dates of consecutive experiments.
#test_date_step = "1D"
# The step is deliberately tied to the test length so that test sets do not
#  overlap. Rough reasoning: why evaluate a model on 7 days of data if the
#  model is about to be retrained 1 day later? This may change in the future
#  if a compelling reason appears, or an option to override it may be added.
test_date_step = test_len

# Start date of every experiment, from earliest_test_date to latest_test_date
#  in steps of test_date_step.
start_test_list = generateDateRange(start=earliest_test_date,
                                    end=latest_test_date,
                                    step=test_date_step)
# Number of distinct experiment start dates.
total_num_exp_dates = len(start_test_list)

# Coverage rates to evaluate.
# NOTE(review): presumably fractions (1%, 2%, 5%, 10%) of the region that a
#  model may flag; confirm against the evaluation code.
coverage_rate_sweep = [0.01, 0.02, 0.05, 0.10]

# TODO: not referenced yet; still need to find where this should be used
#  (intended "for rhs").
cell_sampling = 15  #!!! need to find where to use this, for rhs

# Knox statistic parameters (currently disabled).
#knox_space_bin_size = 100
#knox_space_bin_count = 5
#knox_space_bins = [(i*knox_space_bin_size,(i+1)*knox_space_bin_size) \
#                   for i in range(knox_space_bin_count)]
#print(knox_space_bins)
#knox_time_bin_size = 3
Esempio n. 6
0
def make_knox_info_file(
    datadir,
    in_csv_file_name,
    out_knox_file_name,
    geojson_file_name,
    crime_types,
    num_knox_iterations,
    knox_sbin_size,
    knox_sbin_num,
    knox_tbin_size,
    knox_tbin_num,
    earliest_exp_time,
    num_exp,
    time_step,
    time_len,
    csv_date_format="%m/%d/%Y %I:%M:%S %p",
    csv_longlat=False,
    csv_epsg=None,
    csv_infeet=True,
    csv_has_header=True,
):
    """Run Knox tests over a series of time windows and write them to a file.

    Crime events are loaded from a CSV file, restricted to the region
    described by a geojson file, and then a Knox result is computed for each
    of ``num_exp`` time windows of length ``time_len``. The first window
    starts ``time_len`` before ``earliest_exp_time`` and later windows start
    ``time_step`` apart. Progress and timings are printed along the way.

    For every window the output file receives: the window start, the window
    end, ``time_len``, the event count, the spatial bins, the temporal bins,
    and three matrices (Knox statistics, Monte Carlo medians, p-values) with
    one row per temporal bin and one space-separated column per spatial bin,
    followed by a blank line.

    Args:
        datadir: Directory holding the input files and receiving the output;
            it is normalised and ``~`` is expanded.
        in_csv_file_name: Name of the crime CSV file inside ``datadir``.
        out_knox_file_name: Name of the output file inside ``datadir``; it is
            opened in write mode, so existing content is replaced.
        geojson_file_name: Name of the region geojson file in ``datadir``.
        crime_types: Crime types to keep, parsed by ``splitCommaArgs``.
        num_knox_iterations: Iteration count passed to ``getKnoxResult``.
        knox_sbin_size: Spatial bin size passed to ``makeBins``.
        knox_sbin_num: Number of spatial bins (matrix columns).
        knox_tbin_size: Temporal bin size passed to ``makeBins``.
        knox_tbin_num: Number of temporal bins (matrix rows).
        earliest_exp_time: End of the first time window.
        num_exp: Number of time windows to generate.
        time_step: Step between consecutive window starts.
        time_len: Length of each time window.
        csv_date_format: Forwarded to ``loadGenericData`` as
            ``date_format_csv``.
        csv_longlat: Forwarded to ``loadGenericData`` as ``longlat``.
        csv_epsg: Forwarded to ``loadGenericData`` as ``epsg``.
        csv_infeet: Forwarded to ``loadGenericData`` as ``infeet``.
        csv_has_header: Forwarded to ``loadGenericData`` as ``has_header``.

    Returns:
        None. The results are written to the output file.
    """

    # --- Derived paths and parameters ---

    # Normalise the data directory, then expand a leading "~"
    datadir = os.path.expanduser(os.path.normpath(datadir))
    in_csv_full_path = os.path.join(datadir, in_csv_file_name)
    geojson_full_path = os.path.join(datadir, geojson_file_name)

    # Crime types of interest, as a set
    crime_type_set = set(splitCommaArgs(crime_types))

    # Spatial and temporal bandwidth bins
    knox_sbins = makeBins(knox_sbin_size, knox_sbin_num)
    knox_tbins = makeBins(knox_tbin_size, knox_tbin_num)

    # The first window ends at earliest_exp_time, so it starts time_len earlier
    earliest_start_time = generateEarlierDate(earliest_exp_time, time_len)
    print("First time window is from "
          f"{earliest_start_time} to {earliest_exp_time}")
    start_times = generateDateRange(start=earliest_start_time,
                                    step=time_step,
                                    num=num_exp)

    out_file_path = os.path.join(datadir, out_knox_file_name)
    print(f"outfile: {out_file_path}")

    # --- Crime events and region polygon ---

    # All crimes of the relevant types from the input CSV
    points_crime = loadGenericData(in_csv_full_path,
                                   crime_type_set=crime_type_set,
                                   date_format_csv=csv_date_format,
                                   longlat=csv_longlat,
                                   epsg=csv_epsg,
                                   infeet=csv_infeet,
                                   has_header=csv_has_header)

    # Region polygon from the (pre-processed) geojson file
    region_polygon = gpd.read_file(geojson_full_path).unary_union

    # Keep only the crimes that fall inside the region
    points_crime_region = open_cp.geometry.intersect_timed_points(
        points_crime, region_polygon)

    total_num_events = len(points_crime_region.timestamps)
    print(f"Successfully obtained data, with {total_num_events} events.")

    # --- Knox runs, one per time window ---

    print(f"Opening file {out_file_path} for writing.")
    with open(out_file_path, "w") as fout:

        overall_t0 = time.time()
        for start_time in start_times:

            run_t0 = time.time()

            end_time = generateLaterDate(start_time, time_len)
            print(f"Time span: {start_time} to {end_time}")

            # Select the events inside this window
            subset_t0 = time.time()
            print("Getting data subset...")
            points_crime_region_train = getTimedPointsInTimeRange(
                points_crime_region, start_time, end_time)
            print(f"...Got data subset. ({time.time()-subset_t0:.4f})")

            num_events = len(points_crime_region_train.timestamps)
            print(f"Number of events in timespan: {num_events}")

            knox_t0 = time.time()
            print("Calculating Knox...")
            knox_result = getKnoxResult(points_crime_region_train,
                                        num_knox_iterations, knox_sbins,
                                        knox_tbins)
            print(f"...Calculated Knox. ({time.time()-knox_t0:.4f})")

            write_t0 = time.time()
            print(f"Writing to file {out_file_path} ...")

            # Header: one value or label per line
            header_lines = (
                str(start_time),
                str(end_time),
                str(time_len),
                str(num_events),
                "Spatial bins (columns):",
                str(knox_sbins),
                "Temporal bins (rows):",
                str(knox_tbins),
            )
            for header_line in header_lines:
                fout.write(header_line)
                fout.write("\n")

            # Each section is a title followed by a matrix with one row per
            # temporal bin and one column per spatial bin. Cell functions
            # take (spatial index, temporal index).
            sections = (
                ("Knox Statistics\n",
                 lambda s, t: knox_result.statistic(s, t)),
                ("Monte Carlo Medians\n",
                 lambda s, t: statistics.median(
                     knox_result.distribution(s, t))),
                ("P Values\n",
                 lambda s, t: knox_result.pvalue(s, t)),
            )
            for title, cell_value in sections:
                fout.write(title)
                for t_idx in range(knox_tbin_num):
                    row_cells = [
                        str(cell_value(s_idx, t_idx))
                        for s_idx in range(knox_sbin_num)
                    ]
                    fout.write(" ".join(row_cells))
                    fout.write("\n")

            # Blank line separates consecutive runs
            fout.write("\n")
            print(f"...Wrote to file. ({time.time()-write_t0:.4f})")
            print(f"Time for this run: {time.time()-run_t0:.4f}")

    print(f"Number of runs: {len(start_times)}")
    print(f"Number of bins per run: {len(knox_sbins) * len(knox_tbins)}")
    print(f"Overall time: {time.time()-overall_t0:.4f}")