def test_hit_rate_1(self):
    """Hit rate should be 1.0 at every coverage level when the
    evaluation points are exactly the points the prediction was
    trained on (every real event falls on a hotspot)."""
    csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    raw_df = pd.read_csv(csv_path)
    data = ProcessData("SIEDCO", csv_path)
    stamped_df = data.add_timestamp(raw_df)
    date = '2018-01-01'
    dataset_dict = data.dataset_dict
    day_df = ProcessData.filter_by_date(stamped_df, dataset_dict, date, date)
    timed_pts, region = ProcessData.get_time_space_points(day_df, dataset_dict)
    # Build a naive counting-grid prediction from the day's events.
    kernel = open_cp.naive.CountingGridKernel(grid_width=150, region=region)
    kernel.data = timed_pts
    grid_prediction = kernel.predict()
    coverages = [2, 4, 6, 8, 10]
    hit_rates_default = prediction_metrics.measure_hit_rates(
        grid_prediction, timed_pts, coverages, 'default')
    hit_rates_ground_truth = prediction_metrics.measure_hit_rates(
        grid_prediction, timed_pts, coverages, 'ground_truth_coverage')
    # Self-evaluation: every coverage level must capture all events.
    self.assertEqual(hit_rates_default, {c: 1.0 for c in coverages})
    self.assertEqual(hit_rates_ground_truth, {0.46187915216703573: 1.0})
def test_make_counting_grid(self):
    """Test counting grid for a base "well-known" scenario.

    Loads the grid geometry from a previously saved SEPP prediction,
    builds a counting matrix from the SIEDCO events on that geometry,
    and checks offsets, cell size, shape and count extrema against
    known-good values.
    """
    ## Get grid prediction, to use size and region params.
    # Context manager guarantees the pickle file is closed even if
    # pickle.load raises — the original open()/close() pair leaked
    # the handle on error.
    with open(
            '/Users/anamaria/Desktop/dev/security_project/aggressive_behavior_model/pkl/experiment_seppexp_10_2_siedco_prediction.pkl',
            'rb') as infile:
        loaded_siedco = pickle.load(infile)
    grid = loaded_siedco['prediction'].values[0]
    ## Select points to represent on counting matrix.
    df = pd.read_csv(
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    data = ProcessData(
        "SIEDCO",
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    df_input = data.add_timestamp(df)
    timed_pts, _ = ProcessData.get_time_space_points(
        df_input, data.dataset_dict)
    counting_matrix = prediction_metrics.make_counting_grid(grid, timed_pts)
    # Geometry must match the saved prediction's region and cell size.
    self.assertEqual(counting_matrix.xoffset, 958645.8182116301)
    self.assertEqual(counting_matrix.yoffset, 904338.0678953262)
    self.assertEqual(counting_matrix.xsize, 150)
    self.assertEqual(counting_matrix.ysize, 150)
    self.assertEqual(counting_matrix._matrix.shape, (816, 343))
    # Known event-count extrema for this dataset snapshot.
    self.assertEqual(counting_matrix._matrix.max(), 357)
    self.assertEqual(counting_matrix._matrix.min(), 0)
def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
    """Fit a naive counting-grid model on the given training subset.

    Parameters
    ----------
    df_train_subset : DataFrame with the training events.
    dataset_dict : column-name mapping used to extract time/space points.
    grid_size : cell width for the counting grid.
    **kwargs : optional 'region' (open_cp.data.RectangularRegion) that
        overrides the region inferred from the training points; other
        keys (e.g. 'week_day') are accepted and ignored.

    Returns the kernel with its .data set to the training points
    (prediction is deferred to the caller).
    """
    train_pts, train_region = ProcessData.get_time_space_points(
        df_train_subset, dataset_dict)
    # Use kwargs.get so a missing 'region' simply keeps the inferred
    # region — kwargs['region'] raised KeyError when callers omitted it.
    region_override = kwargs.get('region')
    if isinstance(region_override, open_cp.data.RectangularRegion):
        train_region = region_override
    trained_model = naive.CountingGridKernel(grid_width=grid_size,
                                             region=train_region)
    trained_model.data = train_pts
    return trained_model
def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
    """Fit a SEPP (self-exciting point process) model on the subset.

    Parameters
    ----------
    df_train_subset : DataFrame with the training events.
    dataset_dict : column-name mapping used to extract time/space points.
    grid_size : cell size passed to the SEPP trainer.
    **kwargs : optional 'region' (open_cp.data.RectangularRegion) that
        overrides the region inferred from the training points.

    Returns the trained model with its .data set to the training points.
    """
    train_pts, train_region = ProcessData.get_time_space_points(
        df_train_subset, dataset_dict)
    # kwargs.get tolerates a missing 'region'; the original
    # kwargs['region'] raised KeyError when the caller omitted it.
    region_override = kwargs.get('region')
    if isinstance(region_override, open_cp.data.RectangularRegion):
        train_region = region_override
    trainer = seppexp.SEPPTrainer(region=train_region, grid_size=grid_size)
    trainer.data = train_pts
    # 50 EM iterations with the bias correction enabled.
    trained_model = trainer.train(iterations=50, use_corrected=True)
    trained_model.data = train_pts
    return trained_model
def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
    """Fit a KDE model (exponential time decay, Gaussian space kernel).

    Parameters
    ----------
    df_train_subset : DataFrame with the training events.
    dataset_dict : column-name mapping used to extract time/space points.
    grid_size : cell size for the KDE grid.
    **kwargs : optional 'region' (open_cp.data.RectangularRegion) that
        overrides the region inferred from the training points.

    Returns the configured KDE model with its .data set.
    """
    train_pts, train_region = ProcessData.get_time_space_points(
        df_train_subset, dataset_dict)
    # kwargs.get tolerates a missing 'region'; the original
    # kwargs['region'] raised KeyError when the caller omitted it.
    region_override = kwargs.get('region')
    if isinstance(region_override, open_cp.data.RectangularRegion):
        train_region = region_override
    trained_model = kde.KDE(region=train_region, grid_size=grid_size)
    # Exponential decay in time (scale 1), Gaussian kernel in space.
    trained_model.time_kernel = kde.ExponentialTimeKernel(1)
    trained_model.space_kernel = kde.GaussianBaseProvider()
    trained_model.data = train_pts
    return trained_model
def inner_loop_validation(self, model_name, grid_size, train_subset_dates, current_validation_date):
    """Train `model_name` on the given date window, then score it on
    `current_validation_date` in fixed-length hour intervals.

    Parameters:
        model_name: class name looked up via globals(); the class must
            expose train()/the interfaces used by ValidateModel below.
        grid_size: grid cell size forwarded to the model's train().
        train_subset_dates: dict with 'initial' and 'final' date bounds
            for the training subset.
        current_validation_date: datetime (date at midnight) to validate on.

    Returns:
        prediction_results: array built by ProcessData.fill_array, one
        element per interval: [interval start, interval end, averaged
        prediction, evaluation points].
    """
    # Dispatch: instantiate the model class by name from module globals.
    model_object = globals()[model_name]()
    df_train_subset = ProcessData.filter_by_date(self.df_train_validation,
                                                 self.dataset_dict,
                                                 train_subset_dates['initial'],
                                                 train_subset_dates['final'])
    # week_day conditions weekday-aware models (e.g. the weekday SEPP
    # variant); models that ignore it accept it via **kwargs.
    trained_model = model_object.train(df_train_subset,
                                       self.dataset_dict,
                                       grid_size,
                                       week_day=current_validation_date.strftime("%A"),
                                       region=self.region)
    print(len(trained_model.data.timestamps))
    ## TODO: save trained_model?
    ### validation
    interval_duration = 6  ## TODO: set this var as a parameter
    # Validation data is restricted to the single validation date.
    validation_dates = {'initial': current_validation_date, 'final': current_validation_date}
    df_validation = ProcessData.filter_by_date(self.df_train_validation,
                                               self.dataset_dict,
                                               validation_dates['initial'],
                                               validation_dates['final'])
    prediction_date = current_validation_date
    # flag_array/prediction_results implement fill_array's accumulator
    # protocol: flag True means "array not yet initialised".
    flag_array = True
    prediction_results = np.array([])
    # Walk the day in interval_duration-hour slices (0-6, 6-12, ...).
    for interval_hour_start in range(0, 24, interval_duration):
        initial_prediction_datetime = prediction_date + timedelta(hours=interval_hour_start)
        final_prediction_datetime = initial_prediction_datetime + timedelta(hours=interval_duration)
        if df_validation.empty:
            # if no points (e.g. crimes) are reported on data interval
            eval_pts = []
        else:
            validation_pts, _ = ProcessData.get_time_space_points(df_validation, self.dataset_dict)
            eval_pts = ValidateModel.select_timed_points(prediction_date,
                                                         interval_hour_start,
                                                         interval_duration,
                                                         validation_pts)
        # Predict hour by hour over the interval, then average the
        # hourly grids into a single interval-level prediction.
        prediction_by_hour = ValidateModel.predict_on_interval(initial_prediction_datetime,
                                                               interval_duration,
                                                               model_object,
                                                               trained_model)
        average_prediction = ValidateModel.interval_average_prediction(prediction_by_hour)
        element = np.array([initial_prediction_datetime, final_prediction_datetime,
                            average_prediction, eval_pts])
        flag_array, prediction_results = ProcessData.fill_array(flag_array,
                                                                prediction_results,
                                                                element)
    return prediction_results
def test_hit_rate_2(self):
    """Hit rate should be 0.0 at every coverage level when the
    evaluation events come from a different locality than the one the
    prediction was trained on (no event falls on a hotspot)."""
    csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    raw_df = pd.read_csv(csv_path)
    data = ProcessData("SIEDCO", csv_path)
    stamped_df = data.add_timestamp(raw_df)
    date = '2018-01-01'
    dataset_dict = data.dataset_dict
    day_df = ProcessData.filter_by_date(stamped_df, dataset_dict, date, date)
    # Train on SUBA events, evaluate on BOSA events: disjoint areas.
    suba_df = ProcessData.filter_by_field(day_df, 'LOCALIDAD', 'SUBA')
    bosa_df = ProcessData.filter_by_field(day_df, 'LOCALIDAD', 'BOSA')
    timed_pts, region = ProcessData.get_time_space_points(suba_df, dataset_dict)
    kernel = open_cp.naive.CountingGridKernel(grid_width=150, region=region)
    kernel.data = timed_pts
    grid_prediction = kernel.predict()
    coverages = [2, 4, 6, 8, 10]
    eval_pts, _ = ProcessData.get_time_space_points(bosa_df, dataset_dict)
    hit_rates_default = prediction_metrics.measure_hit_rates(
        grid_prediction, eval_pts, coverages, 'default')
    hit_rates_ground_truth = prediction_metrics.measure_hit_rates(
        grid_prediction, eval_pts, coverages, 'ground_truth_coverage')
    # No BOSA event can land on a SUBA-trained hotspot.
    self.assertEqual(hit_rates_default, {c: 0.0 for c in coverages})
    self.assertEqual(hit_rates_ground_truth, {0.6632653061224489: 0.0})
def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
    """Fit a weekday-conditioned SEPP model.

    Keeps only events whose weekday matches kwargs['week_day'] (a day
    name such as "Monday", as produced by strftime("%A")), then trains
    a SEPP model on that filtered subset.

    Parameters
    ----------
    df_train_subset : DataFrame with the training events; must carry a
        'TIME_STAMP' datetime column.
    dataset_dict : column-name mapping used to extract time/space points.
    grid_size : cell size passed to the SEPP trainer.
    **kwargs : 'week_day' (required) day-name filter; optional 'region'
        (open_cp.data.RectangularRegion) overriding the inferred region.

    Returns the trained model with its .data set to the training points.
    """
    # Work on a copy: the original assigned the 'weekday' column on the
    # caller's DataFrame, mutating shared state (and risking pandas'
    # SettingWithCopyWarning).
    df_train_subset = df_train_subset.copy()
    df_train_subset['weekday'] = df_train_subset['TIME_STAMP'].dt.day_name()
    df_train_subset = ProcessData.filter_by_field(df_train_subset, 'weekday',
                                                  kwargs['week_day'])
    train_pts, train_region = ProcessData.get_time_space_points(
        df_train_subset, dataset_dict)
    # kwargs.get tolerates a missing 'region'; kwargs['region'] raised
    # KeyError when the caller omitted it.
    region_override = kwargs.get('region')
    if isinstance(region_override, open_cp.data.RectangularRegion):
        train_region = region_override
    trainer = seppexp.SEPPTrainer(region=train_region, grid_size=grid_size)
    trainer.data = train_pts
    # 50 EM iterations with the bias correction enabled.
    trained_model = trainer.train(iterations=50, use_corrected=True)
    trained_model.data = train_pts
    return trained_model
def test_mse_1(self):
    """MSE must be 0 when the prediction is scored against the very
    events it was built from (prediction grid equals ground-truth grid)."""
    csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    raw_df = pd.read_csv(csv_path)
    data = ProcessData("SIEDCO", csv_path)
    stamped_df = data.add_timestamp(raw_df)
    timed_pts, region = ProcessData.get_time_space_points(
        stamped_df, data.dataset_dict)
    kernel = open_cp.naive.CountingGridKernel(grid_width=150, region=region)
    kernel.data = timed_pts
    grid_prediction = kernel.predict()
    # Self-evaluation: the ground-truth counting grid built from
    # timed_pts coincides with the prediction grid, so the error is 0.
    mse = prediction_metrics.mse(grid_prediction, timed_pts)
    self.assertEqual(mse, 0)