def test_make_counting_grid(self):
    """ Test counting grid for a base "well-known" scenario """
    ## Get grid prediction, to use its size and region params
    infile = open(
        '/Users/anamaria/Desktop/dev/security_project/aggressive_behavior_model/pkl/experiment_seppexp_10_2_siedco_prediction.pkl',
        'rb')
    loaded_siedco = pickle.load(infile)
    infile.close()
    grid = loaded_siedco['prediction'].values[0]
    ## Select points to represent on the counting matrix
    df = pd.read_csv(
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    data = ProcessData(
        "SIEDCO",
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    df_input = data.add_timestamp(df)
    timed_pts, _ = ProcessData.get_time_space_points(df_input, data.dataset_dict)
    counting_matrix = prediction_metrics.make_counting_grid(grid, timed_pts)
    self.assertEqual(counting_matrix.xoffset, 958645.8182116301)
    self.assertEqual(counting_matrix.yoffset, 904338.0678953262)
    self.assertEqual(counting_matrix.xsize, 150)
    self.assertEqual(counting_matrix.ysize, 150)
    self.assertEqual(counting_matrix._matrix.shape, (816, 343))
    self.assertEqual(counting_matrix._matrix.max(), 357)
    self.assertEqual(counting_matrix._matrix.min(), 0)
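# `make_counting_grid` is exercised above but its implementation is not shown
# here. A minimal sketch of the underlying idea: bin timed points into grid
# cells given the prediction's offsets and cell sizes. The attribute names
# mirror those asserted in the test; the helper itself and its exact
# row/column convention are assumptions, not the project's implementation.
import numpy as np

def counting_grid_sketch(xcoords, ycoords, xoffset, yoffset, xsize, ysize, shape):
    """Illustrative only: count events per grid cell."""
    matrix = np.zeros(shape, dtype=int)  # shape = (rows, cols), assumed (y cells, x cells)
    cols = ((np.asarray(xcoords) - xoffset) // xsize).astype(int)
    rows = ((np.asarray(ycoords) - yoffset) // ysize).astype(int)
    for row, col in zip(rows, cols):
        if 0 <= row < shape[0] and 0 <= col < shape[1]:
            matrix[row, col] += 1  # events outside the region are ignored
    return matrix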
def test_hit_rate_1(self):
    """ Test hit_rate=1 if all real events fall on hotspots """
    df = pd.read_csv(
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    data = ProcessData(
        "SIEDCO",
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    df_input = data.add_timestamp(df)
    date = '2018-01-01'
    dataset_dict = data.dataset_dict
    df_filtered = ProcessData.filter_by_date(df_input, dataset_dict, date, date)
    timed_pts, region = ProcessData.get_time_space_points(df_filtered,
                                                          data.dataset_dict)
    counting_kernel = open_cp.naive.CountingGridKernel(grid_width=150,
                                                       region=region)
    counting_kernel.data = timed_pts
    grid_prediction = counting_kernel.predict()
    coverages = [2, 4, 6, 8, 10]
    hit_rates_default = prediction_metrics.measure_hit_rates(
        grid_prediction, timed_pts, coverages, 'default')
    hit_rates_ground_truth = prediction_metrics.measure_hit_rates(
        grid_prediction, timed_pts, coverages, 'ground_truth_coverage')
    self.assertEqual(hit_rates_default,
                     {2: 1.0, 4: 1.0, 6: 1.0, 8: 1.0, 10: 1.0})
    self.assertEqual(hit_rates_ground_truth, {0.46187915216703573: 1.0})
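# `measure_hit_rates` lives in the project's prediction_metrics module and is
# not shown here. A minimal sketch of the standard hit-rate definition it
# appears to follow: cover the top coverage% highest-risk cells, then report
# the fraction of real events that fall inside them. All names below are
# illustrative assumptions, not the project's API.
import numpy as np

def hit_rate_sketch(risk_matrix, event_cell_indices, coverage_pct):
    """Illustrative only: fraction of events landing in the hotspots that
    cover the top coverage_pct% of cells by predicted risk."""
    flat = risk_matrix.ravel()
    n_covered = int(np.ceil(flat.size * coverage_pct / 100.0))
    hotspots = set(np.argsort(flat)[::-1][:n_covered])  # highest-risk cells
    if not event_cell_indices:
        return 0.0
    hits = sum(1 for cell in event_cell_indices if cell in hotspots)
    return hits / len(event_cell_indices)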
def run_single_validation(self, grid_size, validation_date, time_unit):
    data = ProcessData(self.dataset_info['name'], self.dataset_info['path'])
    df = data.get_formated_df()
    # Update the dataset dictionary on the experiment instance
    self.dataset_info['dict'] = data.dataset_dict
    df_filtered = ProcessData.filter_by_field(df, self.custom_filter['field'],
                                              self.custom_filter['value'])
    validation = ValidateModel(df_filtered, self.dataset_info['dict'],
                               time_unit, None)
    prediction_results = validation.inner_loop_validation(
        self.model, grid_size, self.train_dates,
        datetime.strptime(validation_date, '%Y-%m-%d'), self.metrics)
    return prediction_results
def inner_loop_validation(self, model_name, grid_size, train_subset_dates,
                          current_validation_date):
    model_object = globals()[model_name]()
    df_train_subset = ProcessData.filter_by_date(self.df_train_validation,
                                                 self.dataset_dict,
                                                 train_subset_dates['initial'],
                                                 train_subset_dates['final'])
    trained_model = model_object.train(
        df_train_subset, self.dataset_dict, grid_size,
        week_day=current_validation_date.strftime("%A"),
        region=self.region)
    print(len(trained_model.data.timestamps))
    ## TODO: save trained_model?

    ### Validation
    interval_duration = 6  ## TODO: set this var as a parameter
    validation_dates = {'initial': current_validation_date,
                        'final': current_validation_date}
    df_validation = ProcessData.filter_by_date(self.df_train_validation,
                                               self.dataset_dict,
                                               validation_dates['initial'],
                                               validation_dates['final'])
    prediction_date = current_validation_date
    flag_array = True
    prediction_results = np.array([])
    for interval_hour_start in range(0, 24, interval_duration):
        initial_prediction_datetime = prediction_date + timedelta(
            hours=interval_hour_start)
        final_prediction_datetime = initial_prediction_datetime + timedelta(
            hours=interval_duration)
        if df_validation.empty:
            # No points (e.g. crimes) reported in the validation interval
            eval_pts = []
        else:
            validation_pts, _ = ProcessData.get_time_space_points(
                df_validation, self.dataset_dict)
            eval_pts = ValidateModel.select_timed_points(
                prediction_date, interval_hour_start, interval_duration,
                validation_pts)
        prediction_by_hour = ValidateModel.predict_on_interval(
            initial_prediction_datetime, interval_duration, model_object,
            trained_model)
        average_prediction = ValidateModel.interval_average_prediction(
            prediction_by_hour)
        element = np.array([initial_prediction_datetime,
                            final_prediction_datetime,
                            average_prediction, eval_pts])
        flag_array, prediction_results = ProcessData.fill_array(
            flag_array, prediction_results, element)
    return prediction_results
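# The loop above splits each validation day into fixed 6-hour windows. A
# standalone illustration of the window boundaries it produces (purely
# illustrative, no project code involved):
from datetime import datetime, timedelta

prediction_date = datetime(2018, 1, 1)
interval_duration = 6
for interval_hour_start in range(0, 24, interval_duration):
    start = prediction_date + timedelta(hours=interval_hour_start)
    end = start + timedelta(hours=interval_duration)
    print(start, '->', end)
# 2018-01-01 00:00:00 -> 2018-01-01 06:00:00
# 2018-01-01 06:00:00 -> 2018-01-01 12:00:00
# 2018-01-01 12:00:00 -> 2018-01-01 18:00:00
# 2018-01-01 18:00:00 -> 2018-01-02 00:00:00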
def mse(grid_pred, real_events):
    """
    Computes the mean squared error between the prediction and the ground
    truth, i.e. the mean of the squared per-cell differences between the two
    normalized grids.

    :param grid_pred: An instance of :class:`GridPrediction`; its matrix
        attribute must be normalized.
    :param real_events: An instance of :class:`open_cp.data.TimedPoints`.

    :return: A non-negative floating point value.
    """
    grid_pred._matrix = ProcessData.normalize_matrix(grid_pred._matrix)
    counting_matrix = make_counting_grid(grid_pred, real_events)
    counting_matrix._matrix = ProcessData.normalize_matrix(
        counting_matrix._matrix)
    return mean_squared_error(grid_pred._matrix, counting_matrix._matrix)
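# Sanity check on the docstring: sklearn's mean_squared_error applied to two
# 2-D grids returns the mean (not the sum) of the squared cell differences,
# which is also what test_mse_match verifies against a manual computation.
# A tiny self-contained example:
import numpy as np
from sklearn.metrics import mean_squared_error

a = np.array([[0.0, 0.5], [1.0, 0.0]])
b = np.array([[0.0, 0.0], [1.0, 0.5]])
manual = ((a - b) ** 2).sum() / a.size  # mean of squared cell differences
assert np.isclose(mean_squared_error(a, b), manual)  # both equal 0.125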
def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
    df_train_subset['weekday'] = df_train_subset['TIME_STAMP'].dt.day_name()
    df_train_subset = ProcessData.filter_by_field(df_train_subset, 'weekday',
                                                  kwargs['week_day'])
    train_pts, train_region = ProcessData.get_time_space_points(
        df_train_subset, dataset_dict)
    if isinstance(kwargs['region'], open_cp.data.RectangularRegion):
        train_region = kwargs['region']
    trainer = seppexp.SEPPTrainer(region=train_region, grid_size=grid_size)
    trainer.data = train_pts
    trained_model = trainer.train(iterations=50, use_corrected=True)
    trained_model.data = train_pts
    return trained_model
def run_ncv_experiment(self, time_unit, grid_size, region):
    """
    Run nested cross-validation.

    :param region: An instance of :class:`open_cp.data.RectangularRegion`;
        if None, the region will be defined based on the training points.

    :return: An array with prediction results.
    """
    self.check_exp_params()
    data = ProcessData(self.dataset_info['name'], self.dataset_info['path'])
    df = data.get_formated_df()
    # Update the dataset dictionary on the experiment instance
    self.dataset_info['dict'] = data.dataset_dict
    dates_interval = {'initial': self.train_dates['initial'],
                      'final': self.validation_dates['final']}
    df_train_validation = ProcessData.select_data(df,
                                                  self.dataset_info['dict'],
                                                  self.custom_filter,
                                                  dates_interval)
    validation = ValidateModel(df_train_validation,
                               self.dataset_info['dict'], time_unit, region)
    prediction_results = validation.walk_fwd_chain(self.model, grid_size,
                                                   self.train_dates,
                                                   self.validation_dates,
                                                   self.metrics)
    return prediction_results
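# `walk_fwd_chain` is called above but not shown. A minimal, standalone sketch
# of the walk-forward idea its name suggests, assuming daily steps in which
# each validation day is predicted from a training window ending just before
# it; every name here is an illustrative assumption, not the project's API.
from datetime import date, timedelta

def walk_forward_days(train_start, train_end, validation_end):
    """Illustrative only: yield (train_interval, validation_day) pairs,
    extending the training window forward one day at a time."""
    day = train_end + timedelta(days=1)
    while day <= validation_end:
        yield (train_start, day - timedelta(days=1)), day
        day += timedelta(days=1)

for train_interval, val_day in walk_forward_days(date(2018, 1, 1),
                                                 date(2018, 1, 7),
                                                 date(2018, 1, 9)):
    print(train_interval, '->', val_day)
# (datetime.date(2018, 1, 1), datetime.date(2018, 1, 7)) -> 2018-01-08
# (datetime.date(2018, 1, 1), datetime.date(2018, 1, 8)) -> 2018-01-09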
def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
    train_pts, train_region = ProcessData.get_time_space_points(
        df_train_subset, dataset_dict)
    if isinstance(kwargs['region'], open_cp.data.RectangularRegion):
        train_region = kwargs['region']
    trained_model = naive.CountingGridKernel(grid_width=grid_size,
                                             region=train_region)
    trained_model.data = train_pts
    return trained_model
def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
    train_pts, train_region = ProcessData.get_time_space_points(
        df_train_subset, dataset_dict)
    if isinstance(kwargs['region'], open_cp.data.RectangularRegion):
        train_region = kwargs['region']
    trainer = seppexp.SEPPTrainer(region=train_region, grid_size=grid_size)
    trainer.data = train_pts
    trained_model = trainer.train(iterations=50, use_corrected=True)
    trained_model.data = train_pts
    return trained_model
def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
    train_pts, train_region = ProcessData.get_time_space_points(
        df_train_subset, dataset_dict)
    if isinstance(kwargs['region'], open_cp.data.RectangularRegion):
        train_region = kwargs['region']
    trained_model = kde.KDE(region=train_region, grid_size=grid_size)
    trained_model.time_kernel = kde.ExponentialTimeKernel(1)
    trained_model.space_kernel = kde.GaussianBaseProvider()
    trained_model.data = train_pts
    return trained_model
def test_mse_1(self):
    """ Test mse=0 if both matrices (prediction and ground truth) are equal """
    df = pd.read_csv(
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    data = ProcessData(
        "SIEDCO",
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    df_input = data.add_timestamp(df)
    timed_pts, region = ProcessData.get_time_space_points(df_input,
                                                          data.dataset_dict)
    counting_kernel = open_cp.naive.CountingGridKernel(grid_width=150,
                                                       region=region)
    counting_kernel.data = timed_pts
    grid_prediction = counting_kernel.predict()
    mse = prediction_metrics.mse(grid_prediction, timed_pts)
    self.assertEqual(mse, 0)
def test_mse_match(self):
    """ Test mse results match using different methods """
    infile = open(
        '/Users/anamaria/Desktop/dev/security_project/aggressive_behavior_model/pkl/experiment_seppexp_10_2_siedco_prediction.pkl',
        'rb')
    loaded_siedco = pickle.load(infile)
    infile.close()
    grid = loaded_siedco['prediction'].values[0]
    grid._matrix = ProcessData.normalize_matrix(grid._matrix)
    real_events = loaded_siedco['eval_pts'].values[0]
    mse_method_1 = prediction_metrics.mse(grid, real_events)
    counting_matrix = prediction_metrics.make_counting_grid(grid, real_events)
    counting_matrix._matrix = ProcessData.normalize_matrix(
        counting_matrix._matrix)
    mse_method_2 = np.sum((grid._matrix.astype("float") -
                           counting_matrix._matrix.astype("float")) ** 2)
    mse_method_2 /= float(grid._matrix.shape[0] * grid._matrix.shape[1])
    self.assertEqual(mse_method_1, mse_method_2)
def test_hit_rate_2(self):
    """ Test hit_rate=0 if no events fall on hotspots """
    df = pd.read_csv(
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    data = ProcessData(
        "SIEDCO",
        "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
    )
    df_input = data.add_timestamp(df)
    date = '2018-01-01'
    dataset_dict = data.dataset_dict
    df_input = ProcessData.filter_by_date(df_input, dataset_dict, date, date)
    # Predict from events in one locality, evaluate with events from another
    df_1 = ProcessData.filter_by_field(df_input, 'LOCALIDAD', 'SUBA')
    df_2 = ProcessData.filter_by_field(df_input, 'LOCALIDAD', 'BOSA')
    timed_pts, region = ProcessData.get_time_space_points(df_1,
                                                          data.dataset_dict)
    counting_kernel = open_cp.naive.CountingGridKernel(grid_width=150,
                                                       region=region)
    counting_kernel.data = timed_pts
    grid_prediction = counting_kernel.predict()
    coverages = [2, 4, 6, 8, 10]
    eval_pts, _ = ProcessData.get_time_space_points(df_2, data.dataset_dict)
    hit_rates_default = prediction_metrics.measure_hit_rates(
        grid_prediction, eval_pts, coverages, 'default')
    hit_rates_ground_truth = prediction_metrics.measure_hit_rates(
        grid_prediction, eval_pts, coverages, 'ground_truth_coverage')
    self.assertEqual(hit_rates_default,
                     {2: 0.0, 4: 0.0, 6: 0.0, 8: 0.0, 10: 0.0})
    self.assertEqual(hit_rates_ground_truth, {0.6632653061224489: 0.0})
class TestCase(unittest.TestCase):
    def setUp(self):
        dataset_name = 'SIEDCO'
        head_path = '/Users/anamaria/Desktop/dev/security_project/datasets/'
        file = 'deduplicate_siedco_10032020.csv'
        dataset_path = head_path + file
        self.my_data = ProcessData(dataset_name, dataset_path)

    def test_set_up(self):
        self.assertEqual(self.my_data.dataset_name, 'SIEDCO')
        self.assertEqual(
            self.my_data.dataset_path,
            '/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_10032020.csv'
        )
        self.assertEqual(self.my_data.dataset_dict, siedco_dict)

    def test_filter_by_date_case1(self):
        # case 1: date on interval, siedco
        df = pd.read_csv(self.my_data.dataset_path)
        df = self.my_data.add_timestamp(df)
        initial_date = '2018-01-01'
        final_date = '2018-01-01'
        dataset_dict = self.my_data.dataset_dict
        df_filtered = ProcessData.filter_by_date(df, dataset_dict,
                                                 initial_date, final_date)
        df_expected = df.loc[df['FECHA_HECHO'] == "2018-01-01"]
        self.assertEqual(len(df_filtered), len(df_expected))

    def test_filter_by_date_case2(self):
        # case 2: initial date out of available data, siedco
        df = pd.read_csv(self.my_data.dataset_path)
        df = self.my_data.add_timestamp(df)
        initial_date = '2021-01-01'
        final_date = '2021-01-02'
        dataset_dict = self.my_data.dataset_dict
        self.assertWarns(
            UserWarning, lambda: ProcessData.filter_by_date(
                df, dataset_dict, initial_date, final_date))

    def test_filter_by_date_case3(self):
        # case 3: date on interval, nuse sample
        dataset = {'name': 'NUSE', 'path': ''}
        self.my_data.dataset_name = 'NUSE'
        self.my_data.dataset_dict = self.my_data.set_dictionary()
        head_path = '/Users/anamaria/Desktop/dev/security_project/datasets/'
        file = 'verify_enrich_nuse_29112019.csv'
        df = pd.read_csv(head_path + file)
        df = self.my_data.add_timestamp(df)
        initial_date = '2018-01-01'
        final_date = '2018-01-01'
        dataset_dict = self.my_data.dataset_dict
        df_filtered = ProcessData.filter_by_date(df, dataset_dict,
                                                 initial_date, final_date)
        df_expected = df.loc[df['FECHA'] == "2018-01-01"]
        self.assertEqual(len(df_filtered), len(df_expected))

    def test_filter_by_date_case4(self):
        # case 4: date on interval, nuse full data
        dataset = {'name': 'NUSE', 'path': ''}
        self.my_data.dataset_name = 'NUSE'
        self.my_data.dataset_dict = self.my_data.set_dictionary()
        head_path = '/Users/anamaria/Desktop/dev/security_project/datasets/'
        file = 'verify_enrich_nuse_29112019.csv'
        df = pd.read_csv(head_path + file)
        df = self.my_data.add_timestamp(df)
        initial_date = '2018-01-01'
        final_date = '2018-01-01'
        dataset_dict = self.my_data.dataset_dict
        df_filtered = ProcessData.filter_by_date(df, dataset_dict,
                                                 initial_date, final_date)
        df_expected = df.loc[df['FECHA'] == "2018-01-01"]
        self.assertEqual(len(df_filtered), len(df_expected))

    def test_filter_by_date_case5(self):
        # case 5: date on interval, rnmc
        dataset = {'name': 'RNMC', 'path': ''}
        self.my_data.dataset_name = 'RNMC'
        self.my_data.dataset_dict = self.my_data.set_dictionary()
        head_path = '/Users/anamaria/Desktop/dev/security_project/datasets/'
        file = '06. verify_enrich_rnmc_12022020.csv'
        df = pd.read_csv(head_path + file)
        df = self.my_data.add_timestamp(df)
        initial_date = '2018-01-01'
        final_date = '2018-01-01'
        dataset_dict = self.my_data.dataset_dict
        df_filtered = ProcessData.filter_by_date(df, dataset_dict,
                                                 initial_date, final_date)
        df_expected = df.loc[df['FECHA'] == "2018-01-01"]
        self.assertEqual(len(df_filtered), len(df_expected))

    def test_filter_by_field_case1(self):
        # case 1: filter successful
        df = pd.read_csv(self.my_data.dataset_path)
        df_filtered = ProcessData.filter_by_field(df, 'LOCALIDAD', 'BOSA')
        self.assertEqual(df_filtered.LOCALIDAD.unique()[0], 'BOSA')

    def test_filter_by_field_case2(self):
        # case 2: filter successful, without field value
        df = pd.read_csv(self.my_data.dataset_path)
        df_filtered = ProcessData.filter_by_field(df, '', '')
        self.assertTrue(df_filtered.equals(df))

    def test_filter_by_field_case3(self):
        # case 3: error, field doesn't exist
        df = pd.read_csv(self.my_data.dataset_path)
        self.assertRaises(
            ValueError,
            lambda: ProcessData.filter_by_field(df, 'nombre', 'Pedro'))

    def test_filter_by_field_case4(self):
        # case 4: error, value doesn't exist
        df = pd.read_csv(self.my_data.dataset_path)
        self.assertRaises(
            ValueError,
            lambda: ProcessData.filter_by_field(df, 'LOCALIDAD', 'NORMANDIA'))

    def test_normalize_matrix(self):
        matrix = np.arange(1, 5)
        matrix = matrix.reshape(2, 2)
        matrix_expected = np.array([[0.25, 0.5], [0.75, 1]])
        matrix_normalized = ProcessData.normalize_matrix(matrix)
        self.assertTrue((matrix_normalized == matrix_expected).all())