Example #1
    def test_hit_rate_1(self):
        """ Test hit_rate=1 if all real events falls on hotspots """
        df = pd.read_csv(
            "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        )
        data = ProcessData(
            "SIEDCO",
            "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        )
        df_input = data.add_timestamp(df)
        date = '2018-01-01'
        dataset_dict = data.dataset_dict
        df_filtered = ProcessData.filter_by_date(df_input, dataset_dict, date,
                                                 date)

        timed_pts, region = ProcessData.get_time_space_points(
            df_filtered, data.dataset_dict)
        counting_kernel = open_cp.naive.CountingGridKernel(grid_width=150,
                                                           region=region)
        counting_kernel.data = timed_pts
        grid_prediction = counting_kernel.predict()

        coverages = [2, 4, 6, 8, 10]
        hit_rates_default = prediction_metrics.measure_hit_rates(
            grid_prediction, timed_pts, coverages, 'default')
        hit_rates_ground_truth = prediction_metrics.measure_hit_rates(
            grid_prediction, timed_pts, coverages, 'ground_truth_coverage')
        self.assertEqual(hit_rates_default, {
            2: 1.0,
            4: 1.0,
            6: 1.0,
            8: 1.0,
            10: 1.0
        })
        self.assertEqual(hit_rates_ground_truth, {0.46187915216703573: 1.0})
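For intuition, here is a minimal, self-contained sketch of what a coverage-based hit rate measures; the function and its parameters are assumptions for illustration, not the prediction_metrics API:

import numpy as np

def hit_rate(risk, event_cells, coverage_pct):
    """Fraction of events that fall in the top coverage_pct% riskiest cells.

    risk: 2D array of predicted risk per grid cell (a stand-in for the
          open_cp grid prediction's intensity matrix).
    event_cells: list of (row, col) grid indices of the observed events.
    """
    n_top = max(1, int(round(risk.size * coverage_pct / 100)))
    # Rank cells by risk, highest first; the top slice is the hotspot set
    flat_top = np.argsort(risk, axis=None)[::-1][:n_top]
    hotspots = set(zip(*np.unravel_index(flat_top, risk.shape)))
    hits = sum(1 for cell in event_cells if cell in hotspots)
    return hits / len(event_cells)

# All events inside the single riskiest cell -> hit rate 1.0, as in the test
risk = np.array([[0.9, 0.1], [0.0, 0.0]])
print(hit_rate(risk, [(0, 0), (0, 0)], 2))  # 1.0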
Example #2
    def test_make_counting_grid(self):
        """ Test counting grid for a base "well-known" scenario """
        ## Get a grid prediction, to reuse its size and region params
        with open(
                '/Users/anamaria/Desktop/dev/security_project/aggressive_behavior_model/pkl/experiment_seppexp_10_2_siedco_prediction.pkl',
                'rb') as infile:
            loaded_siedco = pickle.load(infile)
        grid = loaded_siedco['prediction'].values[0]

        ## Select the points to represent on the counting matrix
        df = pd.read_csv(
            "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        )
        data = ProcessData(
            "SIEDCO",
            "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        )
        df_input = data.add_timestamp(df)
        timed_pts, _ = ProcessData.get_time_space_points(
            df_input, data.dataset_dict)

        counting_matrix = prediction_metrics.make_counting_grid(
            grid, timed_pts)
        self.assertEqual(counting_matrix.xoffset, 958645.8182116301)
        self.assertEqual(counting_matrix.yoffset, 904338.0678953262)
        self.assertEqual(counting_matrix.xsize, 150)
        self.assertEqual(counting_matrix.ysize, 150)
        self.assertEqual(counting_matrix._matrix.shape, (816, 343))
        self.assertEqual(counting_matrix._matrix.max(), 357)
        self.assertEqual(counting_matrix._matrix.min(), 0)
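Conceptually, make_counting_grid bins the projected event coordinates into the cells of the prediction's grid. A rough numpy sketch under that assumption; the function name and parameters here are hypothetical, not the real API:

import numpy as np

def make_counting_matrix(xs, ys, xoffset, yoffset, xsize, ysize, shape):
    """Count events per grid cell; a conceptual stand-in for
    prediction_metrics.make_counting_grid (parameters are assumptions).

    xs, ys: projected event coordinates.
    xoffset, yoffset: coordinates of the grid's lower-left corner.
    xsize, ysize: cell width/height; shape: (rows, cols) of the grid.
    """
    cols = ((np.asarray(xs) - xoffset) // xsize).astype(int)
    rows = ((np.asarray(ys) - yoffset) // ysize).astype(int)
    matrix = np.zeros(shape, dtype=int)
    # Ignore points outside the region instead of wrapping around
    inside = (rows >= 0) & (rows < shape[0]) & (cols >= 0) & (cols < shape[1])
    np.add.at(matrix, (rows[inside], cols[inside]), 1)
    return matrix

m = make_counting_matrix([10, 10, 310], [5, 5, 5], 0, 0, 150, 150, (2, 3))
print(m)  # [[2 0 1]
          #  [0 0 0]]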
Example #3
    def run_single_validation(self, grid_size, validation_date, time_unit):
        data = ProcessData(self.dataset_info['name'], self.dataset_info['path'])
        df = data.get_formated_df()
        # Update the dataset dictionary on the experiment instance
        self.dataset_info['dict'] = data.dataset_dict
        df_filtered = ProcessData.filter_by_field(df, self.custom_filter['field'],
                                                  self.custom_filter['value'])
        validation = ValidateModel(df_filtered, self.dataset_info['dict'],
                                   time_unit, None)
        prediction_results = validation.inner_loop_validation(
            self.model, grid_size, self.train_dates,
            datetime.strptime(validation_date, '%Y-%m-%d'), self.metrics)
        return prediction_results
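Note that run_single_validation expects validation_date as a 'YYYY-MM-DD' string and converts it with datetime.strptime before handing it to the validation loop:

from datetime import datetime

# The same conversion run_single_validation applies to its argument
print(datetime.strptime('2018-01-01', '%Y-%m-%d'))  # 2018-01-01 00:00:00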
Example #4
    def run_ncv_experiment(self, time_unit, grid_size, region):
        """ Run nested-cross validation

        :region: An instance of :class: open_cp.data.RectangularRegion,
                 if 'None', the region will dfined based on training points

        :return: An array with prediction results
        """

        self.check_exp_params()
        data = ProcessData(self.dataset_info['name'], self.dataset_info['path'])
        df = data.get_formated_df()
        # Update the dataset dictionary on the experiment instance
        self.dataset_info['dict'] = data.dataset_dict
        dates_interval = {'initial': self.train_dates['initial'],
                          'final': self.validation_dates['final']}
        df_train_validation = ProcessData.select_data(
            df, self.dataset_info['dict'], self.custom_filter, dates_interval)
        validation = ValidateModel(df_train_validation,
                                   self.dataset_info['dict'], time_unit, region)
        prediction_results = validation.walk_fwd_chain(
            self.model, grid_size, self.train_dates, self.validation_dates,
            self.metrics)
        return prediction_results
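walk_fwd_chain itself is not shown here; as a rough illustration of walk-forward validation in general, this hypothetical generator slides a training window forward one step at a time (the real ValidateModel API may differ):

from datetime import date, timedelta

def walk_forward_windows(initial, final, train_days, step_days):
    """Yield (train_start, train_end, validation_day) triples for a
    walk-forward chain; a conceptual sketch, not the ValidateModel API.
    """
    current = initial + timedelta(days=train_days)
    while current <= final:
        yield (current - timedelta(days=train_days),
               current - timedelta(days=1), current)
        current += timedelta(days=step_days)

for window in walk_forward_windows(date(2018, 1, 1), date(2018, 1, 5), 2, 1):
    print(window)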
Example #5
    def test_mse_1(self):
        """ Test mse=0 if both matrices (prediction and ground truth) are equal """
        df = pd.read_csv(
            "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        )
        data = ProcessData(
            "SIEDCO",
            "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        )
        df_input = data.add_timestamp(df)
        timed_pts, region = ProcessData.get_time_space_points(
            df_input, data.dataset_dict)

        counting_kernel = open_cp.naive.CountingGridKernel(grid_width=150,
                                                           region=region)
        counting_kernel.data = timed_pts
        grid_prediction = counting_kernel.predict()
        mse = prediction_metrics.mse(grid_prediction, timed_pts)
        self.assertEqual(mse, 0)
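The test relies on the prediction grid and the counting grid built from the same points being identical. A minimal sketch of the metric itself, assuming prediction_metrics.mse compares two equally-shaped matrices element-wise:

import numpy as np

def grid_mse(prediction, ground_truth):
    """Mean squared error between two equally-shaped grid matrices;
    a sketch under the assumption that prediction_metrics.mse compares
    the prediction's intensity matrix with an event counting grid.
    """
    prediction = np.asarray(prediction, dtype=float)
    ground_truth = np.asarray(ground_truth, dtype=float)
    return np.mean((prediction - ground_truth) ** 2)

print(grid_mse([[1, 0], [0, 2]], [[1, 0], [0, 2]]))  # 0.0, as in the test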
Example #6
    def test_hit_rate_2(self):
        """ Test hit_rate=0 if no events falls on hotspots """
        df = pd.read_csv(
            "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        )
        data = ProcessData(
            "SIEDCO",
            "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        )
        df_input = data.add_timestamp(df)
        date = '2018-01-01'
        dataset_dict = data.dataset_dict
        df_input = ProcessData.filter_by_date(df_input, dataset_dict, date,
                                              date)
        df_1 = ProcessData.filter_by_field(df_input, 'LOCALIDAD', 'SUBA')
        df_2 = ProcessData.filter_by_field(df_input, 'LOCALIDAD', 'BOSA')

        timed_pts, region = ProcessData.get_time_space_points(
            df_1, data.dataset_dict)
        counting_kernel = open_cp.naive.CountingGridKernel(grid_width=150,
                                                           region=region)
        counting_kernel.data = timed_pts
        grid_prediction = counting_kernel.predict()

        coverages = [2, 4, 6, 8, 10]
        eval_pts, _ = ProcessData.get_time_space_points(
            df_2, data.dataset_dict)
        hit_rates_default = prediction_metrics.measure_hit_rates(
            grid_prediction, eval_pts, coverages, 'default')
        hit_rates_ground_truth = prediction_metrics.measure_hit_rates(
            grid_prediction, eval_pts, coverages, 'ground_truth_coverage')
        self.assertEqual(hit_rates_default, {
            2: 0.0,
            4: 0.0,
            6: 0.0,
            8: 0.0,
            10: 0.0
        })
        self.assertEqual(hit_rates_ground_truth, {0.6632653061224489: 0.0})
Example #7
    def setUp(self):
        dataset_name = 'SIEDCO'
        head_path = '/Users/anamaria/Desktop/dev/security_project/datasets/'
        file = 'deduplicate_siedco_10032020.csv'
        dataset_path = head_path + file
        self.my_data = ProcessData(dataset_name, dataset_path)