Ejemplo n.º 1
0
    def test_hit_rate_1(self):
        """Hit rate must be 1.0 at every coverage when the evaluation
        points are the same points the prediction was built from.

        Builds a naive counting-grid prediction from one day of SIEDCO
        data and evaluates it against those identical points, so both the
        'default' and 'ground_truth_coverage' strategies must report a
        perfect hit rate.
        """
        # NOTE(review): hard-coded absolute path — consider a fixture or
        # environment variable so the suite runs on other machines.
        csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        df = pd.read_csv(csv_path)
        data = ProcessData("SIEDCO", csv_path)
        df_input = data.add_timestamp(df)
        date = '2018-01-01'
        dataset_dict = data.dataset_dict
        # Restrict to a single day (initial == final date).
        df_filtered = ProcessData.filter_by_date(df_input, dataset_dict, date,
                                                 date)

        # Simple counting kernel over a 150-unit grid as the predictor.
        timed_pts, region = ProcessData.get_time_space_points(
            df_filtered, data.dataset_dict)
        counting_kernel = open_cp.naive.CountingGridKernel(grid_width=150,
                                                           region=region)
        counting_kernel.data = timed_pts
        grid_prediction = counting_kernel.predict()

        coverages = [2, 4, 6, 8, 10]
        hit_rates_default = prediction_metrics.measure_hit_rates(
            grid_prediction, timed_pts, coverages, 'default')
        hit_rates_ground_truth = prediction_metrics.measure_hit_rates(
            grid_prediction, timed_pts, coverages, 'ground_truth_coverage')
        self.assertEqual(hit_rates_default, {
            2: 1.0,
            4: 1.0,
            6: 1.0,
            8: 1.0,
            10: 1.0
        })
        self.assertEqual(hit_rates_ground_truth, {0.46187915216703573: 1.0})
Ejemplo n.º 2
0
    def test_make_counting_grid(self):
        """Test counting grid for a base "well-known" scenario.

        Loads a pickled SEPP prediction only to reuse its grid geometry
        (size/region), then checks that the counting matrix built from the
        full dataset matches known offsets, cell size, shape and extrema.
        """
        ## Get grid prediction, to use size and region params.
        # 'with' guarantees the file handle is closed even if pickle.load
        # raises (the original open()/close() pair leaked on error).
        pkl_path = '/Users/anamaria/Desktop/dev/security_project/aggressive_behavior_model/pkl/experiment_seppexp_10_2_siedco_prediction.pkl'
        with open(pkl_path, 'rb') as infile:
            loaded_siedco = pickle.load(infile)
        grid = loaded_siedco['prediction'].values[0]

        ## Select points to represent on counting matrix
        # NOTE(review): hard-coded absolute path — consider a fixture.
        csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        df = pd.read_csv(csv_path)
        data = ProcessData("SIEDCO", csv_path)
        df_input = data.add_timestamp(df)
        timed_pts, _ = ProcessData.get_time_space_points(
            df_input, data.dataset_dict)

        counting_matrix = prediction_metrics.make_counting_grid(
            grid, timed_pts)
        self.assertEqual(counting_matrix.xoffset, 958645.8182116301)
        self.assertEqual(counting_matrix.yoffset, 904338.0678953262)
        self.assertEqual(counting_matrix.xsize, 150)
        self.assertEqual(counting_matrix.ysize, 150)
        self.assertEqual(counting_matrix._matrix.shape, (816, 343))
        self.assertEqual(counting_matrix._matrix.max(), 357)
        self.assertEqual(counting_matrix._matrix.min(), 0)
Ejemplo n.º 3
0
 def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
     """Fit a naive counting-grid model on the given training subset.

     Parameters:
         df_train_subset: DataFrame of training events.
         dataset_dict: dataset field-mapping used by ProcessData.
         grid_size: grid cell width passed to CountingGridKernel.
         **kwargs: optional 'region' (open_cp RectangularRegion) that
             overrides the region derived from the training points.

     Returns the kernel with its .data set (prediction-ready).
     """
     train_pts, train_region = ProcessData.get_time_space_points(
         df_train_subset, dataset_dict)
     # .get avoids a KeyError when no 'region' override is supplied;
     # isinstance(None, ...) is simply False in that case.
     if isinstance(kwargs.get('region'), open_cp.data.RectangularRegion):
         train_region = kwargs['region']
     trained_model = naive.CountingGridKernel(grid_width=grid_size,
                                              region=train_region)
     trained_model.data = train_pts
     return trained_model
Ejemplo n.º 4
0
 def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
     """Fit a SEPP (self-exciting point process) model on the subset.

     Parameters:
         df_train_subset: DataFrame of training events.
         dataset_dict: dataset field-mapping used by ProcessData.
         grid_size: grid cell size passed to SEPPTrainer.
         **kwargs: optional 'region' (open_cp RectangularRegion) that
             overrides the region derived from the training points.

     Returns the trained model with .data set so it can predict.
     """
     train_pts, train_region = ProcessData.get_time_space_points(
         df_train_subset, dataset_dict)
     # .get avoids a KeyError when no 'region' override is supplied.
     if isinstance(kwargs.get('region'), open_cp.data.RectangularRegion):
         train_region = kwargs['region']
     trainer = seppexp.SEPPTrainer(region=train_region, grid_size=grid_size)
     trainer.data = train_pts
     trained_model = trainer.train(iterations=50, use_corrected=True)
     # The trained model needs the event data attached for prediction.
     trained_model.data = train_pts
     return trained_model
Ejemplo n.º 5
0
 def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
     """Fit a KDE model (exponential time kernel, Gaussian space kernel).

     Parameters:
         df_train_subset: DataFrame of training events.
         dataset_dict: dataset field-mapping used by ProcessData.
         grid_size: grid cell size passed to kde.KDE.
         **kwargs: optional 'region' (open_cp RectangularRegion) that
             overrides the region derived from the training points.

     Returns the configured KDE model with .data set.
     """
     train_pts, train_region = ProcessData.get_time_space_points(
         df_train_subset, dataset_dict)
     # .get avoids a KeyError when no 'region' override is supplied.
     if isinstance(kwargs.get('region'), open_cp.data.RectangularRegion):
         train_region = kwargs['region']
     trained_model = kde.KDE(region=train_region, grid_size=grid_size)
     trained_model.time_kernel = kde.ExponentialTimeKernel(1)
     trained_model.space_kernel = kde.GaussianBaseProvider()
     trained_model.data = train_pts
     return trained_model
Ejemplo n.º 6
0
    def inner_loop_validation(self, model_name, grid_size, train_subset_dates,current_validation_date):
        """Train one model on a date window and validate it on a single day.

        Looks up the model class by name in globals(), trains it on the
        rows between train_subset_dates['initial'] and ['final'], then
        steps through current_validation_date in fixed-length hour
        intervals, producing one prediction per interval.

        Returns an array of [interval_start, interval_end,
        average_prediction, eval_pts] rows accumulated via
        ProcessData.fill_array.
        """
        # NOTE(review): assumes model_name is a class defined at module
        # scope with a no-arg constructor and a .train(...) method.
        model_object = globals()[model_name]()
        df_train_subset = ProcessData.filter_by_date(self.df_train_validation,
                                                     self.dataset_dict,
                                                     train_subset_dates['initial'],
                                                     train_subset_dates['final'])
        # week_day/region kwargs are consumed only by trainers that need
        # them (e.g. the weekday-filtered SEPP trainer).
        trained_model = model_object.train(df_train_subset, self.dataset_dict,
                                            grid_size,
                                            week_day= current_validation_date.strftime("%A"),
                                            region= self.region)
        print(len(trained_model.data.timestamps))
        ## TODO: save trained_model?

        ### validation
        interval_duration = 6 ## TODO: set this var as a parameter
        # Validation window is the single day current_validation_date.
        validation_dates = {'initial':current_validation_date,'final':current_validation_date}
        df_validation = ProcessData.filter_by_date(self.df_train_validation,
                                                   self.dataset_dict,
                                                   validation_dates['initial'],
                                                   validation_dates['final'])
        prediction_date = current_validation_date
        # flag_array marks the first append into prediction_results (see
        # ProcessData.fill_array).
        flag_array = True
        prediction_results = np.array([])

        # One prediction per interval_duration-hour slice of the day.
        for interval_hour_start in range(0, 24, interval_duration):
            initial_prediction_datetime = prediction_date+timedelta(hours=interval_hour_start)
            final_prediction_datetime = initial_prediction_datetime+timedelta(hours=interval_duration)
            if df_validation.empty: #if no points (e.g. crimes) are reported on data interval
                eval_pts = []
            else:
                validation_pts, _ = ProcessData.get_time_space_points(df_validation,
                                                                      self.dataset_dict)
                # Keep only the points that fall inside this hour interval.
                eval_pts = ValidateModel.select_timed_points(prediction_date,
                                                             interval_hour_start,
                                                             interval_duration,
                                                             validation_pts)

            prediction_by_hour = ValidateModel.predict_on_interval(initial_prediction_datetime,
                                                                   interval_duration,
                                                                   model_object,
                                                                   trained_model)
            average_prediction = ValidateModel.interval_average_prediction(prediction_by_hour)

            element = np.array([initial_prediction_datetime,
                                final_prediction_datetime,
                                average_prediction,
                                eval_pts])
            flag_array, prediction_results = ProcessData.fill_array(flag_array,
                                                                    prediction_results,
                                                                    element)
        return prediction_results
Ejemplo n.º 7
0
    def test_hit_rate_2(self):
        """Hit rate must be 0.0 when no evaluation event falls on a hotspot.

        Predicts hotspots from events in one locality (SUBA) and evaluates
        against events from a disjoint locality (BOSA), so no evaluation
        point can land on a predicted cell.
        """
        # NOTE(review): hard-coded absolute path — consider a fixture or
        # environment variable so the suite runs on other machines.
        csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        df = pd.read_csv(csv_path)
        data = ProcessData("SIEDCO", csv_path)
        df_input = data.add_timestamp(df)
        date = '2018-01-01'
        dataset_dict = data.dataset_dict
        df_input = ProcessData.filter_by_date(df_input, dataset_dict, date,
                                              date)
        # Two geographically disjoint subsets of the same day.
        df_1 = ProcessData.filter_by_field(df_input, 'LOCALIDAD', 'SUBA')
        df_2 = ProcessData.filter_by_field(df_input, 'LOCALIDAD', 'BOSA')

        # Predict using only SUBA events.
        timed_pts, region = ProcessData.get_time_space_points(
            df_1, data.dataset_dict)
        counting_kernel = open_cp.naive.CountingGridKernel(grid_width=150,
                                                           region=region)
        counting_kernel.data = timed_pts
        grid_prediction = counting_kernel.predict()

        # Evaluate using only BOSA events.
        coverages = [2, 4, 6, 8, 10]
        eval_pts, _ = ProcessData.get_time_space_points(
            df_2, data.dataset_dict)
        hit_rates_default = prediction_metrics.measure_hit_rates(
            grid_prediction, eval_pts, coverages, 'default')
        hit_rates_ground_truth = prediction_metrics.measure_hit_rates(
            grid_prediction, eval_pts, coverages, 'ground_truth_coverage')
        self.assertEqual(hit_rates_default, {
            2: 0.0,
            4: 0.0,
            6: 0.0,
            8: 0.0,
            10: 0.0
        })
        self.assertEqual(hit_rates_ground_truth, {0.6632653061224489: 0.0})
Ejemplo n.º 8
0
    def train(self, df_train_subset, dataset_dict, grid_size, **kwargs):
        """Fit a SEPP model on events filtered to one weekday.

        Parameters:
            df_train_subset: DataFrame of training events; must contain a
                'TIME_STAMP' datetime column.
            dataset_dict: dataset field-mapping used by ProcessData.
            grid_size: grid cell size passed to SEPPTrainer.
            **kwargs: required 'week_day' (e.g. "Monday") used to filter
                training events; optional 'region' (open_cp
                RectangularRegion) overriding the derived region.

        Returns the trained model with .data set so it can predict.
        """
        # Work on a copy so the caller's DataFrame is not mutated: the
        # original assigned the 'weekday' column directly on the argument.
        df_train_subset = df_train_subset.copy()
        df_train_subset['weekday'] = df_train_subset['TIME_STAMP'].dt.day_name()
        df_train_subset = ProcessData.filter_by_field(df_train_subset,
                                                      'weekday',
                                                      kwargs['week_day'])

        train_pts, train_region = ProcessData.get_time_space_points(
            df_train_subset, dataset_dict)
        # .get avoids a KeyError when no 'region' override is supplied.
        if isinstance(kwargs.get('region'), open_cp.data.RectangularRegion):
            train_region = kwargs['region']
        trainer = seppexp.SEPPTrainer(region=train_region, grid_size=grid_size)
        trainer.data = train_pts
        trained_model = trainer.train(iterations=50, use_corrected=True)
        # The trained model needs the event data attached for prediction.
        trained_model.data = train_pts
        return trained_model
Ejemplo n.º 9
0
    def test_mse_1(self):
        """MSE must be 0 when prediction and ground truth coincide.

        Builds a counting-grid prediction from the full dataset and
        computes the MSE against the same points, which should be exact.
        """
        # NOTE(review): hard-coded absolute path — consider a fixture or
        # environment variable so the suite runs on other machines.
        csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        df = pd.read_csv(csv_path)
        data = ProcessData("SIEDCO", csv_path)
        df_input = data.add_timestamp(df)
        timed_pts, region = ProcessData.get_time_space_points(
            df_input, data.dataset_dict)

        counting_kernel = open_cp.naive.CountingGridKernel(grid_width=150,
                                                           region=region)
        counting_kernel.data = timed_pts
        grid_prediction = counting_kernel.predict()
        mse = prediction_metrics.mse(grid_prediction, timed_pts)
        self.assertEqual(mse, 0)