Example #1
    def test_repartition(self):
        mock_dataframe = self._generate_mock_dataframe()
        mock_dataframe.repartition = Mock()
        a_dataset = SparkDataset("my dataset", mock_dataframe)

        a_dataset.repartition(42)
        mock_dataframe.repartition.assert_called_with(42)
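Every example on this page calls a shared _generate_mock_dataframe helper that is not shown here. A minimal sketch of what such a helper might look like, assuming the tests are built on unittest and unittest.mock; the helper name and import path come from the examples, but its body is an assumption:

    from unittest import TestCase
    from unittest.mock import MagicMock, Mock, patch

    from irishep.datasets.spark_dataset import SparkDataset


    class TestSparkDataset(TestCase):
        def _generate_mock_dataframe(self):
            # A MagicMock stands in for a pyspark.sql.DataFrame. Including a
            # "dataset" column is an assumption, inferred from Example #9,
            # where removing it forces the constructor to call withColumn.
            mock_dataframe = MagicMock()
            mock_dataframe.columns = ["dataset", "run", "luminosityBlock", "event"]
            return mock_dataframe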
Example #2
    def test_count(self):
        mock_dataframe = self._generate_mock_dataframe()
        mock_dataframe.count = Mock(return_value=42)
        a_dataset = SparkDataset("my dataset", mock_dataframe)
        count = a_dataset.count()
        self.assertEqual(42, count)
        mock_dataframe.count.assert_called_once()
Example #3
    def test_show(self):
        mock_dataframe = self._generate_mock_dataframe()
        mock_dataframe.show = Mock()
        a_dataset = SparkDataset("my dataset", mock_dataframe)
        a_dataset.show()

        mock_dataframe.show.assert_called_once()
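Examples #1 through #3 all exercise simple delegation: SparkDataset forwards the call to the wrapped dataframe. A plausible reconstruction of the three methods; whether repartition stores the repartitioned dataframe back is not pinned down by the test and is an assumption:

    def repartition(self, num_partitions):
        # Delegate to Spark; keeping the result is an assumption
        self.dataframe = self.dataframe.repartition(num_partitions)

    def count(self):
        # Spark action: triggers evaluation and returns the row count
        return self.dataframe.count()

    def show(self):
        # Print the first rows of the underlying dataframe
        self.dataframe.show()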
Example #4
    def test_select_non_arrow_type(self):
        mock_dataframe = self._generate_mock_dataframe()
        mock_dataframe2 = self._generate_mock_dataframe()
        mock_dataframe2.dtypes = [('Muon_tightId', 'array<boolean>'),
                                  ('event', 'string'),
                                  ('luminosityBlock', 'string'),
                                  ("dataset", "string"), ('run', 'string')]
        mock_dataframe.columns = [
            'Muon_tightId', 'event', 'luminosityBlock', "dataset", 'run'
        ]

        # Mock dataframe subscripting to get columns
        muon_tight_id_mock = MagicMock()
        mock_dataframe2.__getitem__.side_effect = [
            muon_tight_id_mock, 'dataset', 'event', 'luminosityBlock', 'run'
        ]

        mock_dataframe.select = Mock(return_value=mock_dataframe2)

        a_dataset = SparkDataset("my dataset", mock_dataframe)

        a_dataset2 = a_dataset.select_columns(
            ["dataset", "run", "luminosityBlock", "event", "Muon_tightId"])

        self.assertEqual(mock_dataframe2, a_dataset2.dataframe)
        self.assertEqual("my dataset", a_dataset2.name)

        muon_tight_id_mock.cast.assert_called_with("array<int >")
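This test constrains how select_columns deals with column types Arrow cannot serialize: the dtypes of the selected dataframe are inspected and the array<boolean> column is recast. The asserted argument "array<int >" (note the stray space) is consistent with a plain string replacement of "boolean" by "int " in the dtype. A hypothetical helper along those lines; the name _cast_non_arrow_types and its exact wiring into select_columns are assumptions:

    def _cast_non_arrow_types(self, dataframe):
        # Hypothetical: rebuild the column list, recasting boolean arrays
        # (which pyarrow can't serialize) to integer arrays. The string
        # replacement reproduces the "array<int >" seen in the assertion.
        columns = []
        for name, dtype in dataframe.dtypes:
            if "boolean" in dtype:
                columns.append(dataframe[name].cast(dtype.replace("boolean", "int ")))
            else:
                columns.append(dataframe[name])
        return dataframe.select(columns)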
Example #5
    def test_udf_arguments(self):
        mock_dataframe = self._generate_mock_dataframe()
        mock_dataframe.columns = [
            "dataset", "run", "luminosityBlock", "event", "nElectrons",
            "Electron_pt", "Electron_eta", "nMuons", "Muon_pt", "Muon_eta"
        ]
        a_dataset = SparkDataset("my dataset", mock_dataframe)
        result = a_dataset.udf_arguments(["Electron"])
        self.assertEqual(
            ['dataset', 'nElectrons', 'Electron_pt', 'Electron_eta'], result)
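The expected result puts 'dataset' first, followed by every column belonging to the requested physics objects. A sketch consistent with that, assuming udf_arguments builds on columns_for_physics_objects (tested in Example #6):

    def udf_arguments(self, physics_objects):
        # "dataset" is always prepended to the physics-object columns
        return ["dataset"] + self.columns_for_physics_objects(physics_objects)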
Example #6
    def test_columns_for_physics_objects(self):
        mock_dataframe = self._generate_mock_dataframe()
        mock_dataframe.columns = [
            "dataset", "run", "luminosityBlock", "event", "nElectrons",
            "Electron_pt", "Electron_eta", "nMuons", "Muon_pt", "Muon_eta"
        ]
        a_dataset = SparkDataset("my dataset", mock_dataframe)
        rslt = a_dataset.columns_for_physics_objects(["Electron", "Muon"])
        self.assertEqual(rslt, [
            'nElectrons', 'Electron_pt', 'Electron_eta', 'nMuons', 'Muon_pt',
            'Muon_eta'
        ])
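The expected list keeps the dataframe's column order and includes both the count column (nElectrons, nMuons) and the per-object columns. A simple prefix match reproduces that result; the exact matching rule is an assumption:

    def columns_for_physics_objects(self, physics_objects):
        # For each object, keep its count column (n<Object>...) and any
        # column named <Object>_*; order follows the dataframe's columns
        result = []
        for obj in physics_objects:
            result.extend(c for c in self.dataframe.columns
                          if c.startswith(obj + "_") or c.startswith("n" + obj))
        return result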
Example #7
    def read_files(self, dataset_name, files):
        result_df = None
        # Sparkroot can't handle a list of files
        for file in files:
            file_df = self.spark.read.format("org.dianahep.sparkroot") \
                .option("tree", "Events") \
                .load(file)

            # So just union each file's dataframe into one big one
            result_df = file_df if result_df is None else result_df.union(file_df)

        dataset = SparkDataset(dataset_name, result_df)
        dataset.repartition(self.num_partitions)

        return dataset
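A hypothetical call site, assuming read_files lives on a reader object that carries a SparkSession and a num_partitions setting; the variable names and file paths are illustrative only:

    # Hypothetical usage: build one SparkDataset from several ROOT files
    dataset = reader.read_files("DoubleMuon",
                                ["/data/file1.root", "/data/file2.root"])
    print(dataset.count())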
Example #8
    def test_select_provide_technical_fields(self):
        mock_dataframe = self._generate_mock_dataframe()
        mock_dataframe2 = self._generate_mock_dataframe()
        mock_dataframe.select = Mock(return_value=mock_dataframe2)

        a_dataset = SparkDataset("my dataset", mock_dataframe)

        a_dataset2 = a_dataset.select_columns(
            ["dataset", "run", "luminosityBlock", "event", "Electron_pt"])

        self.assertEqual(mock_dataframe2, a_dataset2.dataframe)
        self.assertEqual("my dataset", a_dataset2.name)

        # The actual order of the selected columns is hard to predict. Use
        # sorted column names to test
        call_args = mock_dataframe.select.call_args[0][0]
        self.assertEqual(
            sorted(
                ["dataset", "run", "luminosityBlock", "event", "Electron_pt"]),
            sorted(call_args))
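The comment in the test is the give-away: comparing sorted column lists implies select_columns builds its select argument from a Python set, presumably to union the requested columns with the always-selected technical fields without duplicates. A sketch consistent with that; TECHNICAL_FIELDS is an assumed module-level constant, and the non-Arrow-type handling from Example #4 is omitted here:

    TECHNICAL_FIELDS = {"dataset", "run", "luminosityBlock", "event"}

    def select_columns(self, columns):
        # A set union deduplicates and guarantees the technical fields are
        # always selected; it also makes the argument order unpredictable,
        # which is why the test above compares sorted lists
        wanted = set(columns) | TECHNICAL_FIELDS
        return SparkDataset(self.name, self.dataframe.select(list(wanted)))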
Example #9
    def test_constructor(self):
        # Mocking with the spark sql lit function is a bit tricky. We patch
        # _active_spark_context to handle the interactions with the JVM
        mock_lit = Mock()
        with patch('pyspark.SparkContext._active_spark_context', new=mock_lit):
            mock_dataframe = self._generate_mock_dataframe()
            mock_dataframe.columns = ['run', 'event']
            mock_dataframe.withColumn = Mock(return_value=mock_dataframe)
            a_dataset = SparkDataset("my dataset", mock_dataframe)
            self.assertEqual(a_dataset.name, "my dataset")
            self.assertEqual(a_dataset.dataframe, mock_dataframe)
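Together with Example #14, this test suggests the constructor tags every row with the dataset name via lit when no "dataset" column exists yet. A sketch; the guard condition is inferred from the two tests rather than shown in them:

    from pyspark.sql.functions import lit

    class SparkDataset:
        def __init__(self, name, dataframe):
            self.name = name
            # Add a literal "dataset" column unless the dataframe already
            # carries one (Example #14 covers that branch)
            if "dataset" not in dataframe.columns:
                dataframe = dataframe.withColumn("dataset", lit(name))
            self.dataframe = dataframe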
Example #10
    def test_execute_udf(self):
        mock_udf_handle = Mock()
        with patch('irishep.datasets.spark_dataset.pandas_udf',
                   return_value=mock_udf_handle) as mock_udf:
            mock_dataframe = self._generate_mock_dataframe()
            mock_dataframe.columns = [
                'dataset', "Electron_pdgId", "Electron_pfRelIso03_all",
                "nMuon", "Muon_pt", "Muon_eta"
            ]

            mock_dataframe.select = Mock()
            user_func = Mock(UserDefinedFunction)
            user_func.physics_objects = ["Electron", "Muon"]
            user_func.function = Mock()
            a_dataset = SparkDataset("my dataset", mock_dataframe)

            a_dataset.execute_udf(user_func)

            mock_udf.assert_called_with(user_func.function, DoubleType(),
                                        PandasUDFType.SCALAR)
            mock_udf_handle.assert_called_with('dataset', 'Electron_pdgId',
                                               'Electron_pfRelIso03_all',
                                               'nMuon', 'Muon_pt', 'Muon_eta')
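The assertions fix both halves of execute_udf: the user's function is wrapped as a scalar pandas UDF returning DoubleType, and the resulting handle is applied to the column names produced by udf_arguments. A sketch under those assumptions; what happens to the select result is not constrained by the test:

    from pyspark.sql.functions import pandas_udf, PandasUDFType
    from pyspark.sql.types import DoubleType

    def execute_udf(self, user_func):
        # Wrap the plain function as a scalar pandas UDF, then apply it to
        # the dataset column plus the physics-object columns
        udf = pandas_udf(user_func.function, DoubleType(), PandasUDFType.SCALAR)
        args = self.udf_arguments(user_func.physics_objects)
        return self.dataframe.select(udf(*args))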
Example #11
    def test_count_column_for_physics_object(self):
        mock_dataframe = self._generate_mock_dataframe()
        a_dataset = SparkDataset("my dataset", mock_dataframe)
        self.assertEqual("nElectron",
                         a_dataset.count_column_for_physics_object("Electron"))
Example #12
    def test_columns_with_types(self):
        mock_dataframe = self._generate_mock_dataframe()
        mock_dataframe.dtypes = [('a', 'int'), ('b', 'string')]
        a_dataset = SparkDataset("my dataset", mock_dataframe)
        cols = a_dataset.columns_with_types
        self.assertEqual(cols, [('a', 'int'), ('b', 'string')])
Example #13
    def test_columns(self):
        mock_dataframe = self._generate_mock_dataframe()
        mock_dataframe.columns = ['dataset', 'a', 'b', 'c']
        a_dataset = SparkDataset("my dataset", mock_dataframe)
        cols = a_dataset.columns
        self.assertEqual(cols, ['dataset', 'a', 'b', 'c'])
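Examples #11 through #13 pin down three thin accessors. Reconstructed below; the property decorators are inferred from the attribute-style access in the tests:

    @property
    def columns(self):
        return self.dataframe.columns

    @property
    def columns_with_types(self):
        return self.dataframe.dtypes

    def count_column_for_physics_object(self, physics_object):
        # Per-event counter columns follow the "n" + object naming convention
        return "n" + physics_object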
Example #14
    def test_constructor_with_dataset_name_in_dataframe(self):
        mock_dataframe = self._generate_mock_dataframe()
        a_dataset = SparkDataset("my dataset", mock_dataframe)
        self.assertEqual(a_dataset.name, "my dataset")
        self.assertEqual(a_dataset.dataframe, mock_dataframe)