Example no. 1
def process(self, cardo_context: CardoContextBase,
            cardo_dataframe: CardoDataFrame,
            gt: CardoDataFrame) -> CardoDataFrame:
    """Compute precision against the ground-truth dataframe and log the result."""
    dataframe = cardo_dataframe.dataframe
    ground_truth_dataframe = gt.dataframe
    true_positive_count, all_positives_count = self.__get_intersections(
        dataframe, ground_truth_dataframe)
    precision_value = self.__get_precision_value(true_positive_count,
                                                 all_positives_count)
    # Optionally attach the precision as a literal column on the output.
    # F is pyspark.sql.functions, imported at module level.
    if self.precision_column:
        cardo_dataframe.dataframe = dataframe.withColumn(
            self.precision_column, F.lit(precision_value))
    source_name = self.source_name or cardo_dataframe.table_name
    cardo_context.logger.info(
        f"precision calculation for {source_name} -> "
        f"matches: {true_positive_count}, intersection: {all_positives_count}, "
        f"precision: {precision_value}, "
        f"is friendly: {self.friendly_precision}",
        extra={
            "gt_match": true_positive_count,
            "id": f"{self.match_column}_for_{self.intersection_column}_{gt.table_name}",
            "log_type": self.log_type,
            "count": all_positives_count,
            "statistic_value": precision_value,
            "statistic_type": "precision",
            "table_name": source_name,
            "base_table": gt.table_name,
        })
    return cardo_dataframe
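For orientation, here is a plausible sketch of the precision helper the step calls. The method name comes from the snippet above; the body and the zero-division guard are assumptions, not the actual Cardo implementation.

def __get_precision_value(self, true_positive_count: int,
                          all_positives_count: int) -> float:
    # Hypothetical body: precision as the share of ground-truth matches
    # among the intersected records, guarding against an empty intersection.
    if not all_positives_count:
        return 0.0
    return true_positive_count / all_positives_count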
Example no. 2
def inner(self: IStep, cardo_context: CardoContextBase,
          cardo_dataframe: CardoDataFrame) -> CardoDataFrame:
    # func, schema_name, table_name and full_table_name are closed over
    # from the enclosing decorator (see the sketch after this example).
    df = cardo_dataframe.dataframe
    last_run = None
    # If a previous run was persisted, drop the rows it already processed.
    if any(table_name == table.name
           for table in cardo_context.spark.catalog.listTables(schema_name)):
        last_run = cardo_context.spark.table(full_table_name)
        df = df.join(last_run, on=df.columns, how='left_anti')

    current_results = func(
        self, cardo_context,
        CardoDataFrame(df, cardo_dataframe.table_name))

    # The first run creates the table; later runs append the new results.
    if last_run is None:
        HiveWriter(full_table_name).process(cardo_context, current_results)
    else:
        HiveWriter(full_table_name,
                   mode='append').process(cardo_context, current_results)

    # Read the persisted table back and join it onto the input rows to
    # attach the computed result columns.
    cardo_dataframe.dataframe = df.join(
        HiveReader(full_table_name).process(cardo_context).dataframe,
        on=df.columns)
    return cardo_dataframe
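inner is the closure returned by what appears to be an incremental-caching decorator. A minimal sketch of the enclosing scope, showing where the free variables come from, might look like the following; the decorator's name and parameters are assumptions inferred from the variables inner uses.

import functools

def incremental(schema_name: str, table_name: str):
    # Hypothetical enclosing decorator; only the free-variable names are
    # taken from the snippet above.
    full_table_name = f"{schema_name}.{table_name}"

    def decorator(func):
        @functools.wraps(func)
        def inner(self, cardo_context, cardo_dataframe):
            ...  # body shown above
        return inner

    return decorator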
Example no. 3
def test_table_with_multiple_partitions(self):
    dataset = CardoDataFrame(
        self.context.spark.createDataFrame([['a'], ['b']], 'col1: string'))
    dataset.dataframe = dataset.dataframe.repartition(10)
    acc_test = StepAccumulatorRuntimeTest(lambda x: x, 'unittest')
    acc_test.test(self.context, dataset).dataframe.collect()
    # The two input rows should each be counted exactly once, regardless
    # of how many partitions they are spread across.
    self.assertEqual(
        2,
        sum(record.pm_value for record in self.log_handler.records))
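The test hinges on a count that stays correct across partitions. As a generic illustration of the underlying pattern in plain PySpark (not the Cardo API), assuming a live SparkSession spark and a dataframe df:

# Count rows with a Spark accumulator; partial sums from each partition
# are merged back on the driver after an action runs.
counter = spark.sparkContext.accumulator(0)

def tally(row):
    counter.add(1)
    return row

df.repartition(10).rdd.map(tally).count()  # the action forces evaluation
assert counter.value == df.count()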
Example no. 4
def test_set_dataframe(self):
    first_dataset = self.context.spark.createDataFrame([['a']],
                                                       'column1: string')
    second_dataset = self.context.spark.createDataFrame([['aa']],
                                                        'column1: string')
    cardo_dataframe = CardoDataFrame(first_dataset, '6')
    cardo_dataframe.dataframe = second_dataset
    # Reassigning .dataframe should also refresh the derived .rdd view.
    # assertCountEqual is the Python 3 name for assertItemsEqual.
    self.assertCountEqual(second_dataset.collect(),
                          cardo_dataframe.dataframe.collect())
    self.assertCountEqual(second_dataset.collect(),
                          cardo_dataframe.rdd.collect())
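One way to satisfy this test is to derive .rdd lazily from the current dataframe. The following is a hypothetical sketch, not the actual CardoDataFrame implementation; only the attribute names are taken from the test.

class CardoDataFrame:
    def __init__(self, dataframe, table_name=''):
        self._dataframe = dataframe
        self.table_name = table_name

    @property
    def dataframe(self):
        return self._dataframe

    @dataframe.setter
    def dataframe(self, value):
        self._dataframe = value

    @property
    def rdd(self):
        # Derived from the current dataframe, so it always reflects the
        # latest assignment.
        return self._dataframe.rdd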
Example no. 5
def test_unpersist_df(self):
    # Arrange
    df = self.context.spark.createDataFrame([['a']], 'column1: string')
    second_df = self.context.spark.createDataFrame([['b']],
                                                   'column1: string')
    cardo_dataframe = CardoDataFrame(df, '')
    cardo_dataframe.persist()
    cardo_dataframe.dataframe = second_df

    # Act
    cardo_dataframe.unpersist()

    # Assert
    # The originally persisted dataframe must be released even though it
    # was replaced before unpersist() was called.
    self.assertFalse(df.is_cached)
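Persist bookkeeping consistent with this test could extend the hypothetical class sketched after Example no. 4: remember each dataframe that was persisted so unpersist() can release it even after .dataframe is reassigned. This is an assumption, not the real Cardo implementation, and presumes __init__ also sets self._persisted = [].

def persist(self):
    self._dataframe.persist()
    self._persisted.append(self._dataframe)

def unpersist(self):
    # Release every dataframe this instance ever persisted, not just the
    # one currently assigned.
    while self._persisted:
        self._persisted.pop().unpersist()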