def process(self, cardo_context: CardoContextBase, cardo_dataframe: CardoDataFrame,
            gt: CardoDataFrame) -> CardoDataFrame:
    dataframe = cardo_dataframe.dataframe
    ground_truth_dataframe = gt.dataframe
    # Compare the step output against the ground truth and derive precision.
    true_positive_count, all_positives_count = self.__get_intersections(
        dataframe, ground_truth_dataframe)
    precision_value = self.__get_precision_value(true_positive_count, all_positives_count)
    # Optionally expose the computed precision as a literal column on the output.
    if self.precision_column:
        cardo_dataframe.dataframe = dataframe.withColumn(
            self.precision_column, F.lit(precision_value))
    source_name = self.source_name if self.source_name else cardo_dataframe.table_name
    cardo_context.logger.info(
        f"precision calculation for {source_name} -> "
        f"matches: {true_positive_count}, intersection: {all_positives_count}, "
        f"precision: {precision_value}, "
        f"is friendly: {self.friendly_precision}",
        extra={
            "gt_match": true_positive_count,
            "id": f"{self.match_column}_for_{self.intersection_column}_{gt.table_name}",
            "log_type": self.log_type,
            "count": all_positives_count,
            "statistic_value": precision_value,
            "statistic_type": "precision",
            "table_name": source_name,
            "base_table": gt.table_name,
        })
    return cardo_dataframe
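# The two private helpers called above are not shown in this section. Below is a
# minimal sketch of what they might look like, inferred only from their call
# sites and the log fields; the helper names appear above, but every line of
# these bodies is an assumption, not the library's actual implementation.
def __get_intersections(self, dataframe, ground_truth_dataframe):
    # Hypothetical: rows that overlap the ground truth on the intersection
    # column form the denominator; rows whose match column is truthy (assumed
    # boolean here) form the numerator.
    intersection = dataframe.join(
        ground_truth_dataframe, on=self.intersection_column, how='inner')
    all_positives_count = intersection.count()
    true_positive_count = intersection.filter(F.col(self.match_column)).count()
    return true_positive_count, all_positives_count

def __get_precision_value(self, true_positive_count, all_positives_count):
    # Hypothetical: guard against division by zero when nothing intersects.
    if all_positives_count == 0:
        return 0.0
    return float(true_positive_count) / all_positives_count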
def inner(self: IStep, cardo_context: CardoContextBase,
          cardo_dataframe: CardoDataFrame) -> CardoDataFrame:
    df = cardo_dataframe.dataframe
    last_run = None
    # If the target table already exists, anti-join against it so the wrapped
    # step only processes rows it has not seen before.
    if any(table_name == table.name
           for table in Catalog(cardo_context.spark).listTables(schema_name)):
        last_run = cardo_context.spark.table(full_table_name)
        df = df.join(last_run, on=df.columns, how='left_anti')
    current_results = func(
        self, cardo_context, CardoDataFrame(df, cardo_dataframe.table_name))
    # Persist the fresh results: create the table on the first run, append afterwards.
    if last_run is None:
        HiveWriter(full_table_name).process(cardo_context, current_results)
    else:
        HiveWriter(full_table_name, mode='append').process(cardo_context, current_results)
    # Re-read the persisted table and join it back onto the input columns to
    # attach the computed columns to the returned dataframe.
    cardo_dataframe.dataframe = df.join(
        HiveReader(full_table_name).process(cardo_context).dataframe,
        on=df.columns)
    return cardo_dataframe
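# `inner` closes over `func`, `table_name`, `schema_name`, and `full_table_name`,
# so it is presumably the body of a decorator that caches step results in Hive.
# A minimal sketch of what the enclosing decorator might look like, assuming
# those names are derived from its arguments (the wrapper's name and shape are
# assumptions, not the library's actual API):
import functools

def cache_step_to_hive(schema_name: str, table_name: str):
    full_table_name = f"{schema_name}.{table_name}"

    def decorator(func):
        @functools.wraps(func)
        def inner(self, cardo_context, cardo_dataframe):
            ...  # the body shown above
        return inner

    return decorator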
def test_table_with_multiple_partitions(self):
    dataset = CardoDataFrame(
        self.context.spark.createDataFrame([['a'], ['b']], 'col1: string'))
    dataset.dataframe = dataset.dataframe.repartition(10)
    acc_test = StepAccumulatorRuntimeTest(lambda x: x, 'unittest')
    acc_test.test(self.context, dataset).dataframe.collect()
    self.assertEqual(
        2, sum(map(lambda record: record.pm_value, self.log_handler.records)))
def test_set_dataframe(self):
    first_dataset = self.context.spark.createDataFrame([['a']], 'column1: string')
    second_dataset = self.context.spark.createDataFrame([['aa']], 'column1: string')
    cardo_dataframe = CardoDataFrame(first_dataset, '6')
    cardo_dataframe.dataframe = second_dataset
    self.assertItemsEqual(second_dataset.collect(), cardo_dataframe.dataframe.collect())
    self.assertItemsEqual(second_dataset.collect(), cardo_dataframe.rdd.collect())
def test_unpersist_df(self):
    # Arrange
    df = self.context.spark.createDataFrame([['a']], 'column1: string')
    second_df = self.context.spark.createDataFrame([['b']], 'column1: string')
    cardo_dataframe = CardoDataFrame(df, '')
    cardo_dataframe.persist()
    cardo_dataframe.dataframe = second_df
    # Act
    cardo_dataframe.unpersist()
    # Assert
    self.assertFalse(df.is_cached)
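# The two tests above pin down CardoDataFrame's contract: assigning a new
# DataFrame keeps the derived `rdd` view in sync, and `unpersist` releases the
# originally persisted DataFrame even after the wrapped DataFrame is replaced.
# A minimal sketch of a wrapper satisfying that contract; the real class's
# internals are not shown in this section, so every detail below is an assumption.
class CardoDataFrameSketch:
    def __init__(self, dataframe, table_name=''):
        self._dataframe = dataframe
        self.table_name = table_name
        self._persisted = []  # every DataFrame persisted through this wrapper

    @property
    def dataframe(self):
        return self._dataframe

    @dataframe.setter
    def dataframe(self, new_dataframe):
        self._dataframe = new_dataframe

    @property
    def rdd(self):
        # Derived from the current DataFrame, so it tracks reassignment.
        return self._dataframe.rdd

    def persist(self):
        self._persisted.append(self._dataframe.persist())

    def unpersist(self):
        # Release everything persisted via this wrapper, not just the
        # currently held DataFrame.
        for df in self._persisted:
            df.unpersist()
        self._persisted = []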