def validate(self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient) -> None:
    """Calculate dataframe rows to validate data into Feature Store.

    Args:
        feature_set: object processed with feature_set informations.
        dataframe: spark dataframe containing data from a feature set.
        spark_client: client for spark connections with external services.

    Raises:
        AssertionError: if count of written data doesn't match count in
            current feature set dataframe.
    """
    if self.interval_mode and not self.debug_mode:
        # Interval mode reads raw files under the historical path, partitioned
        # by the dataframe's partition values.
        table_name = os.path.join("historical", feature_set.entity, feature_set.name)
        partitions = self._create_partitions(dataframe)
        read_path = self.db_config.get_path_with_partitions(table_name, partitions)
        written_count = spark_client.read(self.db_config.format_, path=read_path).count()
    else:
        if self.debug_mode:
            table_name = f"historical_feature_store__{feature_set.name}"
        else:
            table_name = f"{self.database}.{feature_set.name}"
        written_count = spark_client.read_table(table_name).count()

    self._assert_validation_count(table_name, written_count, dataframe.count())
def check_existence(df: SparkDataFrame) -> bool:
    """Check given data frame has data.

    :param df: the data frame to be checked
    :return: True if data frame has data otherwise False
    """
    # A non-zero row count means the frame carries data.
    return df.count() != 0
def print_data_info(data: DataFrame, file_name: str = '', isDetailed: bool = False):
    """
    Prints spark i94project frame description

    :param isDetailed: also print the column dtypes when True
    :param file_name: label shown in the report header
    :param data: spark i94project frame
    :return: none
    """
    separator = '----------------------------------------'

    print(separator)
    print(f'\r| Data {file_name} info:')
    print('\r| Schema')
    data.printSchema()

    if isDetailed:
        print('\r| Types')
        print(data.dtypes)

    print('\r| First rows')
    data.show(n=10)
    print('\r| Row count: {}'.format(data.count()))
    print(separator)
    print('\n')
def join_park_violation_with_centerline(df_park_violation: DataFrame, df_centerline: DataFrame) -> DataFrame:
    """
    Join the park_violation dataframe and centerline dataframe based on
    borocode, street name and house number.

    Basic steps:
        1. odd house numbers are matched against L_LOW_HN & L_HIGH_HN of centerline data
        2. even house numbers are matched against R_LOW_HN & R_HIGH_HN of centerline data
        3. borocode and street name are additional join criteria

    :param df_park_violation: parking violations; "temp" is assumed to hold a
        numeric form of the house number — TODO confirm against caller
    :param df_centerline: centerline reference data
    :return: union of the even- and odd-side joins, with columns
        total_cnt, year, PHYSICALID
    """
    # Both frames are reused twice below, so cache them once up front.
    df_park_violation.cache()
    df_centerline.cache()

    # Split violations by house-number parity: even numbers lie on the
    # right (R_*) side of a centerline segment, odd numbers on the left (L_*).
    df_park_violation_odd = df_park_violation.filter(F.col("temp") % 2 != 0)
    df_park_violation_even = df_park_violation.filter(F.col("temp") % 2 == 0)

    # Action to materialize the centerline cache before the broadcast joins.
    df_centerline.count()

    # Even house numbers: match within the R_LOW_HN..R_HIGH_HN range.
    df_joined_1 = (df_park_violation_even.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("Street Name") == F.col("ST_NAME"))
         | (F.col("Street Name") == F.col("FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & ((F.col("park.House Number") >= F.col("centerline.R_LOW_HN"))
           & (F.col("park.House Number") <= F.col("centerline.R_HIGH_HN"))),
    ).select("total_cnt", "year", "PHYSICALID"))

    # Odd house numbers: match within the L_LOW_HN..L_HIGH_HN range.
    # BUG FIX: the upper bound previously compared against L_LOW_HN again,
    # which collapsed the range check to equality with the low house number;
    # it must use L_HIGH_HN, mirroring the R_* join above.
    df_joined_2 = (df_park_violation_odd.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("Street Name") == F.col("ST_NAME"))
         | (F.col("Street Name") == F.col("FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & ((F.col("park.House Number") >= F.col("centerline.L_LOW_HN"))
           & (F.col("park.House Number") <= F.col("centerline.L_HIGH_HN"))),
    ).select("total_cnt", "year", "PHYSICALID"))

    # Return the union of the even- and odd-side matches.
    return df_joined_1.unionAll(df_joined_2)
def assert_results(result: DataFrame) -> None:
    """
    Shared asserts for the different formats of CSV file, all of which contain
    the same data.

    :param result: dataframe loaded from one of the CSV variants
    :raises AssertionError: if the expected rows are not present
    """
    # Assert
    assert result.count() == 3
    # Collect once instead of re-running the Spark job for every column check
    # (the original called .collect() three times).
    second_row = result.collect()[1]
    assert second_row[0] == "2"
    assert second_row[1] == "bar"
    assert second_row[2] == "bar2"
def spark_generate_medium_level_report(
        self, results_df: DataFrame) -> pd.DataFrame:
    """Build a per-column detector report as a pandas DataFrame.

    Aggregates detector hits across all rows of ``results_df`` and, for each
    column, computes detection statistics indexed by detector name.

    :param results_df: spark dataframe with per-row detector results
    :return: pandas DataFrame (detectors x columns) of detection stats,
        with missing entries filled with 0
    """
    column_names = results_df.columns

    # Flatten per-row detector hits and sum them per (detector, column) key.
    merged_results = (
        results_df.rdd
        .flatMap(lambda row: self._get_detector_results(row, column_names))
        .reduceByKey(lambda left, right: left + right)
        .collect())

    detectors = self.__get_list_of_detectors(merged_results)
    total_rows = results_df.count()

    # One pandas Series per input column, all indexed by the detector list.
    series_per_column = [
        pd.Series(
            data=self.__get_detection_stats(
                name, detectors, merged_results, total_rows),
            index=detectors,
            name=name)
        for name in column_names
    ]

    return pd.concat(series_per_column, axis=1).fillna(0)
def test_all_same(spark: SparkSession, df1: DataFrame) -> None:
    """[Compare a dataframe against itself. Expect no differences]

    Args:
        spark (SparkSession): [Spark session]
        df1 (DataFrame): [A spark dataframe]
    """
    comparison = dfc.compareDfs(
        spark,
        df1,
        df1,
        tolerance=0.1,
        keysLeft="letters",
        keysRight="letters",
        colExcludeList=[],
        joinType="full_outer",
    )

    # Every compared row must pass, and no rows may be gained or lost.
    passing = comparison.filter("PASS == True").count()
    total = comparison.count()
    assert passing == total
    assert df1.count() == total
def validate(self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient) -> None:
    """Calculate dataframe rows to validate data into Feature Store.

    Args:
        feature_set: object processed with feature_set informations.
        dataframe: spark dataframe containing data from a feature set.
        spark_client: client for spark connections with external services.

    Raises:
        AssertionError: if count of written data doesn't match count in
            current feature set dataframe.
    """
    # Debug mode writes to a local table rather than the configured database.
    if self.debug_mode:
        table_name = f"historical_feature_store__{feature_set.name}"
    else:
        table_name = f"{self.database}.{feature_set.name}"

    written_count = spark_client.read_table(table_name).count()
    self._assert_validation_count(table_name, written_count, dataframe.count())
def count_rows(df: DataFrame, verbose=True):
    """
    Counts the rows of a Spark Dataframe and prints out the result and
    execution time.

    :param df: the dataframe whose rows are counted
    :param verbose: if True, also print progress and timing information
    :return: the number of rows
    """
    if verbose:
        print(f'Counting the rows.')

    # Timestamp taken right before the (potentially expensive) count action.
    started_at = time.time()
    number_of_records = df.count()
    print(f'\tNumber of records: {number_of_records}.')

    if verbose:
        print(f'\tExecution time: {time.time() - started_at:.5f} s.')

    return number_of_records
def read_silver_loans_tbl_defaults(df: DataFrame, logger: Logger):
    """Log the row count of the silver loans defaults dataframe and pass it through.

    :param df: dataframe read from the silver loans defaults table
    :param logger: logger that records the row count
    :return: the unchanged input dataframe
    """
    row_count = df.count()
    logger.info(row_count)
    return df
def createContent(df: DataFrame) -> str:
    """Build a JSON summary of a dataframe (row count and schema).

    :param df: dataframe to describe
    :return: pretty-printed JSON string with ``count`` and ``schema`` keys

    Note: the return annotation was previously ``dict``, but the function
    returns the serialized JSON string produced by ``json.dumps`` — fixed
    to ``str``.
    """
    content = {}
    content['count'] = df.count()
    # Round-trip the schema through json so it nests as an object
    # instead of being embedded as an escaped string.
    content['schema'] = json.loads(df.schema.json())
    return json.dumps(content, sort_keys=False, indent=4, default=str)