def validate(self, feature_set: FeatureSet, dataframe: DataFrame,
                 spark_client: SparkClient) -> None:
        """Calculate dataframe rows to validate data into Feature Store.

        Args:
            feature_set: object processed with feature set information.
            dataframe: spark dataframe containing data from a feature set.
            spark_client: client for spark connections with external services.

        Raises:
            AssertionError: if count of written data doesn't match count in current
                feature set dataframe.
        """
        if self.interval_mode and not self.debug_mode:
            table_name = os.path.join(
                "historical", feature_set.entity, feature_set.name
            )
            written_count = spark_client.read(
                self.db_config.format_,
                path=self.db_config.get_path_with_partitions(
                    table_name, self._create_partitions(dataframe)
                ),
            ).count()
        else:
            table_name = (
                f"{self.database}.{feature_set.name}"
                if not self.debug_mode
                else f"historical_feature_store__{feature_set.name}"
            )
            written_count = spark_client.read_table(table_name).count()

        dataframe_count = dataframe.count()

        self._assert_validation_count(table_name, written_count,
                                      dataframe_count)
def check_existence(df: SparkDataFrame) -> bool:
    """Check given data frame has data.

    :param df: the data frame to be checked
    :return: True if data frame has data otherwise False
    """
    return df.count() > 0
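A minimal usage sketch for check_existence, assuming an active SparkSession named spark; the id column is illustrative only.
empty_df = spark.createDataFrame([], "id INT")                 # no rows
populated_df = spark.createDataFrame([(1,), (2,)], "id INT")   # two rows

assert check_existence(empty_df) is False
assert check_existence(populated_df) is True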
Example 3
def print_data_info(data: DataFrame,
                    file_name: str = '',
                    isDetailed: bool = False):
    """
        Prints spark i94project frame description

        :param isDetailed:
        :param file_name:
        :param data: spark i94project frame
        :return: none
    """
    # if verbose_mode:
    print('----------------------------------------')
    print(f'\r| Data {file_name} info:')
    print('\r| Schema')
    data.printSchema()
    if isDetailed:
        print('\r| Types')
        print(data.dtypes)
        # print('\r| Describe')
        # print(data.describe().show())
        print('\r| First rows')
        data.show(n=10)
        print('\r| Row count: {}'.format(data.count()))

    print('----------------------------------------')
    print('\n')
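A hedged usage sketch; the SparkSession spark, the columns and the file name are assumptions made for illustration.
sample_df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
print_data_info(sample_df, file_name="i94_sample.parquet", isDetailed=True)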
Example 4
def join_park_violation_with_centerline(df_park_violation: DataFrame,
                                        df_centerline: DataFrame) -> DataFrame:
    """
    Joining park_violation dataframe and centerline datafrmae based on borocode, street name and house number

    Basic steps:
    1. joined odd house numbers with L_LOW_HN & L_HIGH_HN of centerline data
    2. joined even house numbers with R_LOW_HN & R_HIGH_HN of centerline data
    3. Also other criteria was borocode and street name to join the data

    :param df_park_violation:
    :param df_centerline:
    :return:
    """
    # df_park_violation = df_park_violation.repartition("BOROCODE", "Street Name", "House Number")
    # df_centerline.cache()
    df_park_violation.cache()
    df_centerline.cache()

    # Split the violations by house-number parity: even house numbers are matched
    # against the right side of the centerline (R_LOW_HN/R_HIGH_HN), odd ones
    # against the left side (L_LOW_HN/L_HIGH_HN).
    df_park_violation_odd = df_park_violation.filter(F.col("temp") % 2 != 0)
    df_park_violation_even = df_park_violation.filter(F.col("temp") % 2 == 0)
    df_centerline.count()  # materialize the cached centerline dataframe

    df_joined_1 = (df_park_violation_even.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("Street Name") == F.col("ST_NAME")) |
         (F.col("Street Name") == F.col("FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & ((F.col("park.House Number") >= F.col("centerline.R_LOW_HN"))
           & (F.col("park.House Number") <= F.col("centerline.R_HIGH_HN"))),
    ).select("total_cnt", "year", "PHYSICALID"))

    df_joined_2 = (df_park_violation_odd.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("Street Name") == F.col("ST_NAME")) |
         (F.col("Street Name") == F.col("FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & ((F.col("park.House Number") >= F.col("centerline.L_LOW_HN"))
           & (F.col("park.House Number") <= F.col("centerline.L_LOW_HN"))),
    ).select("total_cnt", "year", "PHYSICALID"))
    """returing union of 2 dataframes"""
    return df_joined_1.unionAll(df_joined_2)
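The parity filters above rely on a numeric temp column; below is a minimal sketch of how it could be derived upstream from the House Number string (an assumption, not part of the original pipeline).
# Hypothetical preparation step: strip non-digits from "House Number" and cast
# to int so the `% 2` parity filters above can be applied.
df_park_violation = df_park_violation.withColumn(
    "temp", F.regexp_replace(F.col("House Number"), "[^0-9]", "").cast("int")
)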
Example 5
def assert_results(result: DataFrame) -> None:
    """
    Shared asserts for the different formats of CSV file, all of which contain the same data.
    """
    # Assert
    assert result.count() == 3

    second_row = result.collect()[1]
    assert second_row[0] == "2"
    assert second_row[1] == "bar"
    assert second_row[2] == "bar2"
Example 6
def spark_generate_medium_level_report(
        self, results_df: DataFrame) -> pd.DataFrame:
    """Aggregate per-column detector results into a pandas report dataframe."""
    columns = results_df.columns
    # Run the detectors on every row and sum the per-detector hit counts.
    detector_results = results_df.rdd.flatMap(
        lambda row: self._get_detector_results(row, columns)).reduceByKey(
            lambda acc, nxt: acc + nxt).collect()
    report_detectors = self.__get_list_of_detectors(detector_results)
    num_rows = results_df.count()
    # Build one pandas Series per column holding its detection statistics.
    pd_columns = []
    for column in columns:
        detection_stats = self.__get_detection_stats(
            column, report_detectors, detector_results, num_rows)
        pd_columns.append(
            pd.Series(data=detection_stats,
                      index=report_detectors,
                      name=column))
    report_df = pd.concat(pd_columns, axis=1).fillna(0)
    return report_df
Example 7
def test_all_same(spark: SparkSession, df1: DataFrame) -> None:
    """[Compare a dataframe against itself. Expect no differences]

    Args:
        spark (SparkSession): [Spark session]
        df1 (DataFrame): [A spark dataframe]
    """
    dfResult = dfc.compareDfs(
        spark,
        df1,
        df1,
        tolerance=0.1,
        keysLeft="letters",
        keysRight="letters",
        colExcludeList=[],
        joinType="full_outer",
    )
    pass_count = dfResult.filter("PASS == True").count()
    overall_count = dfResult.count()
    assert pass_count == overall_count
    assert df1.count() == overall_count
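A hedged sketch of the df1 fixture this test might receive; only the letters key column is implied by the keysLeft/keysRight arguments, everything else is assumed.
import pytest

@pytest.fixture
def df1(spark: SparkSession) -> DataFrame:
    # Hypothetical fixture: a small frame keyed on "letters".
    return spark.createDataFrame([("a", 1.0), ("b", 2.0)], ["letters", "value"])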
    def validate(self, feature_set: FeatureSet, dataframe: DataFrame,
                 spark_client: SparkClient) -> None:
        """Calculate dataframe rows to validate data into Feature Store.

        Args:
            feature_set: object processed with feature set information.
            dataframe: spark dataframe containing data from a feature set.
            spark_client: client for spark connections with external services.

        Raises:
            AssertionError: if count of written data doesn't match count in current
                feature set dataframe.

        """
        table_name = (f"{self.database}.{feature_set.name}"
                      if not self.debug_mode else
                      f"historical_feature_store__{feature_set.name}")
        written_count = spark_client.read_table(table_name).count()
        dataframe_count = dataframe.count()
        self._assert_validation_count(table_name, written_count,
                                      dataframe_count)
Example 9
def count_rows(df: DataFrame, verbose=True):
    """
    Counts the rows of a Spark Dataframe and prints out the result and
    execution time.

    :param df:
    :param verbose:
    :return:
    """

    if verbose:
        print('Counting the rows.')

    # Save start time for timing.
    start_time = time.time()

    number_of_records = df.count()
    print(f'\tNumber of records: {number_of_records}.')

    if verbose:
        print(f'\tExecution time: {time.time() - start_time:.5f} s.')

    return number_of_records
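A minimal usage sketch, assuming an active SparkSession named spark.
df = spark.range(1000)            # 1000-row dataframe with a single "id" column
n = count_rows(df, verbose=True)
assert n == 1000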
def read_silver_loans_tbl_defaults(df: DataFrame, logger: Logger):
    """Log the row count of the silver loans defaults table and return the dataframe unchanged."""
    logger.info(df.count())
    return df
def createContent(df: DataFrame) -> str:
    """Build a JSON string with the row count and schema of the given dataframe."""
    content = {}
    content['count'] = df.count()
    content['schema'] = json.loads(df.schema.json())
    return json.dumps(content, sort_keys=False, indent=4, default=str)
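A hedged usage sketch; the SparkSession and columns are illustrative assumptions.
df = spark.createDataFrame([(1, "a")], ["id", "value"])
print(createContent(df))  # JSON string with "count" and "schema" keys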