Python sql.dataframe.DataFrame.count Examples

Programming Language: Python

Namespace/Package Name: pyspark

Method/Function: count

Examples at hotexamples.com: 5

Python sql.dataframe.DataFrame.count - 5 examples found. These are the top rated real world Python examples of pyspark.sql.dataframe.DataFrame.count extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

withColumn(10)

select(9)

join(8)

filter(6)

count(5)

withColumnRenamed(4)

toPandas(3)

agg(2)

groupBy(2)

createOrReplaceTempView(2)

groupby(2)

cache(1)

limit(1)

query(1)

repartition(1)

fillna(1)

drop(1)

Example #1

Show file

File: autosegmentation.py Project: whylabs/whylogs

def _simple_entropy(df: pyspark.sql.dataframe.DataFrame, column_name: str) -> float:
    count = df.count()
    testdf = df.select(column_name).groupby(column_name).agg((F.count(column_name) / count).alias("p"))
    result = testdf.groupby().agg(-F.sum(F.col("p") * F.log2("p"))).collect()[0][0]
    if not result:
        return 0.0
    return result

Example #2

Show file

File: autosegmentation.py Project: whylabs/whylogs

def _find_best_split(
    countdf: pyspark.sql.dataframe.DataFrame,
    prev_split_columns: List[str],
    valid_column_names: List[str],
    target_column_name: str,
    normalization: Optional[Dict[str, int]] = None,
) -> Tuple[float, str]:
    total_count = countdf.count()

    max_score_tuple = 0.0, None
    pre_split_entropy = _weighted_entropy(countdf, total_count, prev_split_columns, target_column_name, True)

    for column_name in valid_column_names:
        if column_name == target_column_name:
            continue
        new_split_columns = prev_split_columns[:]
        new_split_columns.append(column_name)
        post_split_entropy = _weighted_entropy(countdf, total_count, new_split_columns, target_column_name, True)
        value = pre_split_entropy - post_split_entropy

        if normalization and normalization[column_name] > 0:
            value /= math.log(normalization[column_name])

        if value > max_score_tuple[0]:
            max_score_tuple = value, column_name

    return max_score_tuple

Example #3

Show file

File: cltv_spark_and_hive_funcs.py Project: Romanananan/some_code

def get_repartition_value(sdf: pyspark.sql.dataframe.DataFrame,
                          target_size: int = 245,
                          compression: str = 'none') -> int:
    lenght = sdf.count()
    df_1_row = sdf.limit(int(1e4))
    tmp_file_name = 'test_file'
    while check_hdfs_file_ex(tmp_file_name):
        tmp_file_name += '_'
    df_1_row.coalesce(1).write.option('compression', compression)\
        .mode('overwrite').parquet(tmp_file_name)
    row_byte_weight = int(sh.hdfs('dfs', '-du', tmp_file_name)\
        .stdout.decode('utf-8').split('\n')[-2].split(' ')[0])
    sh.hdfs('dfs', '-rm', '-R', '-skipTrash', tmp_file_name)
    nd_rep_val = int(row_byte_weight * lenght / target_size / (1024 * 1024) /
                     1e4)
    return 1 if nd_rep_val < 1 else nd_rep_val

Example #4

Show file

File: autosegmentation.py Project: whylabs/whylogs

def estimate_segments(
    df: pyspark.sql.dataframe.DataFrame,
    target_field: str = None,
    max_segments: int = 30,
    include_columns: List[str] = [],
    unique_perc_bounds: Tuple[float, float] = [None, 0.8],
    null_perc_bounds: Tuple[float, float] = [None, 0.2],
) -> Optional[Union[List[Dict], List[str]]]:
    """
    Estimates the most important features and values on which to segment
    data profiling using entropy-based methods.

    If no target column provided, maximum entropy column is substituted.

    :param df: the dataframe of data to profile
    :param target_field: target field (optional)
    :param max_segments: upper threshold for total combinations of segments,
    default 30
    :param include_columns: additional non-string columns to consider in automatic segmentation. Warning: high cardinality columns will degrade performance.
    :param unique_perc_bounds: tuple of form [lower, upper] with bounds on the percentage of unique values (|unique| / |X|). Upper bound exclusive.
    :param null_perc_bounds: tuple of form [lower, upper] with bounds on the percentage of null values. Upper bound exclusive.
    :return: a list of segmentation feature names
    """
    current_split_columns = []
    segments = []
    segments_used = 1
    max_entropy_column = (float("-inf"), None)

    if not unique_perc_bounds[0]:
        unique_perc_bounds[0] = float("-inf")
    if not unique_perc_bounds[1]:
        unique_perc_bounds[1] = float("inf")
    if not null_perc_bounds[0]:
        null_perc_bounds[0] = float("-inf")
    if not null_perc_bounds[1]:
        null_perc_bounds[1] = float("inf")

    valid_column_names = set()

    count = df.count()

    print("Limiting to categorical (string) data columns...")
    valid_column_names = {col for col in df.columns if (df.select(col).dtypes[0][1] == "string" or col in include_columns)}

    print("Gathering cardinality information...")
    n_uniques = {col: df.agg(F.approx_count_distinct(col)).collect()[0][0] for col in valid_column_names}
    print("Gathering missing value information...")
    n_nulls = {col: df.filter(df[col].isNull()).count() for col in valid_column_names}

    print("Finding valid columns for autosegmentation...")
    for col in valid_column_names.copy():
        null_perc = 0.0 if count == 0 else n_nulls[col] / count
        unique_perc = 0.0 if count == 0 else n_uniques[col] / count
        if (
            col in segments
            or n_uniques[col] <= 1
            or null_perc < null_perc_bounds[0]
            or null_perc >= null_perc_bounds[1]
            or unique_perc < unique_perc_bounds[0]
            or unique_perc >= unique_perc_bounds[1]
        ):
            valid_column_names.remove(col)

    if not valid_column_names:
        return []

    if not target_field:
        print("Finding alternative target field since none were specified...")
        for col in valid_column_names:
            col_entropy = _simple_entropy(df, col)
            if n_uniques[col] > 1:
                col_entropy /= math.log(n_uniques[col])
            if col_entropy > max_entropy_column[0]:
                max_entropy_column = (col_entropy, col)
        target_field = max_entropy_column[1]

    print(f"Using {target_field} column as target field.")
    assert target_field in df.columns
    valid_column_names.add(target_field)
    valid_column_names = list(valid_column_names)

    countdf = df.select(valid_column_names).groupby(valid_column_names).count().cache()

    print("Calculating segments...")
    while segments_used < max_segments:
        valid_column_names = {col for col in valid_column_names if (col not in segments and n_uniques[col] * segments_used <= (max_segments - segments_used))}
        _, segment_column_name = _find_best_split(
            countdf, current_split_columns, list(valid_column_names), target_column_name=target_field, normalization=n_uniques
        )

        if not segment_column_name:
            break

        segments.append(segment_column_name)
        current_split_columns.append(segment_column_name)
        segments_used *= n_uniques[segment_column_name]

    return segments

Example #5

Show file

File: Rank_analysis_helperfunction.py Project: YiranJing/Lagardere_CommercialAnalysis

def test_calculate_material_change(
        merge_data: pyspark.sql.dataframe.DataFrame,
        last_week_sale: pyspark.sql.dataframe.DataFrame):
    assert merge_data.count() == last_week_sale.count(), 'we want ' + str(last_week_sale.count()) + \
    ". But we get "+str(merge_data.count())# test joined dataset