Beispiel #1
0
    def repartitionDF(self, df: DataFrame, partitions: int = 0):
        '''
            Repartition the input dataframe.

            params: df          -> dataframe
                    partitions  -> new partition count. Defaulted to 0 i.e. don't repartition

            logic:
                if partitions == 0 , don't repartition (also when the current count already matches)
                if partitions == -1, repartition to the default number (NumOfExecutors * ExecutorCores * 2)
                if partitions > 0 , repartition/coalesce to the input number
        '''
        # BUG FIX: getNumPartitions is a method and must be CALLED; without the
        # parentheses curParts is a bound method and min()/comparisons below
        # raise TypeError at runtime.
        curParts = df.rdd.getNumPartitions()
        finalParts = min(curParts, partitions)

        if curParts == partitions or partitions == 0:
            finalParts = -1  # sentinel: leave the dataframe untouched
        elif partitions == -1:
            finalParts = self.__dfltRDDParts
        elif partitions > 0:
            finalParts = partitions
        else:
            pass  # finalParts is pre-populated.

        self.log("Current Partitions: %d , Requested: %d,  Final: %d " %
                 (curParts, partitions, finalParts))

        # BUG FIX: the sentinel check was inverted ("!= -1" returned the
        # unmodified df exactly when a repartition WAS requested, and
        # repartitioned when -1 said "do nothing").
        if finalParts == -1:
            return df
        elif curParts > finalParts:
            # Shrinking the partition count: coalesce avoids a full shuffle.
            return df.coalesce(finalParts)
        else:
            return df.repartition(finalParts)
Beispiel #2
0
def aggregate_dataset_by_year(joined_df: DataFrame) -> DataFrame:
    """Pivot per-year totals into one COUNT_<year> column per year.

    Groups by 'PHYSICALID', pivots on 'year' for 2015-2019 summing
    'total_cnt', null-fills each resulting count with 0, renames each year
    column to 'COUNT_<year>', and sorts the result by 'PHYSICALID'.

    Args:
        joined_df: DataFrame with 'PHYSICALID', 'year' and 'total_cnt' columns.

    Returns:
        DataFrame with one row per PHYSICALID and columns
        COUNT_2015 .. COUNT_2019, sorted by PHYSICALID.
    """
    years = [2015, 2016, 2017, 2018, 2019]
    df = (joined_df.repartition(5, "year")
          .groupBy("PHYSICALID")
          .pivot("year", years)
          .sum("total_cnt"))
    # The identical null-fill + rename treatment was copy-pasted once per
    # year; apply it in a loop instead (same operation order as before).
    for year in years:
        col = str(year)
        df = (df.withColumn(
                  col,
                  F.when(F.col(col).isNull(), 0).otherwise(F.col(col)))
                .withColumnRenamed(col, "COUNT_" + col))
    return df.sort("PHYSICALID")
Beispiel #3
0
def repartition_df(
    dataframe: DataFrame,
    partition_by: List[str],
    num_partitions: int = None,
    num_processors: int = None,
):
    """Repartition *dataframe* on the given partition columns.

    Args:
        dataframe: Spark DataFrame.
        partition_by: list of partitions.
        num_partitions: number of partitions.
        num_processors: number of processors.

    Returns:
        Partitioned dataframe.

    """
    # Resolve the effective partition count from the two optional knobs.
    resolved_partitions = _num_partitions_definition(num_processors, num_partitions)
    return dataframe.repartition(resolved_partitions, *partition_by)