Beispiel #1
0
def concat(dfs, like="columns"):
    """
    Concatenate multiple dataframes side by side (columns) or stacked (rows).

    :param dfs: Iterable of dataframes to combine, in the desired order
    :param like: "columns" pairs the rows of every dataframe by position and
        places their columns side by side; "rows" stacks the dataframes with
        ``DataFrame.union``
    :return: The combined dataframe
    """
    if like == "columns":
        # Local import: Window is only needed by this branch.
        from pyspark.sql import Window

        # Temporary join key; the random suffix is there to avoid clashing
        # with a real column (random_int() is concatenated as a string).
        col_temp_name = "id_" + random_int()

        # monotonically_increasing_id() embeds the partition id in its value,
        # so dataframes with different partitioning get different ids for the
        # same row position and a join on the raw ids pairs the wrong rows.
        # row_number() over that ordering gives every dataframe a dense
        # 1..N index that lines up regardless of partitioning.
        # NOTE: a window without partitionBy moves all rows to one partition.
        positional = Window.orderBy(F.monotonically_increasing_id())
        temp_dfs = [
            df.withColumn(col_temp_name, F.row_number().over(positional))
            for df in dfs
        ]

        def _append_df(df1, df2):
            # Outer join keeps the extra rows when the lengths differ.
            return df1.join(df2, col_temp_name, "outer")

        df_result = reduce(_append_df, temp_dfs).drop(col_temp_name)

    elif like == "rows":
        df_result = reduce(DataFrame.union, dfs)
    else:
        # Presumably raises for an unsupported value -- confirm in RaiseIt.
        RaiseIt.value_error(like, ["columns", "rows"])

    return df_result
Beispiel #2
0
def append(dfs, like="columns"):
    """
    Concatenate multiple dataframes side by side (columns) or stacked (rows).

    :param dfs: Iterable of dataframes to combine, in the desired order
    :param like: "columns" pairs the rows of every dataframe by position and
        places their columns side by side; "rows" stacks the dataframes with
        ``DataFrame.union``
    :return: The combined dataframe
    """

    if like == "columns":
        # Local import: Window is only needed by this branch.
        from pyspark.sql import Window

        # Temporary join key; the random suffix is there to avoid clashing
        # with a real column (random_int() is concatenated as a string).
        col_temp_name = "id_" + random_int()

        # monotonically_increasing_id() can produce a different sequence for
        # each dataframe (the partition id is part of the value), so joining
        # on it directly could pair the wrong rows. Turning it into a dense
        # row_number() gives each dataframe the same 1..N positional index.
        # NOTE: a window without partitionBy moves all rows to one partition.
        by_position = Window.orderBy(F.monotonically_increasing_id())
        temp_dfs = [
            df.withColumn(col_temp_name, F.row_number().over(by_position))
            for df in dfs
        ]

        def _append(df1, df2):
            # Outer join keeps the extra rows when the lengths differ.
            return df1.join(df2, col_temp_name, "outer")

        df_result = reduce(_append, temp_dfs).drop(col_temp_name)

    elif like == "rows":
        df_result = reduce(DataFrame.union, dfs)
    else:
        # Presumably raises for an unsupported value -- confirm in RaiseIt.
        RaiseIt.value_error(like, ["columns", "rows"])

    return df_result
Beispiel #3
0
def append(dfs, like="columns"):
    """
    Concatenate multiple dataFrames column-wise or row-wise.

    :param dfs: List of DataFrames
    :param like: "columns" pairs the rows of every dataFrame by position and
        places their columns side by side; "rows" stacks the dataFrames with
        ``DataFrame.union``
    :return: The combined dataFrame
    """

    if like == "columns":
        # Imported once per call instead of once per loop iteration.
        from pyspark.sql import Window
        from pyspark.sql import functions as F

        # Temporary join key; the random suffix is there to avoid clashing
        # with a real column (random_int() is concatenated as a string).
        col_temp_name = "id_" + random_int()

        dfs = val_to_list(dfs)

        # monotonically_increasing_id() can produce a different sequence for
        # each dataFrame (the partition id is part of the value), so joining
        # on it directly could pair the wrong rows. Turning it into a dense
        # row_number() gives each dataFrame the same 1..N positional index.
        # NOTE: a window without partitionBy moves all rows to one partition.
        by_position = Window.orderBy(F.monotonically_increasing_id())
        temp_dfs = [
            df.withColumn(col_temp_name, F.row_number().over(by_position))
            for df in dfs
        ]

        def _append(df1, df2):
            # Outer join keeps the extra rows when the lengths differ.
            return df1.join(df2, col_temp_name, "outer")

        df_result = reduce(_append, temp_dfs).drop(col_temp_name)

    elif like == "rows":
        from pyspark.sql import DataFrame
        # NOTE(review): unlike the columns branch, dfs is not passed through
        # val_to_list here, so it must already be an iterable of dataFrames.
        df_result = reduce(DataFrame.union, dfs)
    else:
        # Presumably raises for an unsupported value -- confirm in RaiseIt.
        RaiseIt.value_error(like, ["columns", "rows"])

    return df_result
Beispiel #4
0
def sample_n(self, n=10, random=False):
    """
    Return a sample of roughly n rows from a dataFrame.

    The sample is drawn by fraction, so the number of rows returned is
    approximate rather than exactly n.

    :param self: dataFrame to sample from
    :param n: Number of samples
    :param random: if True use a random seed (semi random sample); if False
        use the fixed seed 0 so the sample is repeatable
    :return: The sampled dataFrame
    :raises ValueError: if random is neither True nor False
    """
    if random is True:
        seed = random_int()
    elif random is False:
        seed = 0
    else:
        # Previously fell through and failed later on an unbound `seed`.
        raise ValueError(
            "random must be True or False, got {!r}".format(random))

    rows_count = self.count()
    if rows_count == 0 or n >= rows_count:
        # Avoids dividing by zero on an empty dataFrame and keeps the
        # fraction within [0, 1], as sampling without replacement requires.
        fraction = 1.0
    else:
        fraction = n / rows_count

    return self.sample(False, fraction, seed=seed)
Beispiel #5
0
def sample_n(self, n=10, random=False):
    """
    Draw a sample of about n rows from a dataFrame.

    :param self: dataFrame to sample from
    :param n: Number of samples
    :param random: True picks a random seed (semi random sample), False
        uses the fixed seed 0
    :return: The sampled dataFrame
    """
    if random is False:
        seed = 0
    elif random is True:
        seed = random_int()
    else:
        RaiseIt.value_error(random, ["True", "False"])

    total_rows = self.count()
    # Asking for at least as many rows as exist means taking everything.
    sampling_fraction = n / total_rows if n < total_rows else 1.0

    return self.sample(False, sampling_fraction, seed=seed)
Beispiel #6
0
def sample_n(self, n=10, random=False):
    """
    Return at most n sampled rows from a dataFrame.

    :param self: dataFrame to sample from
    :param n: Number of samples
    :param random: if True use a random seed (semi random sample); if False
        use the fixed seed 0
    :return: The sampled dataFrame, limited to n rows
    """
    if random is True:
        seed = random_int()
    elif random is False:
        seed = 0
    else:
        RaiseIt.value_error(random, ["True", "False"])

    rows_count = self.count()
    if n < rows_count:
        # Sampling by fraction can return fewer rows than expected, so the
        # fraction is oversampled by 10% and the result trimmed with limit().
        # The cap keeps it within [0, 1] when n is close to rows_count,
        # as sampling without replacement requires.
        fraction = min(1.0, (n / rows_count) * 1.1)
    else:
        fraction = 1.0

    return self.sample(False, fraction, seed=seed).limit(n)