def concat(dfs, like="columns"): """ Concat multiple dataframes as columns or rows :param dfs: :param like: The way dataframes is going to be concat. like columns or rows :return: """ # Add increasing Ids, and they should be the same. if like == "columns": temp_dfs = [] col_temp_name = "id_" + random_int() for df in dfs: temp_dfs.append( df.withColumn(col_temp_name, F.monotonically_increasing_id())) def _append_df(df1, df2): return df1.join(df2, col_temp_name, "outer") df_result = reduce(_append_df, temp_dfs).drop(col_temp_name) elif like == "rows": df_result = reduce(DataFrame.union, dfs) else: RaiseIt.value_error(like, ["columns", "rows"]) return df_result
def append(dfs, like="columns"): """ Concat multiple dataframes as columns or rows :param dfs: :param like: The way dataframes is going to be concat. like columns or rows :return: """ # FIX: Because monotonically_increasing_id can create different # sequence for different dataframes the result could be wrong. if like == "columns": temp_dfs = [] col_temp_name = "id_" + random_int() for df in dfs: temp_dfs.append( df.withColumn(col_temp_name, F.monotonically_increasing_id())) def _append(df1, df2): return df1.join(df2, col_temp_name, "outer") df_result = reduce(_append, temp_dfs).drop(col_temp_name) elif like == "rows": df_result = reduce(DataFrame.union, dfs) else: RaiseIt.value_error(like, ["columns", "rows"]) return df_result
def append(dfs, like="columns"): """ Concat multiple dataFrames columns or rows wise :param dfs: List of DataFrames :param like: concat as columns or rows :return: """ # FIX: Because monotonically_increasing_id can create different # sequence for different dataframes the result could be wrong. if like == "columns": temp_dfs = [] col_temp_name = "id_" + random_int() dfs = val_to_list(dfs) for df in dfs: from pyspark.sql import functions as F temp_dfs.append( df.withColumn(col_temp_name, F.monotonically_increasing_id())) def _append(df1, df2): return df1.join(df2, col_temp_name, "outer") df_result = reduce(_append, temp_dfs).drop(col_temp_name) elif like == "rows": from pyspark.sql import DataFrame df_result = reduce(DataFrame.union, dfs) else: RaiseIt.value_error(like, ["columns", "rows"]) return df_result
def sample_n(self, n=10, random=False):
    """
    Return a sample of n rows from a DataFrame
    :param self:
    :param n: Number of samples
    :param random: if true get a semi random sample
    :return:
    """
    if random is True:
        seed = random_int()
    elif random is False:
        seed = 0

    rows_count = self.count()
    fraction = n / rows_count
    return self.sample(False, fraction, seed=seed)
def sample_n(self, n=10, random=False):
    """
    Return a sample of n rows from a DataFrame
    :param self:
    :param n: Number of samples
    :param random: if true get a semi random sample
    :return:
    """
    if random is True:
        seed = random_int()
    elif random is False:
        seed = 0
    else:
        RaiseIt.value_error(random, ["True", "False"])

    rows_count = self.count()
    if n < rows_count:
        fraction = n / rows_count
    else:
        fraction = 1.0

    return self.sample(False, fraction, seed=seed)
def sample_n(self, n=10, random=False):
    """
    Return a sample of n rows from a DataFrame
    :param self:
    :param n: Number of samples
    :param random: if true get a semi random sample
    :return:
    """
    if random is True:
        seed = random_int()
    elif random is False:
        seed = 0
    else:
        RaiseIt.value_error(random, ["True", "False"])

    rows_count = self.count()
    if n < rows_count:
        # n / rows_count can return a fraction that yields fewer rows than expected,
        # so multiply by 1.1 to oversample slightly and trim the excess with limit(n).
        fraction = (n / rows_count) * 1.1
    else:
        fraction = 1.0

    return self.sample(False, fraction, seed=seed).limit(n)
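# Usage sketch (illustrative only, not part of the source above): sample_n takes the
# DataFrame as its first argument (self), so it can be called as a plain function here;
# with random=False the seed path avoids the random_int helper entirely.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.range(0, 1000)               # 1000-row DataFrame with a single `id` column

sampled = sample_n(df, n=10, random=False)
print(sampled.count())                  # at most 10 rows, thanks to .limit(n)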