def convertDfToList(df):
    """Collect a DataFrame into a list of row dictionaries."""
    # collect() brings every row to the driver in one job
    # (instead of a count() followed by take()).
    rows = df.collect()
    myList = []
    for row in rows:
        dic = row.asDict()
        myList.append(dic)
    return myList
def test_groupedData(self):
    from pyspark.sql import DataFrame
    from pyspark.sql.functions import sum, pandas_udf, PandasUDFType
    from ts.flint import TimeSeriesGroupedData

    price = self.price()

    assert(type(price.groupBy('time')) is TimeSeriesGroupedData)
    assert(type(price.groupby('time')) is TimeSeriesGroupedData)

    result1 = price.groupBy('time').agg(sum(price['price'])).sort('time').toPandas()
    expected1 = DataFrame.groupBy(price, 'time').agg(sum(price['price'])).sort('time').toPandas()
    assert_same(result1, expected1)

    result2 = price.groupBy('time').pivot('id').sum('price').toPandas()
    expected2 = DataFrame.groupBy(price, 'time').pivot('id').sum('price').toPandas()
    assert_same(result2, expected2)

    @pandas_udf(price.schema, PandasUDFType.GROUPED_MAP)
    def foo(df):
        return df

    result3 = price.groupby('time').apply(foo).toPandas()
    expected3 = DataFrame.groupBy(price, 'time').apply(foo).toPandas()
    assert_same(result3, expected3)

    result4 = price.groupby('time').count().toPandas()
    expected4 = DataFrame.groupBy(price, 'time').count().toPandas()
    assert_same(result4, expected4)

    result5 = price.groupby('time').mean('price').toPandas()
    expected5 = DataFrame.groupBy(price, 'time').mean('price').toPandas()
    assert_same(result5, expected5)
def _unpack_row(jdf):
    df = DataFrame(jdf, _sql)
    row = df.first()
    l = list(row)
    if len(l) == 1:
        return l[0]
    return l
def predict(self, X):
    if isinstance(X, SUPPORTED_TYPES):
        if self.transferUsingDF:
            pdfX = convertToPandasDF(X)
            df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features')
            retjDF = self.model.transform(df._jdf)
            retDF = DataFrame(retjDF, self.sqlCtx)
            retPDF = retDF.sort('ID').select('prediction').toPandas()
            if isinstance(X, np.ndarray):
                return retPDF.as_matrix().flatten()
            else:
                return retPDF
        else:
            retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
            if isinstance(X, np.ndarray):
                return retNumPy
            else:
                return retNumPy  # TODO: Convert to Pandas
    elif hasattr(X, '_jdf'):
        if 'features' in X.columns:
            # No need to assemble as input DF is likely coming via MLPipeline
            df = X
        else:
            assembler = VectorAssembler(inputCols=X.columns, outputCol='features')
            df = assembler.transform(X)
        retjDF = self.model.transform(df._jdf)
        retDF = DataFrame(retjDF, self.sqlCtx)
        # Return DF
        return retDF.sort('ID')
    else:
        raise Exception('Unsupported input type')
def _unpack_row(jdf, fetches):
    df = DataFrame(jdf, _sql)
    row = df.first()

    def f(fetch):
        name = fetch.name.replace(":0", "")
        x = row[name]
        ndims = fetch.get_shape().ndims
        if ndims > 0:
            return np.array(x)
        else:
            return x

    l = [f(fetch) for fetch in fetches]
    if len(l) == 1:
        return l[0]
    return l
def toPython(entity):
    from py4j.java_gateway import JavaObject
    if entity is None or not isinstance(entity, JavaObject):
        return entity
    clazz = entity.getClass().getName()
    if clazz == "org.apache.spark.sql.Dataset":
        entity = entity.toDF()
        clazz = "org.apache.spark.sql.DataFrame"
    if clazz == "org.apache.spark.sql.DataFrame":
        from pyspark.sql import DataFrame, SQLContext
        from pyspark import SparkContext
        entity = DataFrame(entity, SQLContext(SparkContext.getOrCreate(), entity.sqlContext()))
    return entity
def predict(self, X): """ Invokes the transform method on Estimator object on JVM if X and y are on of the supported data types Parameters ---------- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame """ try: if self.estimator is not None and self.model is not None: self.estimator.copyProperties(self.model) except AttributeError: pass if isinstance(X, SUPPORTED_TYPES): if self.transferUsingDF: pdfX = convertToPandasDF(X) df = assemble(self.sparkSession, pdfX, pdfX.columns, self.features_col).select(self.features_col) retjDF = self.model.transform(df._jdf) retDF = DataFrame(retjDF, self.sparkSession) retPDF = retDF.sort('__INDEX').select('prediction').toPandas() if isinstance(X, np.ndarray): return self.decode(retPDF.as_matrix().flatten()) else: return self.decode(retPDF) else: try: retNumPy = self.decode(convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))) except Py4JError: traceback.print_exc() if isinstance(X, np.ndarray): return retNumPy else: return retNumPy # TODO: Convert to Pandas elif hasattr(X, '_jdf'): if self.features_col in X.columns: # No need to assemble as input DF is likely coming via MLPipeline df = X else: assembler = VectorAssembler(inputCols=X.columns, outputCol=self.features_col) df = assembler.transform(X) retjDF = self.model.transform(df._jdf) retDF = DataFrame(retjDF, self.sparkSession) # Return DF return retDF.sort('__INDEX') else: raise Exception('Unsupported input type')
def _convertJavaOutputToPythonObject(self, X, output):
    """
    Converts a Java-side object (either a MatrixBlock or a Java DataFrame) to a
    Python object, based on the type of X.

    Parameters
    ----------
    X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
    output: a java-side object (either MatrixBlock or Java DataFrame)
    """
    if isinstance(X, SUPPORTED_TYPES) and self.transferUsingDF:
        retDF = DataFrame(output, self.sparkSession)
        retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
        return retPDF.as_matrix().flatten() if isinstance(X, np.ndarray) else retPDF
    elif isinstance(X, SUPPORTED_TYPES):
        return convertToNumPyArr(self.sc, output)
    elif hasattr(X, '_jdf'):
        retDF = DataFrame(output, self.sparkSession)
        # Return DF
        return retDF.sort('__INDEX')
    else:
        raise Exception('Unsupported input type')
def assert_equal(expected: DataFrame, actual: DataFrame, verbose: bool = True,
                 ignore_schema_metadata: bool = True):
    """Assert that two DataFrames contain the same data.

    :param expected: The expected DataFrame
    :param actual: The actual DataFrame
    :param verbose: If the DataFrames are not equal, show the DataFrame schema
        or data (depending on where the mismatch is) in order to help debugging.
    :param ignore_schema_metadata: When comparing the schemas, ignore the
        metadata, which can include comments.
    :return: None
    """
    if expected is None:
        assert actual is None, \
            'The expected DataFrame is None, but the actual DataFrame is not.'
        return  # Both DataFrames are None.
    else:
        assert actual is not None, \
            'The actual DataFrame is None, but the expected DataFrame is not.'

    expected.persist()
    actual.persist()

    expected_schema = _copy_without_metadata(expected.schema) if \
        ignore_schema_metadata else expected.schema
    actual_schema = _copy_without_metadata(actual.schema) if \
        ignore_schema_metadata else actual.schema

    is_schema_match = expected_schema == actual_schema
    if not is_schema_match:
        if verbose:
            # Print the schema to help identify subtle cases such as one DF
            # containing an INT and the other containing a BIGINT, which can be
            # an issue if we try to write a DF containing a BIGINT into a
            # previously existing Hive table defined to contain an INT.
            print('Expected DataFrame schema:')
            expected.printSchema()
            print('Actual DataFrame schema:')
            actual.printSchema()
        assert is_schema_match is True, 'The DataFrame schemas differ.'

    is_match = (0 == (expected.count() - actual.count())
                == expected.subtract(actual).count()
                == actual.subtract(expected).count())
    if not is_match:
        if verbose:
            print('Expected DataFrame:')
            expected.show(20, False)
            print('Actual DataFrame:')
            actual.show(20, False)
        # Possible future enhancement: Make the assertion failure message more
        # helpful.
        assert is_match is True, 'The DataFrames differ.'
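# A minimal usage sketch for assert_equal, assuming an active SparkSession
# named `spark`; the two tiny DataFrames are hypothetical and exist only to
# illustrate the call. Row order does not matter, since the comparison is
# based on row counts and set subtraction.
expected_df = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'letter'])
actual_df = spark.createDataFrame([(2, 'b'), (1, 'a')], ['id', 'letter'])
assert_equal(expected_df, actual_df)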
# COMMAND ----------

dt1 = joe_pos_neg_only.groupBy(F.col('state')).agg(F.count('state').alias('joe_total'))
dt2 = joe_pos_only.groupBy(F.col('state')).agg(F.count('state').alias('joe_pos'))
dt3 = joe_neg_only.groupBy(F.col('state')).agg(F.count('state').alias('joe_neg'))
dt4 = trump_pos_neg_only.groupBy(F.col('state')).agg(F.count('state').alias('trump_total'))
dt5 = trump_pos_only.groupBy(F.col('state')).agg(F.count('state').alias('trump_pos'))
dt6 = trump_neg_only.groupBy(F.col('state')).agg(F.count('state').alias('trump_neg'))

# COMMAND ----------

dfs = [dt1, dt2, dt3, dt4, dt5, dt6]
# df_final = reduce(lambda left, right: left.join(right, ["date_only", "state"]), dfs)
df_final = reduce(lambda left, right: DataFrame.join(left, right, on='state'), dfs)
df_final = df_final.sort(F.col('joe_total').asc())
df_final.write.mode('overwrite').parquet("/FileStore/tables/post_analysis3to8/")
df_final.show()  # show() prints the rows itself and returns None

# COMMAND ----------

df = spark.read.parquet("/FileStore/tables/post_analysis3to8/")
df.show(2000)

# COMMAND ----------

df.repartition(1).write.csv(path="/FileStore/countposneg.csv", mode="overwrite", header="true")

# COMMAND ----------
def createDF(self, schema, data=None):
    return DataFrame(self.j_smvPyClient.dfFrom(schema, data), self.sqlContext)
def runModule(self, urn, forceRun=False):
    """Runs either a Scala or a Python SmvModule by its fully qualified name (FQN)."""
    jdf = self.j_smvPyClient.runModule(urn, forceRun)
    return DataFrame(jdf, self.sqlContext)
def get_metadata_of_column(df: DataFrame, column: str) -> Any:
    return df.select(column).schema[0].metadata
def spark_is_data_frame_empty(df: DataFrame) -> bool:
    """
    Efficient way to check if the data frame is empty without getting the
    count of the whole data frame.
    """
    # from: https://stackoverflow.com/questions/32707620/how-to-check-if-spark-dataframe-is-empty
    return not bool(df.head(1))
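# Usage sketch with a hypothetical DataFrame `df` and placeholder `output_path`:
# df.head(1) fetches at most one row, so this guard avoids the full scan that a
# `df.count() == 0` check would trigger.
if not spark_is_data_frame_empty(df):
    df.write.mode('append').parquet(output_path)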
def extract_contact_form(raw_sharing_df: DataFrame) -> DataFrame:
    contact_form_df = raw_sharing_df.select("form_url").where(
        "form_url is not null")
    contact_form_df = contact_form_df.drop_duplicates()
    return contact_form_df
def assert_data_frames_equal(left: DataFrame, right: DataFrame) -> None:
    testing.assert_frame_equal(
        left.orderBy(left.columns).toPandas(),  # type: ignore
        right.orderBy(right.columns).toPandas(),  # type: ignore
    )
def TableThree(_, table_one: SparkDF, table_two: SparkDF) -> SparkDF:
    return table_one.union(table_two)
def sparkle_df(df: DataFrame) -> SparkleDataFrame:
    df.__class__ = type('SparkleDataFrame', (SparkleDataFrame, ), {})
    # noinspection PyTypeChecker
    return df
def increase_sal(df: DataFrame) -> DataFrame:
    df_new = df.withColumn("sal", add_udf(col("sal").cast(IntegerType())))
    return df_new
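# `add_udf` is not defined in this snippet. A minimal sketch of one possible
# definition, assuming the intent is to add a fixed raise to the salary column;
# the amount (100) is purely illustrative.
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

@udf(returnType=IntegerType())
def add_udf(sal):
    return sal + 100 if sal is not None else None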
def df(self):
    if self.__df is None:
        self.__df = DataFrame(self._jCypherResult.df(), self._sqlContext)
    return self.__df
def read_pandas_df_to_spark(_, df: SparkDF):
    assert isinstance(df, SparkDF)
    assert df.count() == 2
    assert set(df.columns) == {"foo", "quux"}
def __call__(self, files: DataFrame) -> DataFrame:
    # if UAST is not extracted, returns an empty list that we filter out here
    return files.extract_uasts().where(
        functions.size(functions.col("uast")) > 0)
def filter_failure(self, data_frame: DataFrame) -> DataFrame:
    return data_frame.filter(f"{self.constraint_column_name} > 1")
def update_or_append(self, df: DataFrame, table: str, id_col: str):
    ids_to_delete = [
        el[0] for el in df.select(id_col).distinct().collect()
    ]
    self.delete_in(table, id_col, ids_to_delete)
    self.store(df, table)
def create_temp_view(dataframe: DataFrame, name):
    dataframe.createOrReplaceTempView(name)
def get_columns_expected_order(provider_group_df: DataFrame) -> DataFrame:
    return provider_group_df.select("id", "form_url")
def load(self, df: DataFrame, date: datetime):
    self.report_cls(date=date, company=self.company).save(df=df.toPandas())
def add_metadata_to_column(df: DataFrame, column: str, metadata: Any) -> DataFrame:
    return df.withColumn(column, df[column].alias(column, metadata=metadata))
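# Usage sketch tying add_metadata_to_column together with get_metadata_of_column
# above. The DataFrame and the metadata payload are hypothetical, and an active
# SparkSession named `spark` is assumed.
users_df = spark.createDataFrame([(1, 'alice')], ['id', 'name'])
users_df = add_metadata_to_column(users_df, 'name', {'comment': 'display name'})
assert get_metadata_of_column(users_df, 'name') == {'comment': 'display name'}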
def toDF(self):
    """
    Get a DataFrame representation of this Delta table.
    """
    return DataFrame(self._jdt.toDF(), self._spark._wrapped)
def subset_dataset_by_label(sc, dataframe: DataFrame, label_col, *args):
    if args:
        return dataframe.filter(F.col(label_col).isin(list(args)))
    else:
        return dataframe
def transform(self, dataset, params={}):
    java_obj = self._java_obj()
    self._transfer_params_to_java({}, java_obj)
    java_param_map = self._create_java_param_map(params, java_obj)
    return DataFrame(java_obj.transform(dataset._jdf, java_param_map),
                     dataset.sql_ctx)
def runModuleByName(self, name, forceRun=False):
    jdf = self.j_smvApp.runModuleByName(name, forceRun)
    return DataFrame(jdf, self.sqlContext)
def sparse_dot_product_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_vector_col: str,
    primary_df: DataFrame,
    secondary_row_number_col: str,
    secondary_vector_col: str,
    secondary_df: DataFrame,
):
    """Calculate the dot product for every pair of items between a column of
    SparseVectors in the primary dataframe and a column of SparseVectors in the
    secondary dataframe.

    The input dataframes must have a row number attached. This will correspond
    to the row number in the resulting row matrix. It does not matter if the row
    numbers are sequential as long as they are unique within their dataframes
    respectively.

    NOTE: if you are using this function in order to generate cosine similarity
    scores then remember to normalize your input vectors first. This way the
    resulting coordinate matrix will represent the similarity scores."""

    def primary_row_to_coords(row):
        """Convert a sparse vector to a list of coords in the format of
        (row_num, col_num, value)"""
        row_num = row.__getitem__(primary_row_number_col)
        vec = row.__getitem__(primary_vector_col)
        return [(row_num, i, j) for i, j in zip(vec.indices, vec.values)]

    primary_rdd = primary_df.select(
        F.col(primary_row_number_col), F.col(primary_vector_col)
    ).rdd.flatMap(lambda row: primary_row_to_coords(row))

    if primary_rdd.isEmpty():
        raise ValueError(
            "Primary RDD is empty. Cannot perform matrix multiplication")

    primary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    def secondary_row_to_coords(row):
        """Convert a sparse vector to a list of coords in the format of
        (row_num, col_num, value)"""
        row_num = row.__getitem__(secondary_row_number_col)
        vec = row.__getitem__(secondary_vector_col)
        # IMPORTANT - note that we are actually creating
        # the transpose of the secondary matrix hence
        # why the coordinates are back to front
        return [(i, row_num, j) for i, j in zip(vec.indices, vec.values)]

    secondary_rdd = secondary_df.select(
        F.col(secondary_row_number_col), F.col(secondary_vector_col)
    ).rdd.flatMap(lambda row: secondary_row_to_coords(row))

    secondary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    if secondary_rdd.isEmpty():
        raise ValueError(
            "Secondary RDD is empty. Cannot perform matrix multiplication")

    # create the primary coordinate matrix from the coords
    primary_matrix = CoordinateMatrix(primary_rdd)
    log.info(
        "finished creating primary coordinate matrix",
        rows=primary_matrix.numRows(),
        cols=primary_matrix.numCols(),
    )

    # create the secondary coordinate matrix from the coords
    secondary_matrix = CoordinateMatrix(secondary_rdd)
    log.info(
        "finished creating secondary coordinate matrix transpose",
        rows=secondary_matrix.numRows(),
        cols=secondary_matrix.numCols(),
    )

    coords_matrix = multiply_coordinate_matrices(primary_matrix,
                                                 secondary_matrix)

    res = coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )

    primary_rdd.unpersist()
    secondary_rdd.unpersist()

    return res
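# Usage sketch for cosine similarity; every DataFrame and column name below is
# hypothetical. The vectors are L2-normalized first, as the NOTE in the
# docstring requires, so the resulting dot products are cosine similarity
# scores, and each side gets a unique row number to index the coordinate matrix.
from pyspark.ml.feature import Normalizer
from pyspark.sql import functions as F

normalizer = Normalizer(inputCol="features", outputCol="norm_features", p=2.0)

primary = normalizer.transform(primary_raw_df).withColumn(
    "row_num", F.monotonically_increasing_id())
secondary = normalizer.transform(secondary_raw_df).withColumn(
    "row_num", F.monotonically_increasing_id())

similarities = sparse_dot_product_cross_join(
    spark,
    output_col="similarity",
    primary_row_number_col="row_num",
    primary_vector_col="norm_features",
    primary_df=primary,
    secondary_row_number_col="row_num",
    secondary_vector_col="norm_features",
    secondary_df=secondary,
)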
def __init__(self, jdf, spark_session):
    DataFrame.__init__(self, jdf, spark_session)
    self._jrfctx = spark_session.rasterframes._jrfctx
def filter_success(self, data_frame: DataFrame) -> DataFrame:
    return data_frame.filter(f"{self.constraint_column_name} == 1")
def split_train_test(data: DataFrame, train_ratio: float = 0.7, test_ratio: float = 0.3):
    return data.randomSplit([train_ratio, test_ratio], seed=42)
def pivot(
    dataframe: DataFrame,
    group_by_columns: non_blank(List[str]),
    pivot_column: non_blank(str),
    agg_column: non_blank(str),
    aggregation: non_null(Callable),
    mock_value: non_null(object) = None,
    mock_type: non_null(object) = None,
    with_forward_fill: non_null(bool) = False,
):
    """Defines a pivot transformation.

    Attributes:
        dataframe: dataframe to be pivoted.
        group_by_columns: list of columns' names to be grouped.
        pivot_column: column to be pivoted.
        agg_column: column to be aggregated by pivoted category.
        aggregation: desired spark aggregation function to be performed.
            An example: spark_agg(col_name). See docs for all spark_agg:
            https://spark.apache.org/docs/2.3.1/api/python/_modules/pyspark/sql/functions.html
        mock_value: value used to make a difference between true nulls resulting
            from the aggregation and empty values from the pivot transformation.
        mock_type: mock_value data type (compatible with spark).
        with_forward_fill: applies a forward fill to null values after the pivot
            operation.

    Example:
        >>> dataframe.orderBy("ts", "id", "amenity").show()
        +---+---+-------+-----+
        | id| ts|amenity|  has|
        +---+---+-------+-----+
        |  1|  1| fridge|false|
        |  1|  1|   oven| true|
        |  1|  1|   pool|false|
        |  2|  2|balcony|false|
        |  1|  3|balcony| null|
        |  1|  4|   oven| null|
        |  1|  4|   pool| true|
        |  1|  5|balcony| true|
        +---+---+-------+-----+
        >>> pivoted = pivot(dataframe, ["id", "ts"], "amenity", "has", functions.first)
        >>> pivoted.orderBy("ts", "id").show()
        +---+---+-------+------+----+-----+
        | id| ts|balcony|fridge|oven| pool|
        +---+---+-------+------+----+-----+
        |  1|  1|   null| false|true|false|
        |  2|  2|  false|  null|null| null|
        |  1|  3|   null|  null|null| null|
        |  1|  4|   null|  null|null| true|
        |  1|  5|   true|  null|null| null|
        +---+---+-------+------+----+-----+

        But, sometimes, you would like to keep the last values that some feature
        has assumed from previous modifications. In this example, amenity "oven"
        for id=1 was set to null and "pool" was set to true at ts=4. All other
        amenities should then be kept to their actual state at that ts. To do
        that, we will use a technique called forward fill:

        >>> pivoted = pivot(
        ...     dataframe,
        ...     ["id", "ts"],
        ...     "amenity",
        ...     "has",
        ...     functions.first,
        ...     with_forward_fill=True
        ... )
        >>> pivoted.orderBy("ts", "id").show()
        +---+---+-------+------+----+-----+
        | id| ts|balcony|fridge|oven| pool|
        +---+---+-------+------+----+-----+
        |  1|  1|   null| false|true|false|
        |  2|  2|  false|  null|null| null|
        |  1|  3|   null| false|true|false|
        |  1|  4|   null| false|true| true|
        |  1|  5|   true| false|true| true|
        +---+---+-------+------+----+-----+

        Great! Now every amenity that hasn't been changed kept its state. BUT,
        the forced change to null for amenity "oven" on id=1 at ts=4 was ignored
        during forward fill. If the user wants to respect this change, they must
        provide a mock value and type to be used as a signal for "true nulls".
        In other words, we want to forward fill only nulls that were created by
        the pivot transformation.

        In this example, amenities only assume boolean values, so there is no
        mock value available within a boolean: it is only true or false. Users
        can therefore give a mock value of another type (to which the column can
        be cast). Check this out:

        >>> pivoted = pivot(
        ...     dataframe,
        ...     ["id", "ts"],
        ...     "amenity",
        ...     "has",
        ...     functions.first,
        ...     with_forward_fill=True,
        ...     mock_value=-1,
        ...     mock_type="int"
        ... )
        >>> pivoted.orderBy("ts", "id").show()
        +---+---+-------+------+----+-----+
        | id| ts|balcony|fridge|oven| pool|
        +---+---+-------+------+----+-----+
        |  1|  1|   null| false|true|false|
        |  2|  2|  false|  null|null| null|
        |  1|  3|   null| false|true|false|
        |  1|  4|   null| false|null| true|
        |  1|  5|   true| false|null| true|
        +---+---+-------+------+----+-----+

        During transformation, this method will cast the agg_column to the
        mock_type data type and fill all "true nulls" with the mock_value. After
        pivot and forward fill are applied, all new pivoted columns will then
        return to the original type with all mock values replaced by null.
    """
    agg_column_type = None
    if mock_value is not None:
        if mock_type is None:
            raise AttributeError(
                "When providing a mock value, users must inform the data type,"
                " which should be supported by Spark.")
        agg_column_type = dict(dataframe.dtypes).get(agg_column)
        dataframe = dataframe.withColumn(
            agg_column, functions.col(agg_column).cast(mock_type)).fillna(
                {agg_column: mock_value})

    pivoted = (dataframe.groupBy(*group_by_columns).pivot(pivot_column).agg(
        aggregation(agg_column)))

    new_columns = [c for c in pivoted.columns if c not in group_by_columns]

    if with_forward_fill:
        for c in new_columns:
            pivoted = forward_fill(
                dataframe=pivoted,
                partition_by=group_by_columns[:-1],
                order_by=group_by_columns[-1],
                fill_column=c,
            )

    if mock_value is not None:
        for c in new_columns:
            pivoted = pivoted.withColumn(
                c,
                functions.when(
                    functions.col(c) != mock_value,
                    functions.col(c)).cast(agg_column_type),
            )

    return pivoted
def showNewsByCategory(df):
    """All news items under a given category (here u'科技', "Tech"), sorted by time."""
    temp = df.orderBy('time', ascending=1).where(df['label'] == u'科技').select("*")
    return convertDfToList(temp)
def _transform(self, dataset):
    self._transfer_params_to_java()
    return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
def showFirstNnews(df):
    """Show the first 10 news items from the data source, sorted by time."""
    temp = df.orderBy('time', ascending=1).limit(10).select("*")
    return convertDfToList(temp)
def clean(frame: DataFrame) -> DataFrame:
    # First, get the majority of columns “fixed”, i.e. their datatypes improved.
    df2 = (frame.withColumnRenamed('Code', 'Airport_Code')
           .withColumnRenamed('Description', 'Airport_Description'))
    return df2