def convertDfToList(df):
    """Collect a Spark DataFrame into a list of row dictionaries."""
    # take(count()) pulls every row to the driver; df.collect() is equivalent.
    rows = df.take(df.count())
    myList = []
    for row in rows:
        myList.append(row.asDict())
    return myList
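A hedged usage sketch for convertDfToList; the Spark session and sample data below are illustrative assumptions, not part of the original snippet:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
rows = convertDfToList(sample_df)
# e.g. [{'id': 1, 'value': 'a'}, {'id': 2, 'value': 'b'}]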
Example #2
    def test_groupedData(self):
        from pyspark.sql import DataFrame
        from pyspark.sql.functions import sum, pandas_udf, PandasUDFType
        from ts.flint import TimeSeriesGroupedData

        price = self.price()

        assert(type(price.groupBy('time')) is TimeSeriesGroupedData)
        assert(type(price.groupby('time')) is TimeSeriesGroupedData)

        result1 = price.groupBy('time').agg(sum(price['price'])).sort('time').toPandas()
        expected1 = DataFrame.groupBy(price, 'time').agg(sum(price['price'])).sort('time').toPandas()
        assert_same(result1, expected1)

        result2 = price.groupBy('time').pivot('id').sum('price').toPandas()
        expected2 = DataFrame.groupBy(price, 'time').pivot('id').sum('price').toPandas()
        assert_same(result2, expected2)

        @pandas_udf(price.schema, PandasUDFType.GROUPED_MAP)
        def foo(df):
            return df
        result3 = price.groupby('time').apply(foo).toPandas()
        expected3 = DataFrame.groupBy(price, 'time').apply(foo).toPandas()
        assert_same(result3, expected3)

        result4 = price.groupby('time').count().toPandas()
        expected4 = DataFrame.groupBy(price, 'time').count().toPandas()
        assert_same(result4, expected4)

        result5 = price.groupby('time').mean('price').toPandas()
        expected5 = DataFrame.groupBy(price, 'time').mean('price').toPandas()
        assert_same(result5, expected5)
Example #3
def _unpack_row(jdf):
    # Wrap the Java DataFrame, take the first row, and unwrap single-value results.
    df = DataFrame(jdf, _sql)
    row = df.first()
    values = list(row)
    if len(values) == 1:
        return values[0]
    return values
 def predict(self, X):
     if isinstance(X, SUPPORTED_TYPES):
         if self.transferUsingDF:
             pdfX = convertToPandasDF(X)
             df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features')
             retjDF = self.model.transform(df._jdf)
             retDF = DataFrame(retjDF, self.sqlCtx)
             retPDF = retDF.sort('ID').select('prediction').toPandas()
             if isinstance(X, np.ndarray):
                 return retPDF.as_matrix().flatten()
             else:
                 return retPDF
         else:
             retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
             if isinstance(X, np.ndarray):
                 return retNumPy
             else:
                 return retNumPy # TODO: Convert to Pandas
     elif hasattr(X, '_jdf'):
         if 'features' in X.columns:
             # No need to assemble as input DF is likely coming via MLPipeline
             df = X
         else:
             assembler = VectorAssembler(inputCols=X.columns, outputCol='features')
             df = assembler.transform(X)
         retjDF = self.model.transform(df._jdf)
         retDF = DataFrame(retjDF, self.sqlCtx)
         # Return DF
         return retDF.sort('ID')
     else:
         raise Exception('Unsupported input type')
Example #5
def _unpack_row(jdf, fetches):
    # Wrap the Java DataFrame and map each TensorFlow fetch to its column value.
    df = DataFrame(jdf, _sql)
    row = df.first()
    def f(fetch):
        name = fetch.name.replace(":0", "")
        x = row[name]
        ndims = fetch.get_shape().ndims
        if ndims > 0:
            return np.array(x)
        else:
            return x
    values = [f(fetch) for fetch in fetches]
    if len(values) == 1:
        return values[0]
    return values
Example #6
        def toPython(entity):
            from py4j.java_gateway import JavaObject
            if entity is None or not isinstance(entity, JavaObject):
                return entity

            clazz = entity.getClass().getName()
            if clazz == "org.apache.spark.sql.Dataset":
                entity = entity.toDF()
                clazz = "org.apache.spark.sql.DataFrame"

            if clazz == "org.apache.spark.sql.DataFrame":
                from pyspark.sql import DataFrame, SQLContext
                from pyspark import SparkContext
                entity = DataFrame(entity, SQLContext(SparkContext.getOrCreate(), entity.sqlContext()))

            return entity
Example #7
    def predict(self, X):
        """
        Invokes the transform method on Estimator object on JVM if X and y are on of the supported data types

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        """
        try:
            if self.estimator is not None and self.model is not None:
                self.estimator.copyProperties(self.model)
        except AttributeError:
            pass
        if isinstance(X, SUPPORTED_TYPES):
            if self.transferUsingDF:
                pdfX = convertToPandasDF(X)
                df = assemble(self.sparkSession, pdfX, pdfX.columns, self.features_col).select(self.features_col)
                retjDF = self.model.transform(df._jdf)
                retDF = DataFrame(retjDF, self.sparkSession)
                retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
                if isinstance(X, np.ndarray):
                    return self.decode(retPDF.as_matrix().flatten())
                else:
                    return self.decode(retPDF)
            else:
                try:
                    retNumPy = self.decode(convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X))))
                except Py4JError:
                    traceback.print_exc()
                if isinstance(X, np.ndarray):
                    return retNumPy
                else:
                    return retNumPy # TODO: Convert to Pandas
        elif hasattr(X, '_jdf'):
            if self.features_col in X.columns:
                # No need to assemble as input DF is likely coming via MLPipeline
                df = X
            else:
                assembler = VectorAssembler(inputCols=X.columns, outputCol=self.features_col)
                df = assembler.transform(X)
            retjDF = self.model.transform(df._jdf)
            retDF = DataFrame(retjDF, self.sparkSession)
            # Return DF
            return retDF.sort('__INDEX')
        else:
            raise Exception('Unsupported input type')
    def _convertJavaOutputToPythonObject(self, X, output):
        """
        Converts the a java-side object output (either MatrixBlock or Java DataFrame) to a python object (based on the type of X).

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        output: a java-side object (either MatrixBlock or Java DataFrame)
        """
        if isinstance(X, SUPPORTED_TYPES) and self.transferUsingDF:
            retDF = DataFrame(output, self.sparkSession)
            retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
            return retPDF.as_matrix().flatten() if isinstance(X, np.ndarray) else retPDF
        elif isinstance(X, SUPPORTED_TYPES):
            return convertToNumPyArr(self.sc, output)
        elif hasattr(X, '_jdf'):
            retDF = DataFrame(output, self.sparkSession)
            # Return DF
            return retDF.sort('__INDEX')
        else:
            raise Exception('Unsupported input type')
def assert_equal(expected: DataFrame,
                 actual: DataFrame,
                 verbose: bool = True,
                 ignore_schema_metadata: bool = True):
    """Assert that two DataFrames contain the same data.
    :param expected: The expected DataFrame
    :param actual: The actual DataFrame
    :param verbose: If the DataFrames are not equal, show
    the DataFrame schema or data (depending on where the mismatch is) in order
    to help debugging.
    :param ignore_schema_metadata: When comparing the schemas, ignore the
    metadata, which can include comments.
    # comments.
    :return: None
    """
    if expected is None:
        assert actual is None, \
            'The expected DataFrame is None, but the actual DataFrame is not.'
        return  # Both DataFrames are None.
    else:
        assert actual is not None, \
            'The actual DataFrame is None, but the expected DataFrame is not.'

    expected.persist()
    actual.persist()

    expected_schema = _copy_without_metadata(expected.schema) if \
        ignore_schema_metadata else expected.schema
    actual_schema = _copy_without_metadata(actual.schema) if \
        ignore_schema_metadata else actual.schema

    is_schema_match = expected_schema == actual_schema
    if not is_schema_match:
        if verbose:
            # Print the schema to help identify subtle cases such as one DF
            # containing an INT and the other containing a BIGINT, which can be
            # an issue if we try to write a DF containing a BIGINT into a
            # previously existing Hive table defined to contain an INT.
            print('Expected DataFrame schema:')
            expected.printSchema()
            print('Actual DataFrame schema:')
            actual.printSchema()
        assert is_schema_match is True, 'The DataFrame schemas differ.'

    is_match = (0 == (expected.count() - actual.count()) ==
                expected.subtract(actual).count() ==
                actual.subtract(expected).count())

    if not is_match:
        if verbose:
            print('Expected DataFrame:')
            expected.show(20, False)
            print('Actual DataFrame:')
            actual.show(20, False)
        # Possible future enhancement: Make the assertion failure message more
        # helpful.
        assert is_match is True, 'The DataFrames differ.'
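The helper _copy_without_metadata used by assert_equal is not shown in this section. A minimal sketch of one way it could work, assuming it simply rebuilds the schema with each field's metadata dropped (an assumption, not the original implementation):

from pyspark.sql.types import StructField, StructType

def _copy_without_metadata(schema: StructType) -> StructType:
    # Hypothetical sketch: rebuild every field without its metadata so that
    # schema comparison ignores comments and other metadata entries.
    return StructType([
        StructField(f.name, f.dataType, f.nullable) for f in schema.fields
    ])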
# COMMAND ----------

dt1 = joe_pos_neg_only.groupBy(F.col('state')).agg(F.count('state').alias('joe_total'))
dt2 = joe_pos_only.groupBy(F.col('state')).agg(F.count('state').alias('joe_pos'))
dt3 = joe_neg_only.groupBy(F.col('state')).agg(F.count('state').alias('joe_neg'))

dt4 = trump_pos_neg_only.groupBy(F.col('state')).agg(F.count('state').alias('trump_total'))
dt5 = trump_pos_only.groupBy(F.col('state')).agg(F.count('state').alias('trump_pos'))
dt6 = trump_neg_only.groupBy(F.col('state')).agg(F.count('state').alias('trump_neg'))

# COMMAND ----------

dfs = [dt1, dt2, dt3, dt4, dt5, dt6]
#df_final = reduce(lambda left, right: left.join(right,["date_only","state"]), dfs)
df_final = reduce(lambda left, right: DataFrame.join(left, right, on='state'), dfs)
df_final = df_final.sort(F.col('joe_total').asc())
df_final.write.mode('overwrite').parquet("/FileStore/tables/post_analysis3to8/")

df_final.show()

# COMMAND ----------

df = spark.read.parquet("/FileStore/tables/post_analysis3to8/")
df.show(2000)

# COMMAND ----------

df.repartition(1).write.csv(path="/FileStore/countposneg.csv", mode="overwrite", header="true")

# COMMAND ----------
Example #11
 def createDF(self, schema, data=None):
     return DataFrame(self.j_smvPyClient.dfFrom(schema, data),
                      self.sqlContext)
Example #12
 def runModule(self, urn, forceRun=False):
     """Runs either a Scala or a Python SmvModule by its Fully Qualified Name(fqn)
     """
     jdf = self.j_smvPyClient.runModule(urn, forceRun)
     return DataFrame(jdf, self.sqlContext)
def get_metadata_of_column(df: DataFrame, column: str) -> Any:
    return df.select(column).schema[0].metadata
def spark_is_data_frame_empty(df: DataFrame) -> bool:
    """
    Efficient way to check if the data frame is empty without getting the count of the whole data frame
    """
    # from: https://stackoverflow.com/questions/32707620/how-to-check-if-spark-dataframe-is-empty
    return not bool(df.head(1))
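An illustrative check of spark_is_data_frame_empty; the session and DataFrames here are assumed examples:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
empty_df = spark.createDataFrame([], "id INT")
non_empty_df = spark.createDataFrame([(1,)], ["id"])
assert spark_is_data_frame_empty(empty_df) is True
assert spark_is_data_frame_empty(non_empty_df) is False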
def extract_contact_form(raw_sharing_df: DataFrame) -> DataFrame:
    contact_form_df = raw_sharing_df.select("form_url").where(
        "form_url is not null")
    contact_form_df = contact_form_df.drop_duplicates()

    return contact_form_df
Example #16
def assert_data_frames_equal(left: DataFrame, right: DataFrame) -> None:
    testing.assert_frame_equal(
        left.orderBy(left.columns).toPandas(),  # type: ignore
        right.orderBy(right.columns).toPandas(),  # type: ignore
    )
def TableThree(_, table_one: SparkDF, table_two: SparkDF) -> SparkDF:
    return table_one.union(table_two)
Example #18
def sparkle_df(df: DataFrame) -> SparkleDataFrame:
    df.__class__ = type('SparkleDataFrame', (SparkleDataFrame, ), {})
    # noinspection PyTypeChecker
    return df
def increase_sal(df: DataFrame) -> DataFrame:
    df_new = df.withColumn("sal", add_udf(col("sal").cast(IntegerType())))
    return df_new
 def df(self):
     if self.__df is None:
         self.__df = DataFrame(self._jCypherResult.df(), self._sqlContext)
     return self.__df
def read_pandas_df_to_spark(_, df: SparkDF):
    assert isinstance(df, SparkDF)
    assert df.count() == 2
    assert set(df.columns) == {"foo", "quux"}
Example #22
 def __call__(self, files: DataFrame) -> DataFrame:
     # if UAST is not extracted, returns an empty list that we filter out here
     return files.extract_uasts().where(
         functions.size(functions.col("uast")) > 0)
Example #23
 def filter_failure(self, data_frame: DataFrame) -> DataFrame:
     return data_frame.filter(f"{self.constraint_column_name} > 1")
Example #24
 def update_or_append(self, df: DataFrame, table: str, id_col: str):
     ids_to_delete = [
         el[0] for el in df.select(id_col).distinct().collect()
     ]
     self.delete_in(table, id_col, ids_to_delete)
     self.store(df, table)
Example #25
def create_temp_view(dataframe: DataFrame, name):
    dataframe.createOrReplaceTempView(name)
def get_columns_expected_order(provider_group_df: DataFrame) -> DataFrame:
    return provider_group_df.select("id", "form_url")
Example #27
 def load(self, df: DataFrame, date: datetime):
     self.report_cls(date=date, company=self.company).save(df=df.toPandas())
def add_metadata_to_column(df: DataFrame, column: str,
                           metadata: Any) -> DataFrame:
    return df.withColumn(column, df[column].alias(column, metadata=metadata))
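A hedged round-trip sketch combining add_metadata_to_column with get_metadata_of_column from earlier in this section; the session, data, and column names are illustrative assumptions:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "desk")], ["id", "item"])
df = add_metadata_to_column(df, "item", {"comment": "product name"})
assert get_metadata_of_column(df, "item") == {"comment": "product name"}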
Example #29
 def toDF(self):
     """
     Get a DataFrame representation of this Delta table.
     """
     return DataFrame(self._jdt.toDF(), self._spark._wrapped)
Example #30
def subset_dataset_by_label(sc, dataframe: DataFrame, label_col, *args):
    if args:
        return dataframe.filter(F.col(label_col).isin(list(args)))
    else:
        return dataframe
Example #31
 def transform(self, dataset, params={}):
     java_obj = self._java_obj()
     self._transfer_params_to_java({}, java_obj)
     java_param_map = self._create_java_param_map(params, java_obj)
     return DataFrame(java_obj.transform(dataset._jdf, java_param_map),
                      dataset.sql_ctx)
Example #32
 def runModuleByName(self, name, forceRun=False):
     jdf = self.j_smvApp.runModuleByName(name, forceRun)
     return DataFrame(jdf, self.sqlContext)
Example #33
def sparse_dot_product_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_vector_col: str,
    primary_df: DataFrame,
    secondary_row_number_col: str,
    secondary_vector_col: str,
    secondary_df: DataFrame,
):
    """Calculate the dot product for every pair of items between
    a column of SparseVectors in the primary dataframe and a
    column of SparseVectors in the secondary dataframe.

    The input dataframes must have a row number attached. This will
    correspond to the row number in the resulting row matrix.
    It does not matter if the row numbers are sequential as long
    as they are unique within their dataframes respectively.

    NOTE: if you are using this function in order to generate cosine similarity
    scores then remember to normalize your input vectors first. This way the
    resulting coordinate matrix will represent the similarity scores."""
    def primary_row_to_coords(row):
        """Convert a sparse vector to a list of coords
        in the format of (row_num, col_num, value)"""
        row_num = row.__getitem__(primary_row_number_col)
        vec = row.__getitem__(primary_vector_col)
        return [(row_num, i, j) for i, j in zip(vec.indices, vec.values)]

    primary_rdd = primary_df.select(F.col(primary_row_number_col),
                                    F.col(primary_vector_col)).rdd.flatMap(
                                        lambda row: primary_row_to_coords(row))

    if primary_rdd.isEmpty():
        raise ValueError(
            "Primary RDD is empty. Cannot perform matrix multiplication")

    primary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    def secondary_row_to_coords(row):
        """Convert a sparse vector to a list of coords
        in the format of (row_num, col_num, value)"""
        row_num = row.__getitem__(secondary_row_number_col)
        vec = row.__getitem__(secondary_vector_col)
        # IMPORTANT - note that we are actually creating
        # the transpose of the secondary matrix hence
        # why the coordinates are back to front
        return [(i, row_num, j) for i, j in zip(vec.indices, vec.values)]

    secondary_rdd = secondary_df.select(
        F.col(secondary_row_number_col),
        F.col(secondary_vector_col)).rdd.flatMap(
            lambda row: secondary_row_to_coords(row))

    secondary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    if secondary_rdd.isEmpty():
        raise ValueError(
            "Secondary RDD is empty. Cannot perform matrix multiplication")

    # create the primary coordinate matrix from the coords
    primary_matrix = CoordinateMatrix(primary_rdd)

    log.info(
        "finished creating primary coordinate matrix",
        rows=primary_matrix.numRows(),
        cols=primary_matrix.numCols(),
    )

    # create the secondary coordinate matrix from the coords
    secondary_matrix = CoordinateMatrix(secondary_rdd)

    log.info(
        "finished creating secondary coordinate matrix transpose",
        rows=secondary_matrix.numRows(),
        cols=secondary_matrix.numCols(),
    )
    coords_matrix = multiply_coordinate_matrices(primary_matrix,
                                                 secondary_matrix)

    res = coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )

    primary_rdd.unpersist()
    secondary_rdd.unpersist()

    return res
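multiply_coordinate_matrices and coord_matrix_to_dataframe are helpers used above but not shown here. A minimal sketch of the multiplication step, under the assumption that it goes through Spark's BlockMatrix API (an assumption about the helper, not its actual code):

from pyspark.mllib.linalg.distributed import BlockMatrix, CoordinateMatrix

def multiply_coordinate_matrices(left: CoordinateMatrix,
                                 right: CoordinateMatrix) -> CoordinateMatrix:
    # Hypothetical sketch: CoordinateMatrix has no multiply(), so convert both
    # operands to BlockMatrix, multiply, and convert the product back.
    return left.toBlockMatrix().multiply(right.toBlockMatrix()).toCoordinateMatrix()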
Example #34
 def __init__(self, jdf, spark_session):
     DataFrame.__init__(self, jdf, spark_session)
     self._jrfctx = spark_session.rasterframes._jrfctx
Example #35
 def filter_success(self, data_frame: DataFrame) -> DataFrame:
     return data_frame.filter(f"{self.constraint_column_name} == 1")
def split_train_test(data: DataFrame,
                     train_ratio: float = 0.7,
                     test_ratio: float = 0.3):
    return data.randomSplit([train_ratio, test_ratio], seed=42)
Example #37
def pivot(
    dataframe: DataFrame,
    group_by_columns: non_blank(List[str]),
    pivot_column: non_blank(str),
    agg_column: non_blank(str),
    aggregation: non_null(Callable),
    mock_value: non_null(object) = None,
    mock_type: non_null(object) = None,
    with_forward_fill: non_null(bool) = False,
):
    """Defines a pivot transformation.

    Attributes:
        dataframe: dataframe to be pivoted.
        group_by_columns: list of columns' names to be grouped.
        pivot_column: column to be pivoted.
        agg_column: column to be aggregated by pivoted category.
        aggregation: desired spark aggregation function to be performed.
            An example: spark_agg(col_name). See docs for all spark_agg:
            https://spark.apache.org/docs/2.3.1/api/python/_modules/pyspark/sql/functions.html
        mock_value: value used to make a difference between true nulls resulting from
            the aggregation and empty values from the pivot transformation.
        mock_type: mock_value data type (compatible with spark).
        with_forward_fill: applies a forward fill to null values after the pivot
            operation.

    Example:

        >>> dataframe.orderBy("ts", "id", "amenity").show()
        +---+---+-------+-----+
        | id| ts|amenity|  has|
        +---+---+-------+-----+
        |  1|  1| fridge|false|
        |  1|  1|   oven| true|
        |  1|  1|   pool|false|
        |  2|  2|balcony|false|
        |  1|  3|balcony| null|
        |  1|  4|   oven| null|
        |  1|  4|   pool| true|
        |  1|  5|balcony| true|
        +---+---+-------+-----+

        >>> pivoted = pivot(dataframe, ["id", "ts"], "amenity", "has", functions.first)
        >>> pivoted.orderBy("ts", "id").show()
        +---+---+-------+------+----+-----+
        | id| ts|balcony|fridge|oven| pool|
        +---+---+-------+------+----+-----+
        |  1|  1|   null| false|true|false|
        |  2|  2|  false|  null|null| null|
        |  1|  3|   null|  null|null| null|
        |  1|  4|   null|  null|null| true|
        |  1|  5|   true|  null|null| null|
        +---+---+-------+------+----+-----+

        But, sometimes, you would like to keep the last values that some feature
        has assumed from previous modifications. In this example, amenity "oven" for
        id=1 was set to null and "pool" was set to true at ts=4. All other amenities
        should then keep their current state at that ts. To do that, we will use
        a technique called forward fill:

        >>> pivoted = pivot(
        ...     dataframe,
        ...     ["id", "ts"],
        ...     "amenity",
        ...     "has",
        ...     functions.first,
        ...     with_forward_fill=True
        ...)
        >>> pivoted.orderBy("ts", "id").show()
        +---+---+-------+------+----+-----+
        | id| ts|balcony|fridge|oven| pool|
        +---+---+-------+------+----+-----+
        |  1|  1|   null| false|true|false|
        |  2|  2|  false|  null|null| null|
        |  1|  3|   null| false|true|false|
        |  1|  4|   null| false|true| true|
        |  1|  5|   true| false|true| true|
        +---+---+-------+------+----+-----+

        Great! Now every amenity that hasn't been changed keeps its state. BUT the
        forced change to null for amenity "oven" on id=1 at ts=4 was ignored during
        forward fill. If the user wants to respect this change, they must provide a
        mock value and type to be used as a signal for "true nulls". In other words,
        we want to forward fill only the nulls that were created by the pivot
        transformation.

        In this example, amenities only assume boolean values, so there is no mock
        value for a boolean: it is only true or false. Users can instead give a mock
        value of another type (to which the column can be cast). Check this out:

        >>> pivoted = pivot(
        ...     dataframe,
        ...     ["id", "ts"],
        ...     "amenity",
        ...     "has",
        ...     functions.first,
        ...     with_forward_fill=True,
        ...     mock_value=-1,
        ...     mock_type="int"
        ...)
        >>> pivoted.orderBy("ts", "id").show()
        +---+---+-------+------+----+-----+
        | id| ts|balcony|fridge|oven| pool|
        +---+---+-------+------+----+-----+
        |  1|  1|   null| false|true|false|
        |  2|  2|  false|  null|null| null|
        |  1|  3|   null| false|true|false|
        |  1|  4|   null| false|null| true|
        |  1|  5|   true| false|null| true|
        +---+---+-------+------+----+-----+

        During transformation, this method will cast the agg_column to mock_type
        data type and fill all "true nulls" with the mock_value. After pivot and forward
        fill are applied, all new pivoted columns will then return to the original type
        with all mock values replaced by null.
    """
    agg_column_type = None
    if mock_value is not None:
        if mock_type is None:
            raise AttributeError(
                "When proving a mock value, users must inform the data type,"
                " which should be supported by Spark.")
        agg_column_type = dict(dataframe.dtypes).get(agg_column)
        dataframe = dataframe.withColumn(
            agg_column,
            functions.col(agg_column).cast(mock_type)).fillna(
                {agg_column: mock_value})

    pivoted = (dataframe.groupBy(*group_by_columns).pivot(pivot_column).agg(
        aggregation(agg_column)))

    new_columns = [c for c in pivoted.columns if c not in group_by_columns]

    if with_forward_fill:
        for c in new_columns:
            pivoted = forward_fill(
                dataframe=pivoted,
                partition_by=group_by_columns[:-1],
                order_by=group_by_columns[-1],
                fill_column=c,
            )

    if mock_value is not None:
        for c in new_columns:
            pivoted = pivoted.withColumn(
                c,
                functions.when(
                    functions.col(c) != mock_value,
                    functions.col(c)).cast(agg_column_type),
            )
    return pivoted
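forward_fill is referenced by pivot above but not defined in this snippet. A minimal window-based sketch, under the assumption that it carries the last non-null value forward within each partition (illustrative, not the original helper):

from pyspark.sql import DataFrame, Window, functions

def forward_fill(dataframe: DataFrame, partition_by, order_by,
                 fill_column: str) -> DataFrame:
    # Hypothetical sketch: take the last non-null value seen so far within the
    # partition, ordered by order_by, wherever fill_column is currently null.
    window = (
        Window.partitionBy(*partition_by)
        .orderBy(order_by)
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    )
    return dataframe.withColumn(
        fill_column,
        functions.last(functions.col(fill_column), ignorenulls=True).over(window),
    )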
def showNewsByCategory(df):
    """All news under a given category (here '科技', i.e. technology), sorted by time."""
    temp = df.orderBy('time', ascending=True).where(df['label'] == u'科技').select("*")
    return convertDfToList(temp)
Example #39
 def _transform(self, dataset):
     self._transfer_params_to_java()
     return DataFrame(self._java_obj.transform(dataset._jdf),
                      dataset.sql_ctx)
def showFirstNnews(df):
    """Show the first 10 news items from the data source, sorted by time."""
    temp = df.orderBy('time', ascending=True).limit(10).select("*")
    return convertDfToList(temp)
def clean(frame: DataFrame) -> DataFrame:
    # First, get the majority of columns “fixed”, i.e. their datatypes improved.
    df2 = (frame.withColumnRenamed('Code', 'Airport_Code').withColumnRenamed(
        'Description', 'Airport_Description'))
    return df2