Example No. 1
    def test_active_session_with_None_and_not_None_context(self):
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf

        sc = None
        session = None
        try:
            sc = SparkContext._active_spark_context
            self.assertEqual(sc, None)
            activeSession = SparkSession.getActiveSession()
            self.assertEqual(activeSession, None)
            sparkConf = SparkConf()
            sc = SparkContext.getOrCreate(sparkConf)
            activeSession = sc._jvm.SparkSession.getActiveSession()
            self.assertFalse(activeSession.isDefined())
            session = SparkSession(sc)
            activeSession = sc._jvm.SparkSession.getActiveSession()
            self.assertTrue(activeSession.isDefined())
            activeSession2 = SparkSession.getActiveSession()
            self.assertNotEqual(activeSession2, None)
        finally:
            if session is not None:
                session.stop()
            if sc is not None:
                sc.stop()
Example No. 2
 def test_get_active_session_when_no_active_session(self):
     active = SparkSession.getActiveSession()
     self.assertEqual(active, None)
     spark = SparkSession.builder.master("local").getOrCreate()
     active = SparkSession.getActiveSession()
     self.assertEqual(active, spark)
     spark.stop()
     active = SparkSession.getActiveSession()
     self.assertEqual(active, None)
Example No. 3
 def test_get_active_session_when_no_active_session(self):
     active = SparkSession.getActiveSession()
     self.assertEqual(active, None)
     spark = SparkSession.builder \
         .master("local") \
         .getOrCreate()
     active = SparkSession.getActiveSession()
     self.assertEqual(active, spark)
     spark.stop()
     active = SparkSession.getActiveSession()
     self.assertEqual(active, None)
Example No. 4
def default_session() -> SparkSession:
    spark = SparkSession.getActiveSession()
    if spark is not None:
        return spark

    builder = SparkSession.builder.appName("pandas-on-Spark")
    return builder.getOrCreate()
Example No. 5
def fill_missing_tbins_with_zero(pickup_bin_count_df: DataFrame, n_clusters):
    now = datetime.now()
    print("fill_missing_tbins_with_zero() - starting..")
    ss = SparkSession.getActiveSession()
    print("fill_missing_tbins_with_zero() - caching data...")
    pickup_bin_count_df_pd: pd.DataFrame = pickup_bin_count_df.toPandas()
    print("fill_missing_tbins_with_zero() - caching finished")
    for cluster_id in range(0, n_clusters):
        now_for_cluster = datetime.now()

        current_cluster_df = pickup_bin_count_df_pd.loc[pickup_bin_count_df_pd.pickup_cluster == cluster_id]
        time_bins = current_cluster_df["time_bin"].unique()
        for time_bin in range(4464):  # TODO: compute the number of time bins for the month dynamically
            # TODO: check type compatibility with str
            if time_bin not in time_bins:
                pickup_bin_count_df_pd = pickup_bin_count_df_pd.append({
                    "pickup_cluster": cluster_id,
                    "time_bin": time_bin,
                    "count": 0
                }, ignore_index=True)
        print("fill_missing_tbins_with_zero() - cluster {0} processing finished. time taken {1}".format(cluster_id,
                                                                                                        datetime.now() - now_for_cluster))
    pickup_bin_count_df_pd = pickup_bin_count_df_pd.loc[pickup_bin_count_df_pd.time_bin >= 0]
    print("fill_missing_tbins_with_zero() - time taken {}".format(datetime.now() - now))
    assert len(pickup_bin_count_df_pd.index) == 4464 * 30
    return ss.createDataFrame(pickup_bin_count_df_pd)
Example No. 6
def import_trait_mappings() -> DataFrame:
    """Load the remote trait mappings file to a Spark dataframe."""

    remote_trait_mappings_url = (
        'https://raw.githubusercontent.com/opentargets/curation/master/mappings/disease/manual_string.tsv'
    )

    SparkSession.getActiveSession().sparkContext.addFile(remote_trait_mappings_url)

    return (
        SparkSession.getActiveSession()
        .read.csv(SparkFiles.get('manual_string.tsv'), header=True, sep='\t')
        .select(
            col('PROPERTY_VALUE').alias('diseaseFromSource'), col('SEMANTIC_TAG').alias('diseaseFromSourceMappedId')
        )
    )
Example No. 7
def metric(data, metric):
    # If Spark went down for some reason (e.g. someone accidentally broke the
    # session while running another task), try to finish the remaining
    # computation locally on a single node
    try:
        spark_context = SparkSession.getActiveSession().sparkContext
        SQLContext(spark_context).clearCache()
    except AttributeError:
        spark_context = SparkContext.getOrCreate(
            SparkConf().setMaster("local[*]"))

        spark = SparkSession \
            .builder \
            .getOrCreate()

    data = data.drop('probability')
    try:
        if metric == 'sil':
            res = -ClusteringEvaluator(
                predictionCol='labels',
                distanceMeasure='squaredEuclidean').evaluate(data)
        elif metric == 'ch':
            res = ChIndex().find(data, spark_context)
        elif metric == 'db':
            res = DaviesIndex().find(data, spark_context)
        else:
            # avoid returning an unbound `res` for an unrecognized metric name
            raise ValueError("unknown metric: {}".format(metric))
        return res
    except TypeError:
        print("\n\nTYPE ERROR OCCURED IN Metric.py:\n\nDATA: {}\n\n".format(
            data))
        return 0
    except Py4JJavaError:
        print("\n\nPy4JJavaError ERROR OCCURED IN Metric.py:\n\nDATA: {}\n\n".
              format(data.printSchema()))
        return sys.float_info.max
Example No. 8
    def createOrReplace(
            cls,
            sparkSession: Optional[SparkSession] = None
    ) -> "DeltaTableBuilder":
        """
        Return :class:`DeltaTableBuilder` object that can be used to specify
        the table name, location, columns, partitioning columns, table comment,
        and table properties to create or replace a Delta table
        (the same as SQL `CREATE OR REPLACE TABLE`).

        See :class:`DeltaTableBuilder` for a full description and examples
        of this operation.

        :param sparkSession: SparkSession to use for creating the table
        :return: an instance of DeltaTableBuilder
        :rtype: :py:class:`~delta.tables.DeltaTableBuilder`

        .. note:: Evolving
        """
        if sparkSession is None:
            sparkSession = SparkSession.getActiveSession()
        assert sparkSession is not None

        jvm: "JVMView" = sparkSession._sc._jvm  # type: ignore[attr-defined]
        jsparkSession: "JavaObject" = sparkSession._jsparkSession  # type: ignore[attr-defined]

        jdt = jvm.io.delta.tables.DeltaTable.createOrReplace(jsparkSession)
        return DeltaTableBuilder(sparkSession, jdt)
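A minimal usage sketch of the builder returned above, assuming an active SparkSession named `spark`, the delta-spark package installed, and a hypothetical table name and columns:

from delta.tables import DeltaTable

# Hypothetical table definition; execute() issues the CREATE OR REPLACE.
(
    DeltaTable.createOrReplace(spark)
    .tableName("events")
    .addColumn("event_id", "LONG")
    .addColumn("event_ts", "TIMESTAMP")
    .execute()
)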
Example No. 9
 def test_active_session(self):
     spark = SparkSession.builder.master("local").getOrCreate()
     try:
         activeSession = SparkSession.getActiveSession()
         df = activeSession.createDataFrame([(1, "Alice")], ["age", "name"])
         self.assertEqual(df.collect(), [Row(age=1, name="Alice")])
     finally:
         spark.stop()
Example No. 10
def _get_empty_result_df(reference_df, primary_column_list):
    """
    Function to create an empty dataframe containing Primary Key columns and an additional EFFULGE_VARIANCE_PROVOKER column
    Parameters:
        Name : reference_df
        Type :     pyspark.sql.dataframe.DataFrame object
        Name : primary_column_list
        Type :     list of String
    Return Type:
        pyspark.sql.dataframe.DataFrame object
    """
    _schema = reference_df.select(*primary_column_list).schema
    _schema.add("EFFULGE_VARIANCE_PROVOKER", ArrayType(StringType()))
    #
    empty_df = SparkSession.getActiveSession().createDataFrame(
        SparkSession.getActiveSession().sparkContext.emptyRDD(), _schema)
    #
    return empty_df
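A brief usage sketch, assuming an active SparkSession named `spark` and a hypothetical reference DataFrame with a two-column primary key:

from pyspark.sql.types import ArrayType, StringType  # types used by the helper above

# Hypothetical reference data; the result is an empty frame holding the
# primary-key columns plus the EFFULGE_VARIANCE_PROVOKER array column.
reference_df = spark.createDataFrame([(1, "a", 10.0)], ["id", "code", "value"])
empty_df = _get_empty_result_df(reference_df, ["id", "code"])
empty_df.printSchema()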
Example No. 11
 def __init__(self, use_pretrained):
     self.train_data = None
     self.hdfs_uri = HDFS_HOST + "/models/trained/gbt-regressor/{}".format(datetime.now().date())
     self.sc = SparkSession.getActiveSession()
     self.use_pretrained = use_pretrained
     if use_pretrained:
         self.model: GBTRegressor = self.__load_from_hdfs()
     else:
         self.model: GBTRegressor = GBTRegressor(featuresCol="features", maxIter=20, labelCol="target")
Example No. 12
 def __init__(self, use_pretrained):
     self.data = None
     self.hdfs_uri = HDFS_HOST + "/models/trained/kmeans/{}".format(
         datetime.now().date())
     self.sc = SparkSession.getActiveSession()
     if use_pretrained:
         self.model: KMeansModel = self.__load_from_hdfs()
     else:
         self.model: KMeansModel = None
Example No. 13
 def test_get_active_session_after_create_dataframe(self):
     session2 = None
     try:
         activeSession1 = SparkSession.getActiveSession()
         session1 = self.spark
         self.assertEqual(session1, activeSession1)
         session2 = self.spark.newSession()
         activeSession2 = SparkSession.getActiveSession()
         self.assertEqual(session1, activeSession2)
         self.assertNotEqual(session2, activeSession2)
         session2.createDataFrame([(1, 'Alice')], ['age', 'name'])
         activeSession3 = SparkSession.getActiveSession()
         self.assertEqual(session2, activeSession3)
         session1.createDataFrame([(1, 'Alice')], ['age', 'name'])
         activeSession4 = SparkSession.getActiveSession()
         self.assertEqual(session1, activeSession4)
     finally:
         if session2 is not None:
             session2.stop()
Example No. 14
 def test_get_active_session_after_create_dataframe(self):
     session2 = None
     try:
         activeSession1 = SparkSession.getActiveSession()
         session1 = self.spark
         self.assertEqual(session1, activeSession1)
         session2 = self.spark.newSession()
         activeSession2 = SparkSession.getActiveSession()
         self.assertEqual(session1, activeSession2)
         self.assertNotEqual(session2, activeSession2)
         session2.createDataFrame([(1, "Alice")], ["age", "name"])
         activeSession3 = SparkSession.getActiveSession()
         self.assertEqual(session2, activeSession3)
         session1.createDataFrame([(1, "Alice")], ["age", "name"])
         activeSession4 = SparkSession.getActiveSession()
         self.assertEqual(session1, activeSession4)
     finally:
         if session2 is not None:
             session2.stop()
Example No. 15
 def test_active_session(self):
     spark = SparkSession.builder \
         .master("local") \
         .getOrCreate()
     try:
         activeSession = SparkSession.getActiveSession()
         df = activeSession.createDataFrame([(1, 'Alice')], ['age', 'name'])
         self.assertEqual(df.collect(), [Row(age=1, name=u'Alice')])
     finally:
         spark.stop()
Example No. 16
def _clean_cache_and_view(cached_dataframe_list, temporary_view_list):
    """
    Function to explicitly free dataframe cache and to remove temporary views

    Parameters:
        Name : cached_dataframe_list
        Type :    list of pyspark.sql.dataframe.DataFrame objects
        Name : temporary_view_list
        Type :    list of String

    Return Type:
        None
    """
    # clear previously cached list
    for d_f in cached_dataframe_list:
        d_f.unpersist(blocking=True)

    # clear temporary view
    for view in temporary_view_list:
        SparkSession.getActiveSession().catalog.dropTempView(view)
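A hypothetical cleanup call, assuming `reference_df` and `received_df` were cached earlier and registered under the temporary view names shown:

# Unpersist both cached frames and drop their temporary views in one pass.
_clean_cache_and_view(
    [reference_df, received_df],
    ["reference_view", "received_view"],
)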
Example No. 17
def _get_active_spark_session():
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        # Return None if user doesn't have PySpark installed
        return None
    try:
        # getActiveSession() only exists in Spark 3.0 and above
        return SparkSession.getActiveSession()
    except Exception:
        # Fall back to this internal field for Spark 2.x and below.
        return SparkSession._instantiatedSession
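A small consumption sketch; the pandas fallback shown here is an assumption for illustration, not part of the original helper:

spark = _get_active_spark_session()
if spark is not None:
    pdf = spark.range(10).toPandas()  # reuse the active session when present
else:
    import pandas as pd
    pdf = pd.DataFrame({"id": range(10)})  # degrade gracefully without Spark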
Example No. 18
def get_spark_session_or_start_new_with_repoconfig(
    store_config: SparkOfflineStoreConfig, ) -> SparkSession:
    spark_session = SparkSession.getActiveSession()
    if not spark_session:
        spark_builder = SparkSession.builder
        spark_conf = store_config.spark_conf
        if spark_conf:
            spark_builder = spark_builder.config(
                conf=SparkConf().setAll([(k, v)
                                         for k, v in spark_conf.items()]))

        spark_session = spark_builder.getOrCreate()
    spark_session.conf.set("spark.sql.parser.quotedRegexColumnNames", "true")
    return spark_session
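A usage sketch with a stand-in config object; only the `spark_conf` attribute read by the function above is assumed, and a real SparkOfflineStoreConfig would be passed in practice:

from types import SimpleNamespace

# Hypothetical stand-in exposing just the attribute the helper reads.
store_config = SimpleNamespace(spark_conf={"spark.sql.shuffle.partitions": "8"})
spark_session = get_spark_session_or_start_new_with_repoconfig(store_config)
print(spark_session.conf.get("spark.sql.parser.quotedRegexColumnNames"))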
Example No. 19
        def wrapper(self, *args, **kwargs):  # type: ignore
            session = SparkSession.getActiveSession()
            if not session:
                return f(self, *args, **kwargs)

            session.sparkContext.setJobGroup(name, name)  # type: ignore
            start_time = time.time()
            ret = f(self, *args, **kwargs)
            _logger.info(
                f"Elapsed time (name: {name}) is {time.time() - start_time}(s)"
            )
            _clear_job_group(session)

            return ret
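The wrapper above is the inner function of a decorator factory. A self-contained sketch of that pattern (the factory name and the job-group reset are assumptions, not the original source):

import time
from functools import wraps

from pyspark.sql import SparkSession


def with_job_group(name):  # hypothetical factory name
    def decorator(f):
        @wraps(f)
        def wrapper(self, *args, **kwargs):
            session = SparkSession.getActiveSession()
            if not session:
                return f(self, *args, **kwargs)
            # Tag all Spark jobs triggered by f so they group together in the Spark UI.
            session.sparkContext.setJobGroup(name, name)
            start_time = time.time()
            try:
                return f(self, *args, **kwargs)
            finally:
                print(f"Elapsed time (name: {name}) is {time.time() - start_time}(s)")
                # Reset the job group so later actions are not attributed here.
                session.sparkContext.setJobGroup("", "")
        return wrapper
    return decorator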
Example No. 20
 def test_active_session_with_None_and_not_None_context(self):
     from pyspark.context import SparkContext
     from pyspark.conf import SparkConf
     sc = None
     session = None
     try:
         sc = SparkContext._active_spark_context
         self.assertEqual(sc, None)
         activeSession = SparkSession.getActiveSession()
         self.assertEqual(activeSession, None)
         sparkConf = SparkConf()
         sc = SparkContext.getOrCreate(sparkConf)
         activeSession = sc._jvm.SparkSession.getActiveSession()
         self.assertFalse(activeSession.isDefined())
         session = SparkSession(sc)
         activeSession = sc._jvm.SparkSession.getActiveSession()
         self.assertTrue(activeSession.isDefined())
         activeSession2 = SparkSession.getActiveSession()
         self.assertNotEqual(activeSession2, None)
     finally:
         if session is not None:
             session.stop()
         if sc is not None:
             sc.stop()
Example No. 21
File: utils.py Project: yliou/spark
def default_session() -> SparkSession:
    spark = SparkSession.getActiveSession()
    if spark is None:
        spark = SparkSession.builder.appName("pandas-on-Spark").getOrCreate()

    # Turn ANSI off when testing the pandas API on Spark since
    # the behavior of pandas API on Spark follows pandas, not SQL.
    if is_testing():
        spark.conf.set("spark.sql.ansi.enabled",
                       False)  # type: ignore[arg-type]
    if spark.conf.get("spark.sql.ansi.enabled"):
        log_advice(
            "The config 'spark.sql.ansi.enabled' is set to True. "
            "This can cause unexpected behavior "
            "from pandas API on Spark since pandas API on Spark follows "
            "the behavior of pandas, not SQL.")

    return spark
Example No. 22
def load_as_spark(url: str) -> "PySparkDataFrame":  # noqa: F821
    """
    Load the shared table using the given url as a Spark DataFrame. `PySpark` must be installed, and
    the application must be a PySpark application with the Apache Spark Connector for Delta Sharing
    installed.

    :param url: a url under the format "<profile>#<share>.<schema>.<table>"
    :return: A Spark DataFrame representing the shared table.
    """
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        raise ImportError("Unable to import pyspark. `load_as_spark` requires PySpark.")

    spark = SparkSession.getActiveSession()
    assert spark is not None, (
        "No active SparkSession was found. "
        "`load_as_spark` requires running in a PySpark application."
    )
    return spark.read.format("deltaSharing").load(url)
Example No. 23
    def get_table_query_string(self) -> str:
        """Returns a string that can directly be used to reference this table in SQL"""
        if self.table:
            # Backticks make sure that spark sql knows this a table reference.
            return f"`{self.table}`"
        if self.query:
            return f"({self.query})"

        # If both the table query string and the actual query are null, we can load from file.
        spark_session = SparkSession.getActiveSession()
        if spark_session is None:
            raise AssertionError("Could not find an active spark session.")
        try:
            df = spark_session.read.format(self.file_format).load(self.path)
        except Exception:
            logger.exception("Spark read of file source failed.\n" +
                             traceback.format_exc())
            raise  # re-raise so `df` below is never referenced while unbound
        tmp_table_name = get_temp_entity_table_name()
        df.createOrReplaceTempView(tmp_table_name)

        return f"`{tmp_table_name}`"
Example No. 24
    def createIfNotExists(cls, sparkSession=None):
        """
        Return :class:`DeltaTableBuilder` object that can be used to specify
        the table name, location, columns, partitioning columns, table comment,
        and table properties to create a Delta table,
        if it does not exist (the same as SQL `CREATE TABLE IF NOT EXISTS`).

        See :class:`DeltaTableBuilder` for a full description and examples
        of this operation.

        :param sparkSession: SparkSession to use for creating the table
        :return: an instance of DeltaTableBuilder
        :rtype: :py:class:`~delta.tables.DeltaTableBuilder`

        .. note:: Evolving
        """
        if sparkSession is None:
            sparkSession = SparkSession.getActiveSession()
        assert sparkSession is not None
        jdt = sparkSession._sc._jvm.io.delta.tables.DeltaTable.createIfNotExists(
            sparkSession._jsparkSession)
        return DeltaTableBuilder(sparkSession, jdt)
Example No. 25
def fill_missing_tbins_with_zero_withoud_collecting(pickup_bin_count_df: DataFrame, n_clusters):
    now = datetime.now()
    print("fill_missing_tbins_with_zero() - starting..")
    ss = SparkSession.getActiveSession()
    print("fill_missing_tbins_with_zero() - caching data...")
    pickup_bin_count_df = pickup_bin_count_df.cache()
    print("fill_missing_tbins_with_zero() - caching finished")
    for cluster_id in range(0, n_clusters):
        print("fill_missing_tbins_with_zero() - processing cluster {0}. {1} - left".format(cluster_id,
                                                                                           n_clusters - cluster_id))
        for time_bin in range(4464):  # TODO: compute the number of time bins for the month dynamically
            row = ss.createDataFrame([(cluster_id, time_bin, 0)], "pickup_cluster int, time_bin int, count int")
            pickup_bin_count_df = pickup_bin_count_df.union(row)

    from pyspark.sql.window import Window
    import pyspark.sql.functions as F
    from pyspark.sql.functions import col

    pickup_bin_count_df = pickup_bin_count_df.select("pickup_cluster", "time_bin", "count", F.row_number().over(
        Window.partitionBy("count").orderBy(pickup_bin_count_df['count'])).alias("row_num")).sort(col("count"))
    pickup_bin_count_df = pickup_bin_count_df.filter(pickup_bin_count_df.row_num == 1)
    # .show() returns None, so display separately instead of reassigning the result
    pickup_bin_count_df.show()

    print("fill_missing_tbins_with_zero() - time taken {}".format(datetime.now() - now))
    return pickup_bin_count_df
Example No. 26
def _spot_mismatch_variance(reference_view_name, received_view_name,
                            prime_columns, non_prime_columns):
    """
    Function to identify the mismatching records
    and also to identify the columns responsible for mismatch

    Parameters:
        Name: reference_view_name
        Type:    String
        Name: received_view_name
        Type:    String
        Name: prime_columns
        Type:    list/tuple of Strings
        Name: non_prime_columns
        Type:    list/tuple of Strings

    Return Type:
        pyspark.sql.dataframe.DataFrame
    """
    df_mismatch = SparkSession.getActiveSession().sql("""
            select {0} from {1}
            MINUS
            select {0} from {2}
            """.format(", ".join(
        (*prime_columns, *non_prime_columns)), reference_view_name,
                       received_view_name)).select(prime_columns)

    # retains same column names for primary attributes
    # but renames non primary attributes to have "e_" prefix
    df_expected_with_renamed_columns = SparkSession.getActiveSession().sql("""
            select
                -- primary columns
                {},
                -- non primary columns with "e_" prefix
                {}
            from {}
            """.format(
        ", ".join(prime_columns),
        ", ".join(["{0} as e_{0}".format(c) for c in non_prime_columns]),
        reference_view_name))

    # retains same column names for primary attributes
    # but renames non primary attributes to have "a_" prefix
    df_available_with_renamed_columns = SparkSession.getActiveSession().sql("""
            select
                -- primary columns
                {},
                -- non primary columns with "a_" prefix
                {}
            from {}
            """.format(
        ", ".join(prime_columns),
        ", ".join(["{0} as a_{0}".format(c) for c in non_prime_columns]),
        received_view_name))

    df_mismatch_join = df_mismatch.join(df_expected_with_renamed_columns,
                                        prime_columns, "inner").join(
                                            df_available_with_renamed_columns,
                                            prime_columns, "inner")

    # for each mismatch record, compare and identify variance columns
    try:
        df_variance = df_mismatch_join\
                          .rdd\
                          .map(
                              lambda r:
                                  _spot_corrupted_attributes(r, prime_columns, non_prime_columns,
                                                             "e_", "a_")
                          ).toDF()
    except ValueError as exp:
        if str(exp) == "RDD is empty":
            # create empty result set
            df_variance = _get_empty_result_df(
                df_expected_with_renamed_columns, prime_columns)
        else:
            # raise the same exception, when ValueError message is different
            raise exp
    return df_variance
Example No. 27
from models.Kmeans import KMeansModelCustom
from models.GbtModel import GBTModelCustom
from pyspark.sql import SparkSession

k_means = KMeansModelCustom(True)
gbt = GBTModelCustom(True)
ss = SparkSession.getActiveSession()
Example No. 28
def generate_data2(table_name="my_data"):
    df = SparkSession.getActiveSession().range(0, 10)
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)