Beispiel #1
0
 def tearDownClass(cls):
     """Undo the class-level fixture: reinstate the saved SmvApp and clean sys.path."""
     # Re-import on every call so the name is bound to the most recently
     # reloaded SmvApp.
     from smv.smvapp import SmvApp

     # Reinstate the instance returned by TestConfig.originalSmvApp() as the
     # SmvApp singleton.
     original_app = TestConfig.originalSmvApp()
     SmvApp.setInstance(original_app)

     # Take the test resource directory back off the import path.
     resource_dir = cls.resourceTestDir()
     sys.path.remove(resource_dir)
Beispiel #2
0
 def tearDownClass(cls):
     """Undo the class-level fixture: reinstate the saved SmvApp and clean sys.path."""
     # Re-import on every call so the name is bound to the most recently
     # reloaded SmvApp.
     from smv.smvapp import SmvApp

     # Reinstate the instance returned by TestConfig.originalSmvApp() as the
     # SmvApp singleton.
     original_app = TestConfig.originalSmvApp()
     SmvApp.setInstance(original_app)

     # Take the test resource directory back off the import path.
     resource_dir = cls.resourceTestDir()
     sys.path.remove(resource_dir)
Beispiel #3
0
def FailAny():
    """Create the DQM task policy under which any triggered rule or fix fails the entire DF

        The policy object is obtained from the JVM via `DqmTaskPolicies.failAny()`.

        Returns:
            (DQMTaskPolicy): policy for DQM Task
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.DqmTaskPolicies.failAny()
Beispiel #4
0
def Quarter(year, quarter):
    """Construct an smv.panel.Quarter through the SmvApp JVM gateway

        Quarter is a subclass of the smv.panel.PartialTime base class.

        Args:
            year (int): the year
            quarter (int): the quarter within the year

        Example:

            >>> q = Quarter(2012, 1)
            >>> q.smvTime()
            u'Q201201'
            >>> q.timeIndex()
            168
            >>> q.timeLabel()
            u'2012-Q1'
            >>> q.timeType()
            u'quarter'

        Returns:
            (java object smv.panel.Quarter)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.Quarter(year, quarter)
Beispiel #5
0
def Day(year, month, day):
    """Construct an smv.panel.Day through the SmvApp JVM gateway

        Day is a subclass of the smv.panel.PartialTime base class.

        Args:
            year (int): the year
            month (int): the month
            day (int): the day of the month

        Example:

            >>> d = Day(2012, 5, 31)
            >>> d.smvTime()
            u'D20120531'
            >>> d.timeIndex()
            15491
            >>> d.timeLabel()
            u'2012-05-31'
            >>> d.timeType()
            u'day'

        Returns:
            (java object smv.panel.Day)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.Day(year, month, day)
Beispiel #6
0
def smvStrCat(head, *others):
    """Concatenate several columns into a single string column

       Comparable to Spark's `concat` / `concat_ws`, but nulls are handled differently:
       Spark's `concat` yields null as soon as any input is null, whereas smvStrCat
       yields null only when all inputs are null and otherwise treats null columns as blank.

       Two call forms are accepted:
       - smvStrCat(sep, col1, col2, ...)
       - smvStrCat(col1, col2, ...)

       Args:
           sep (String): separator placed between the concatenated values (optional first
               argument; an empty string is used when the first argument is a Column)
           col. (Column): columns to be concatenated

       Return:
           (col): a StringType column

       Raises:
           RuntimeError: if the first argument is neither a string nor a Column
    """
    head_is_separator = is_string(head)
    if not head_is_separator and not isinstance(head, Column):
        raise RuntimeError("first parameter must be either a String or a Column")

    if head_is_separator:
        separator = head
        columns = list(others)
    else:
        # No explicit separator: `head` is itself the first column.
        separator = ""
        columns = [head] + list(others)

    smv_app = SmvApp.getInstance()
    helper = smv_app._jvm.org.tresamigos.smv.python.SmvPythonHelper
    java_columns = smv_copy_array(smv_app.sc, *columns)
    return Column(helper.smvStrCat(separator, java_columns))
Beispiel #7
0
def FailNone():
    """Create the DQM task policy that never triggers a DF level policy

        The policy object is obtained from the JVM via `DqmTaskPolicies.failNone()`.

        Returns:
            (DQMTaskPolicy): policy for DQM Task
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.DqmTaskPolicies.failNone()
Beispiel #8
0
def Day(year, month, day):
    """Construct an smv.panel.Day through the SmvApp JVM gateway

        Day is a subclass of the smv.panel.PartialTime base class.

        Args:
            year (int): the year
            month (int): the month
            day (int): the day of the month

        Example:

            >>> d = Day(2012, 5, 31)
            >>> d.smvTime()
            u'D20120531'
            >>> d.timeIndex()
            15491
            >>> d.timeLabel()
            u'2012-05-31'
            >>> d.timeType()
            u'day'

        Returns:
            (java object smv.panel.Day)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.Day(year, month, day)
Beispiel #9
0
def Quarter(year, quarter):
    """Construct an smv.panel.Quarter through the SmvApp JVM gateway

        Quarter is a subclass of the smv.panel.PartialTime base class.

        Args:
            year (int): the year
            quarter (int): the quarter within the year

        Example:

            >>> q = Quarter(2012, 1)
            >>> q.smvTime()
            u'Q201201'
            >>> q.timeIndex()
            168
            >>> q.timeLabel()
            u'2012-Q1'
            >>> q.timeType()
            u'quarter'

        Returns:
            (java object smv.panel.Quarter)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.Quarter(year, quarter)
Beispiel #10
0
    def setUpClass(cls):
        """Build the class-level Spark/SMV fixtures shared by the tests of this class.

        Stores the Spark and SQL contexts obtained from TestConfig on the class,
        creates the test SmvApp, appends the test resource directory to
        ``sys.path`` and finally calls ``mkTmpTestDir``.
        """
        # Import needs to happen during EVERY setup so that the most recently
        # reloaded SmvApp is the one in use.
        from smv.smvapp import SmvApp
        import random

        cls.sparkContext = TestConfig.sparkContext()
        cls.sqlContext = TestConfig.sqlContext()
        cls.sparkContext.setLogLevel("ERROR")

        # Random port handed to SmvApp through the --cbs-port option.
        cbs_port = random.randint(20000, 65535)

        base_args = TestConfig.smv_args() + cls.smvAppInitArgs()
        option_args = ['--cbs-port', str(cbs_port), '--data-dir', cls.tmpDataDir()]

        # The test's SmvApp must be the singleton for some tests to give
        # correct results; the original SmvApp (if any) is restored on teardown.
        cls.smvApp = SmvApp.createInstance(
            base_args + option_args, cls.sparkContext, cls.sqlContext)

        resource_dir = cls.testResourceDir()
        sys.path.append(resource_dir)

        cls.mkTmpTestDir()
Beispiel #11
0
def FailNone():
    """Create the DQM task policy that never triggers a DF level policy

        The policy object is obtained from the JVM via `DqmTaskPolicies.failNone()`.

        Returns:
            (DQMTaskPolicy): policy for DQM Task
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.DqmTaskPolicies.failNone()
Beispiel #12
0
def Month(year, month):
    """Construct an smv.panel.Month through the SmvApp JVM gateway

        Month is a subclass of the smv.panel.PartialTime base class.

        Args:
            year (int): the year
            month (int): the month

        Example:

            >>> m = Month(2012, 5)
            >>> m.smvTime()
            u'M201205'
            >>> m.timeIndex()
            508
            >>> m.timeLabel()
            u'2012-05'
            >>> m.timeType()
            u'month'

        Returns:
            (java object smv.panel.Month)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.Month(year, month)
Beispiel #13
0
def FailAny():
    """Create the DQM task policy under which any triggered rule or fix fails the entire DF

        The policy object is obtained from the JVM via `DqmTaskPolicies.failAny()`.

        Returns:
            (DQMTaskPolicy): policy for DQM Task
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.DqmTaskPolicies.failAny()
Beispiel #14
0
def Month(year, month):
    """Construct an smv.panel.Month through the SmvApp JVM gateway

        Month is a subclass of the smv.panel.PartialTime base class.

        Args:
            year (int): the year
            month (int): the month

        Example:

            >>> m = Month(2012, 5)
            >>> m.smvTime()
            u'M201205'
            >>> m.timeIndex()
            508
            >>> m.timeLabel()
            u'2012-05'
            >>> m.timeType()
            u'month'

        Returns:
            (java object smv.panel.Month)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.Month(year, month)
Beispiel #15
0
def FailCount(threshold):
    """Create a DQM task policy that fails the DF once the task has been triggered >= n times

        Args:
            threshold (int): the threshold after which the DF fails

        Returns:
            (DQMTaskPolicy): policy for DQM Task
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailCount(threshold)
Beispiel #16
0
def smvCollectSet(col, datatype):
    """Aggregate function that gathers all values of a column into a set, exposed as an array typed column

       Spark 1.6 introduced its own `collect_set` function, so once the migration to
       Spark 1.6 or later is done this smvCollectSet will be deprecated.

       Args:
            col (Column): column to be aggregated on
            datatype (DataType): datatype of the input column

       Returns:
            (Column): the aggregate column
    """
    helper = SmvApp.getInstance()._jvm.org.tresamigos.smv.python.SmvPythonHelper
    java_column = col._jc
    # The data type is handed to the JVM helper in its JSON form.
    type_json = datatype.json()
    return Column(helper.smvCollectSet(java_column, type_json))
Beispiel #17
0
def FailTotalFixCountPolicy(threshold):
    """Create a DQM policy that fails the DF when all fixes together are triggered >= threshold times

        Args:
            threshold (int): the threshold after which the DF fails

        Returns:
            (DQMPolicy): policy for DQM
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailTotalFixCountPolicy(threshold)
Beispiel #18
0
def FailParserCountPolicy(threshold):
    """Create a DQM policy that fails the DF when the total number of parser fails is >= threshold

        Args:
            threshold (int): the threshold after which the DF fails

        Returns:
            (DQMPolicy): policy for DQM
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailParserCountPolicy(threshold)
Beispiel #19
0
def FailParserCountPolicy(threshold):
    """Create a DQM policy that fails the DF when the total number of parser fails is >= threshold

        Args:
            threshold (int): the threshold after which the DF fails

        Returns:
            (DQMPolicy): policy for DQM
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailParserCountPolicy(threshold)
Beispiel #20
0
def FailTotalFixPercentPolicy(threshold):
    """Create a DQM policy that fails the DF when all fixes together are triggered >= threshold * total Records times

        Args:
            threshold (double): the threshold after which the DF fails. value is between 0.0 and 1.0

        Returns:
            (DQMPolicy): policy for DQM
    """
    # Multiplying by 1.0 turns an int threshold into a float before it
    # crosses over to the JVM.
    ratio = threshold * 1.0
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailTotalFixPercentPolicy(ratio)
Beispiel #21
0
def FailTotalFixCountPolicy(threshold):
    """Create a DQM policy that fails the DF when all fixes together are triggered >= threshold times

        Args:
            threshold (int): the threshold after which the DF fails

        Returns:
            (DQMPolicy): policy for DQM
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailTotalFixCountPolicy(threshold)
Beispiel #22
0
def FailCount(threshold):
    """Create a DQM task policy that fails the DF once the task has been triggered >= n times

        Args:
            threshold (int): the threshold after which the DF fails

        Returns:
            (DQMTaskPolicy): policy for DQM Task
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailCount(threshold)
Beispiel #23
0
def FailTotalFixPercentPolicy(threshold):
    """Create a DQM policy that fails the DF when all fixes together are triggered >= threshold * total Records times

        Args:
            threshold (double): the threshold after which the DF fails. value is between 0.0 and 1.0

        Returns:
            (DQMPolicy): policy for DQM
    """
    # Multiplying by 1.0 turns an int threshold into a float before it
    # crosses over to the JVM.
    ratio = threshold * 1.0
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailTotalFixPercentPolicy(ratio)
Beispiel #24
0
def nGram3(c1, c2):
    """3-gram similarity UDF: (number of overlapped gramCnt)/max(s1.gramCnt, s2.gramCnt)

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): 3-gram
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    first, second = c1._jc, c2._jc
    return Column(smvfuncs.nGram3(first, second))
Beispiel #25
0
def normlevenshtein(c1, c2):
    """UDF computing the Levenshtein edit distance metric between two columns

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): distances
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    first, second = c1._jc, c2._jc
    return Column(smvfuncs.normlevenshtein(first, second))
Beispiel #26
0
def jaroWinkler(c1, c2):
    """UDF computing the Jaro-Winkler edit distance metric between two columns

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): distances
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    first, second = c1._jc, c2._jc
    return Column(smvfuncs.jaroWinkler(first, second))
Beispiel #27
0
def FailPercent(threshold):
    """Create a DQM task policy that fails the DF if the task is triggered >= r percent of the
        total number of records in the DF

        Args:
            threshold (double): the threshold after which the DF fails. value is between 0.0 and 1.0

        Returns:
            (DQMTaskPolicy): policy for DQM Task
    """
    # Multiplying by 1.0 turns an int threshold into a float before it
    # crosses over to the JVM.
    ratio = threshold * 1.0
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailPercent(ratio)
Beispiel #28
0
def diceSorensen(c1, c2):
    """2-gram similarity UDF: (2 * number of overlapped gramCnt)/(s1.gramCnt + s2.gramCnt)

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): 2-gram
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    first, second = c1._jc, c2._jc
    return Column(smvfuncs.diceSorensen(first, second))
Beispiel #29
0
def FailPercent(threshold):
    """Create a DQM task policy that fails the DF if the task is triggered >= r percent of the
        total number of records in the DF

        Args:
            threshold (double): the threshold after which the DF fails. value is between 0.0 and 1.0

        Returns:
            (DQMTaskPolicy): policy for DQM Task
    """
    # Multiplying by 1.0 turns an int threshold into a float before it
    # crosses over to the JVM.
    ratio = threshold * 1.0
    jvm = SmvApp.getInstance()._jvm
    return jvm.FailPercent(ratio)
Beispiel #30
0
def smvArrayCat(sep, col):
    """Join the elements of an array typed column into one string, using the given separator

       Args:
            sep: a Python string to separate the fields
            col: a Column with ArrayType

       Return:
            (col): a Column in StringType with array elements concatenated
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_column = col._jc
    return Column(smvfuncs.smvArrayCat(sep, java_column))
Beispiel #31
0
    def setUp(self):
        """Lazily create the class-level Spark/SMV fixtures before a test runs.

        Patch for Python 2.6 (note carried over from the original code;
        presumably needed because a class-level setup hook is unavailable
        there -- confirm). The fixtures are built only the first time, i.e.
        while the class has no ``smvApp`` attribute yet.
        """
        from smv import SmvApp

        test_cls = self.__class__
        if hasattr(test_cls, 'smvApp'):
            # Fixtures already exist; nothing to do.
            return

        test_cls.sparkSession = TestConfig.sparkSession()
        test_cls.sparkContext = TestConfig.sparkContext()
        test_cls.sparkContext.setLogLevel("ERROR")

        base_args = TestConfig.smv_args() + test_cls.smvAppInitArgs()
        data_dir_args = ['--data-dir', test_cls.tmpDataDir()]
        test_cls.smvApp = SmvApp.createInstance(
            base_args + data_dir_args, test_cls.sparkSession)
Beispiel #32
0
    def setUpClass(cls):
        """Build the class-level SMV fixture shared by the tests of this class.

        Creates the test SmvApp, appends the test resource directory to
        ``sys.path`` and finally calls ``mkTmpTestDir``.
        """
        from smv.smvapp import SmvApp
        from test_support.testconfig import TestConfig

        base_args = TestConfig.smv_args() + cls.smvAppInitArgs()
        data_dir_args = ['--data-dir', cls.tmpDataDir()]

        # The test's SmvApp must be the singleton for some tests to give
        # correct results; the original SmvApp (if any) is restored on teardown.
        # None is passed as the second createInstance argument
        # (NOTE(review): presumably "no pre-built Spark session" -- confirm).
        cls.smvApp = SmvApp.createInstance(base_args + data_dir_args, None)

        resource_dir = cls.resourceTestDir()
        sys.path.append(resource_dir)

        cls.mkTmpTestDir()
Beispiel #33
0
    def setUpClass(cls):
        """Build the class-level Spark/SMV fixtures shared by the tests of this class.

        Stores the Spark session and context obtained from TestConfig on the
        class, creates the test SmvApp with Python module hot-loading turned
        off, appends the test resource directory to ``sys.path`` and finally
        calls ``mkTmpTestDir``.
        """
        from smv.smvapp import SmvApp

        cls.sparkSession = TestConfig.sparkSession()
        cls.sparkContext = TestConfig.sparkContext()
        cls.sparkContext.setLogLevel("ERROR")

        base_args = TestConfig.smv_args() + cls.smvAppInitArgs()
        data_dir_args = ['--data-dir', cls.tmpDataDir()]

        # py_module_hotload is set to False so python files are not reloaded.
        cls.smvApp = SmvApp.createInstance(
            base_args + data_dir_args,
            cls.sparkSession,
            py_module_hotload=False,
        )

        resource_dir = cls.resourceTestDir()
        sys.path.append(resource_dir)

        cls.mkTmpTestDir()
Beispiel #34
0
    def setUpClass(cls):
        """Build the class-level SMV fixture shared by the tests of this class.

        Creates the test SmvApp, appends the test resource directory to
        ``sys.path`` and finally calls ``mkTmpTestDir``.
        """
        from smv.smvapp import SmvApp
        from test_support.testconfig import TestConfig

        base_args = TestConfig.smv_args() + cls.smvAppInitArgs()
        data_dir_args = ['--data-dir', cls.tmpDataDir()]

        # The test's SmvApp must be the singleton for some tests to give
        # correct results; the original SmvApp (if any) is restored on teardown.
        # None is passed as the second createInstance argument
        # (NOTE(review): presumably "no pre-built Spark session" -- confirm).
        cls.smvApp = SmvApp.createInstance(base_args + data_dir_args, None)

        resource_dir = cls.resourceTestDir()
        sys.path.append(resource_dir)

        cls.mkTmpTestDir()
Beispiel #35
0
def smvFirst(c, nonNull=False):
    """Variation of Spark "first" that can also return null values

        Spark's "first" returns the first non-null value, so smvFirst exists to
        return the real first value, even if it is null. Alternatively it can
        return the first non-null value.

        Args:
            c (Column): column to extract first value from
            nonNull (bool): If false, return first value even if null. If true, return first non-null value. Defaults to false.

        Returns:
            (Column): column holding the first value
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_column = c._jc
    return Column(smvfuncs.smvFirst(java_column, nonNull))
Beispiel #36
0
    def setUp(self):
        """Lazily create the class-level Spark/SMV fixtures before a test runs.

        Patch for Python 2.6 (note carried over from the original code;
        presumably needed because a class-level setup hook is unavailable
        there -- confirm). The fixtures are built only the first time, i.e.
        while the class has no ``smvApp`` attribute yet.
        """
        from smv import SmvApp

        test_cls = self.__class__
        if hasattr(test_cls, 'smvApp'):
            # Fixtures already exist; nothing to do.
            return

        test_cls.sparkContext = TestConfig.sparkContext()
        test_cls.sqlContext = TestConfig.sqlContext()
        test_cls.sparkContext.setLogLevel("ERROR")

        import random
        # Random port handed to SmvApp through the --cbs-port option.
        cbs_port = random.randint(20000, 65535)

        base_args = TestConfig.smv_args() + test_cls.smvAppInitArgs()
        port_args = ['--cbs-port', str(cbs_port)]
        test_cls.smvApp = SmvApp.createInstance(
            base_args + port_args, test_cls.sparkContext, test_cls.sqlContext)
Beispiel #37
0
    def setUpClass(cls):
        """Build the class-level Spark/SMV fixtures shared by the tests of this class.

        Stores the Spark session and context obtained from TestConfig on the
        class, creates the test SmvApp, appends the test resource directory to
        ``sys.path`` and finally calls ``mkTmpTestDir``.
        """
        # Import needs to happen during EVERY setup so that the most recently
        # reloaded SmvApp is the one in use.
        from smv.smvapp import SmvApp

        cls.sparkSession = TestConfig.sparkSession()
        cls.sparkContext = TestConfig.sparkContext()
        cls.sparkContext.setLogLevel("ERROR")

        base_args = TestConfig.smv_args() + cls.smvAppInitArgs()
        data_dir_args = ['--data-dir', cls.tmpDataDir()]

        # The test's SmvApp must be the singleton for some tests to give
        # correct results; the original SmvApp (if any) is restored on teardown.
        cls.smvApp = SmvApp.createInstance(base_args + data_dir_args, cls.sparkSession)

        resource_dir = cls.resourceTestDir()
        sys.path.append(resource_dir)

        cls.mkTmpTestDir()
Beispiel #38
0
    def setUp(self):
        """Lazily create the class-level Spark/SMV fixtures before a test runs.

        Patch for Python 2.6 (note carried over from the original code;
        presumably needed because a class-level setup hook is unavailable
        there -- confirm). The fixtures are built only the first time, i.e.
        while the class has no ``smvApp`` attribute yet.
        """
        from smv import SmvApp

        test_cls = self.__class__
        if hasattr(test_cls, 'smvApp'):
            # Fixtures already exist; nothing to do.
            return

        test_cls.sparkContext = TestConfig.sparkContext()
        test_cls.sqlContext = TestConfig.sqlContext()
        test_cls.sparkContext.setLogLevel("ERROR")

        import random
        # Random port handed to SmvApp through the --cbs-port option.
        cbs_port = random.randint(20000, 65535)

        base_args = TestConfig.smv_args() + test_cls.smvAppInitArgs()
        port_args = ['--cbs-port', str(cbs_port)]
        test_cls.smvApp = SmvApp.createInstance(
            base_args + port_args, test_cls.sparkContext, test_cls.sqlContext)
Beispiel #39
0
    def setUpClass(cls):
        """Build the class-level Spark/SMV fixtures shared by the tests of this class.

        Stores the Spark session and context obtained from TestConfig on the
        class, creates the test SmvApp with Python module hot-loading turned
        off, appends the test resource directory to ``sys.path`` and finally
        calls ``mkTmpTestDir``.
        """
        from smv.smvapp import SmvApp

        cls.sparkSession = TestConfig.sparkSession()
        cls.sparkContext = TestConfig.sparkContext()
        cls.sparkContext.setLogLevel("ERROR")

        base_args = TestConfig.smv_args() + cls.smvAppInitArgs()
        data_dir_args = ['--data-dir', cls.tmpDataDir()]

        # py_module_hotload is set to False so python files are not reloaded.
        cls.smvApp = SmvApp.createInstance(
            base_args + data_dir_args,
            cls.sparkSession,
            py_module_hotload=False,
        )

        resource_dir = cls.resourceTestDir()
        sys.path.append(resource_dir)

        cls.mkTmpTestDir()
Beispiel #40
0
def TimePanel(start, end):
    """Construct an smv.panel.TimePanel through the SmvApp JVM gateway

        A TimePanel is a consecutive range of PartialTimes, bounded by a "start"
        PartialTime and an "end" PartialTime, both inclusive.
        "start" and "end" must have the same timeType.

        Args:
            start (java object smv.PartialTime): Quarter, Month, Day etc.
            end (java object smv.PartialTime): Quarter, Month, Day etc.

        Example:

            >>> tp = TimePanel(Day(2012, 1, 1), Day(2013, 12, 31))

        Returns:
            (java object smv.panel.TimePanel)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.TimePanel(start, end)
Beispiel #41
0
def TimePanel(start, end):
    """Construct an smv.panel.TimePanel through the SmvApp JVM gateway

        A TimePanel is a consecutive range of PartialTimes, bounded by a "start"
        PartialTime and an "end" PartialTime, both inclusive.
        "start" and "end" must have the same timeType.

        Args:
            start (java object smv.PartialTime): Quarter, Month, Day etc.
            end (java object smv.PartialTime): Quarter, Month, Day etc.

        Example:

            >>> tp = TimePanel(Day(2012, 1, 1), Day(2013, 12, 31))

        Returns:
            (java object smv.panel.TimePanel)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.TimePanel(start, end)
Beispiel #42
0
def DQMRule(rule, name=None, taskPolicy=None):
    """Create a DQMRule, i.e. a requirement on the records of a DF

        Example:

            Require the sum of "a" and "b" columns less than 100

            >>> DQMRule(col('a') + col('b') < 100.0, 'a_b_sum_lt100', FailPercent(0.01))

        Args:
            rule (Column): boolean condition that defines the requirement on the records of a DF
            name (string): optional parameter for naming the DQMRule. if not specified, defaults to the rule text
            taskPolicy (DQMTaskPolicy): optional parameter for the DQM policy. if not specified, defaults to FailNone()

        Returns:
            (DQMRule): a DQMRule object
    """
    # Any falsy taskPolicy (including the default None) falls back to FailNone().
    if not taskPolicy:
        taskPolicy = FailNone()
    jvm = SmvApp.getInstance()._jvm
    return jvm.DQMRule(rule._jc, name, taskPolicy)
Beispiel #43
0
def DQMRule(rule, name=None, taskPolicy=None):
    """Create a DQMRule, i.e. a requirement on the records of a DF

        Example:

            Require the sum of "a" and "b" columns less than 100

            >>> DQMRule(col('a') + col('b') < 100.0, 'a_b_sum_lt100', FailPercent(0.01))

        Args:
            rule (Column): boolean condition that defines the requirement on the records of a DF
            name (string): optional parameter for naming the DQMRule. if not specified, defaults to the rule text
            taskPolicy (DQMTaskPolicy): optional parameter for the DQM policy. if not specified, defaults to FailNone()

        Returns:
            (DQMRule): a DQMRule object
    """
    # Any falsy taskPolicy (including the default None) falls back to FailNone().
    if not taskPolicy:
        taskPolicy = FailNone()
    jvm = SmvApp.getInstance()._jvm
    return jvm.DQMRule(rule._jc, name, taskPolicy)
Beispiel #44
0
def smvHashKey(head, *others):
    """Create an MD5 hash over the concatenated columns.
    Returns "Prefix" + MD5 Hex string (a size 32 string) as the unique key

    The collision rate of MD5 on real data records can be ignored, based on the following discussion.

    https://marc-stevens.nl/research/md5-1block-collision/
    The shortest messages sharing the same MD5 are the 512-bit (64-byte) messages below

    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa200a8284bf36e8e4b55b35f427593d849676da0d1555d8360fb5f07fea2
    and the (different by two bits)
    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa202a8284bf36e8e4b55b35f427593d849676da0d1d55d8360fb5f07fea2
    both have MD5 hash
    008ee33a9d58b51cfeb425b0959121c9

    Other such pairs exist, but all of them are carefully constructed.
    Theoretically, random collisions will happen on data sizes approaching 2^64 (since MD5 has
    128 bits), which is much larger than the number of records we deal with (a billion is about 2^30).
    Therefore using MD5 to hash primary key columns is good enough for creating a unique key

    Two call forms are accepted:
    - smvHashKey(prefix, col1, col2, ...)
    - smvHashKey(col1, col2, ...)

    Args:
     prefix (String): return string's prefix (optional first argument; an empty
         string is used when the first argument is a Column)
     col. (Column): columns to be part of hash

    Return:
     (col): a StringType column as Prefix + MD5 Hex string

    Raises:
     RuntimeError: if the first argument is neither a string nor a Column
    """
    head_is_prefix = is_string(head)
    if not head_is_prefix and not isinstance(head, Column):
        raise RuntimeError("first parameter must be either a String or a Column")

    if head_is_prefix:
        prefix = head
        columns = list(others)
    else:
        # No explicit prefix: `head` is itself the first column.
        prefix = ""
        columns = [head] + list(others)

    smv_app = SmvApp.getInstance()
    helper = smv_app._jvm.org.tresamigos.smv.python.SmvPythonHelper
    java_columns = smv_copy_array(smv_app.sc, *columns)
    return Column(helper.smvHashKey(prefix, java_columns))
Beispiel #45
0
def DQMFix(condition, fix, name=None, taskPolicy=None):
    """Create a DQMFix, which fixes a column with a default value

        Example:

            If "age" greater than 100, make it 100

            >>> DQMFix(col('age') > 100, lit(100).alias('age'), 'age_cap100', FailNone())

        Args:
            condition (Column): boolean condition that determines when the fix should occur on the records of a DF
            fix (Column): the fix to use when replacing a value that does not pass the condition
            name (String): optional parameter for naming the DQMFix. if not specified, defaults to the condition text
            taskPolicy (DQMTaskPolicy): optional parameter for the DQM policy. if not specified, defaults to FailNone()

        Returns:
            (DQMFix): a DQMFix object
    """
    # Any falsy taskPolicy (including the default None) falls back to FailNone().
    if not taskPolicy:
        taskPolicy = FailNone()
    jvm = SmvApp.getInstance()._jvm
    return jvm.DQMFix(condition._jc, fix._jc, name, taskPolicy)
Beispiel #46
0
    def setUpClass(cls):
        """Build the class-level Spark/SMV fixtures shared by the tests of this class.

        Stores the Spark and SQL contexts obtained from TestConfig on the class,
        creates the test SmvApp, appends the test resource directory to
        ``sys.path`` and finally calls ``mkTmpTestDir``.
        """
        # Import needs to happen during EVERY setup so that the most recently
        # reloaded SmvApp is the one in use.
        from smv.smvapp import SmvApp
        import random

        cls.sparkContext = TestConfig.sparkContext()
        cls.sqlContext = TestConfig.sqlContext()
        cls.sparkContext.setLogLevel("ERROR")

        # Random port handed to SmvApp through the --cbs-port option.
        cbs_port = random.randint(20000, 65535)

        base_args = TestConfig.smv_args() + cls.smvAppInitArgs()
        option_args = ['--cbs-port', str(cbs_port), '--data-dir', cls.tmpDataDir()]

        # The test's SmvApp must be the singleton for some tests to give
        # correct results; the original SmvApp (if any) is restored on teardown.
        cls.smvApp = SmvApp.createInstance(
            base_args + option_args, cls.sparkContext, cls.sqlContext)

        resource_dir = cls.resourceTestDir()
        sys.path.append(resource_dir)

        cls.mkTmpTestDir()
Beispiel #47
0
def DQMFix(condition, fix, name=None, taskPolicy=None):
    """Create a DQMFix, which fixes a column with a default value

        Example:

            If "age" greater than 100, make it 100

            >>> DQMFix(col('age') > 100, lit(100).alias('age'), 'age_cap100', FailNone())

        Args:
            condition (Column): boolean condition that determines when the fix should occur on the records of a DF
            fix (Column): the fix to use when replacing a value that does not pass the condition
            name (String): optional parameter for naming the DQMFix. if not specified, defaults to the condition text
            taskPolicy (DQMTaskPolicy): optional parameter for the DQM policy. if not specified, defaults to FailNone()

        Returns:
            (DQMFix): a DQMFix object
    """
    # Any falsy taskPolicy (including the default None) falls back to FailNone().
    if not taskPolicy:
        taskPolicy = FailNone()
    jvm = SmvApp.getInstance()._jvm
    return jvm.DQMFix(condition._jc, fix._jc, name, taskPolicy)
Beispiel #48
0
def Week(year, month, day, start_on="Monday"):
    """Construct an smv.panel.Week through the SmvApp JVM gateway

        Week is a subclass of the smv.panel.PartialTime base class.

        Args:
            year (int): the year
            month (int): the month
            day (int): the day of the month
            start_on (str): day the week starts on; valid values: Monday, Tuesday, Wednesday,
                Thursday, Friday, Saturday, Sunday. Default value is Monday

        Example:
            >>> w = Week(2012, 3, 4)
            >>> w.smvTime()
            u'W20120227'
            >>> w.timeIndex()
            2200
            >>> w.timeLabel()
            u'Week of 2012-02-27'
            >>> w.timeType()
            u'week'

            >>> w = Week(2012, 3, 4, "Sunday")
            >>> w.timeType()
            u'week_start_on_Sunday'
            >>> w.smvTime()
            u'W(7)20120304'
            >>> w.timeIndex()
            2201
            >>> w.timeLabel()
            u'Week of 2012-03-04'

        Returns:
            (java object smv.panel.Week)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.Week(year, month, day, start_on)
Beispiel #49
0
def Week(year, month, day, start_on="Monday"):
    """Construct an smv.panel.Week through the SmvApp JVM gateway

        Week is a subclass of the smv.panel.PartialTime base class.

        Args:
            year (int): the year
            month (int): the month
            day (int): the day of the month
            start_on (str): day the week starts on; valid values: Monday, Tuesday, Wednesday,
                Thursday, Friday, Saturday, Sunday. Default value is Monday

        Example:
            >>> w = Week(2012, 3, 4)
            >>> w.smvTime()
            u'W20120227'
            >>> w.timeIndex()
            2200
            >>> w.timeLabel()
            u'Week of 2012-02-27'
            >>> w.timeType()
            u'week'

            >>> w = Week(2012, 3, 4, "Sunday")
            >>> w.timeType()
            u'week_start_on_Sunday'
            >>> w.smvTime()
            u'W(7)20120304'
            >>> w.timeIndex()
            2201
            >>> w.timeLabel()
            u'Week of 2012-03-04'

        Returns:
            (java object smv.panel.Week)
    """
    jvm = SmvApp.getInstance()._jvm
    return jvm.Week(year, month, day, start_on)
Beispiel #50
0
 def _smvGetRunConfigHash(self):
     """Return the app-level hash of all the current user config values."""
     py_client = SmvApp.getInstance().j_smvPyClient
     return py_client.getRunConfigHash()
Beispiel #51
0
 def smvGetRunConfig(self, key):
     """Return the current user run configuration value stored under `key`."""
     py_client = SmvApp.getInstance().j_smvPyClient
     return py_client.getRunConfig(key)