Example #1
    def test_collision_same_words(self):
        """
        The words are all the same
        """

        sc = ScManager.get()

        sax_result = SaxResult(paa=sc.parallelize([]),
                               breakpoints=[],
                               sax_word='abcdabcdabcdabcd')
        sax, _, _ = sax_result.start_sax(4, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=6,
                                           index_selected=2,
                                           word_len=sequences_size,
                                           spark_ctx=sc)

        result = result.data

        # exactly the same words => six cells holding the maximum number of collisions (6)
        nb_cell = 0
        for i in result:
            if i[0] == 6:
                nb_cell += 1
        self.assertEqual(nb_cell, 6)
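
The expected six cells follow from simple combinatorics: with 4 identical words there are C(4, 2) = 6 word pairs, and each pair collides on every one of the 6 iterations. A quick check of that count (a standalone sketch, not part of the tested API):

from scipy.special import binom

# 4 identical words -> C(4, 2) = 6 distinct word pairs, each reaching the
# maximum of 6 collisions (one per iteration)
print(int(binom(4, 2)))  # 6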
Example #2
    def test_sliding_window_sax_basic(self):
        """
        Test the nominal case
        """
        sax_info = ConfigSax(paa=3,
                             sequences_size=6,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[0.1, 0.9],
                             alphabet_size=3)

        spark_ctx = ScManager.get()
        result, _ = sliding_windows(ts_list=["linear_time_serie"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)

        sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

        # recovery = 0.5 and word_size = 3 => sax_result = 'aab abc bcc'
        self.assertEqual(sax_result.sax_word, 'aababcbcc')
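
For reference, the windowing implied by the comment above (12 points, sequences_size=6, recovery=0.5 => 3 overlapping sequences) can be reproduced with a minimal standalone sketch; the exact step computation inside sliding_windows may differ, so treat this as an illustration only:

import numpy as np

def overlapping_windows(values, window_size, recovery):
    # recovery is the overlap ratio: 0 -> disjoint windows, 0.5 -> half-overlapping windows
    step = max(1, int(round(window_size * (1 - recovery))))
    return [values[i:i + window_size]
            for i in range(0, len(values) - window_size + 1, step)]

# 12 points, windows of 6 with 50% recovery -> windows starting at 0, 3 and 6
print(overlapping_windows(np.arange(12), 6, 0.5))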
Example #3
    def test_sw_sax_limit_constant(self):
        """
        Test sliding window and SAX on a constant timeseries containing two larger values
        """
        sax_info = ConfigSax(paa=10,
                             sequences_size=10,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[0.1, 0.9],
                             alphabet_size=5)

        spark_ctx = ScManager.get()

        result, _ = sliding_windows(ts_list=["specific_time_serie"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)

        print("result={}".format(result.collect()))

        sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

        print("sax_word={}".format(sax_result.sax_word))
        # PAA_value = 0 => 'c'
        # PAA_value = 10 => 'e' or 'd'
        # PAA_value = -10 => 'a' or 'b'
        self.assertTrue(sax_result.sax_word == 'ccccccccae'
                        or sax_result.sax_word == 'ccccccccbd')
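
The letter expectations in the comments ('c' for a PAA value of 0, 'e' or 'd' for 10, 'a' or 'b' for -10) come from binning each PAA value against the SAX breakpoints. A minimal sketch of that mapping, assuming approximate Gaussian breakpoints for a 5-letter alphabet (the real breakpoints are computed by run_sax_on_sequences):

import string
import numpy as np

def paa_to_letters(paa_values, breakpoints):
    # each PAA value falls into one of len(breakpoints) + 1 bins; bin index -> letter
    indices = np.searchsorted(breakpoints, paa_values)
    return ''.join(string.ascii_lowercase[i] for i in indices)

# assumed breakpoints for alphabet_size=5 (approximate Gaussian quantiles)
print(paa_to_letters([0, 10, -10], [-0.84, -0.25, 0.25, 0.84]))  # 'cea'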
Example #4
    def test_coll_various_words(self):
        """
        Test the collision matrix for same and different words
        The words 0 and 3 are the same, the words 1 and 2 too
        """

        nb_paa = 5
        nb_index = 2
        sc = ScManager.get()
        sax_result = SaxResult(paa=sc.parallelize([]),
                               breakpoints=[],
                               sax_word=''.join(
                                   ['ababa', 'cdcdc', 'cdcdc', 'ababa']))

        sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=int(
                                               binom(nb_paa, nb_index)),
                                           index_selected=nb_index,
                                           word_len=sequences_size,
                                           spark_ctx=sc)
        result = result.data
        result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]),
                                                    int(x[1][1])))
        print(result)
        # the maximum of possible combinations without repetitions is 10
        # two cells of 10 : one for the occurrences between the words 1 and 2, and another for the words 0 and 3
        for i in range(2):
            self.assertTrue(result[i][0] == 10)
        self.assertTrue(
            int(result[0][1][0]) == 2 and int(result[0][1][1]) == 1)
        self.assertTrue(
            int(result[1][1][0]) == 3 and int(result[1][1][1]) == 0)
Example #5
    def _apply_motif_iter_zero_coll(self, activate_spark):
        """
        Test
         - with the iterative method to search the neighborhood motif,
         - with/without spark jobs
         - and where the words are all different => no collisions
        """
        spark_context = ScManager.get()
        # Build the SAX result with different words, and small breakpoints
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-0.3, -0.1, 0.1, 0.3],
                               sax_word='abcdebcdeacdeabdeabceabcd')
        sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        breakpoint = sax_result.build_mindist_lookup_table(nb_seq)

        # Different words => only zero cells in the collision matrix
        collision_matrix = SparseMatrix(np.zeros((nb_seq, nb_seq)))

        # Build the class for motif search
        search_info = NeighborhoodSearch(size_sequence=20,
                                         mindist_lookup_table=breakpoint,
                                         alphabet_size=5,
                                         sax=np.transpose(sax),
                                         radius=1000,
                                         collision_matrix=collision_matrix)

        recognition_info = ConfigRecognition(
            is_stopped_by_eq9=True,
            iterations=100,
            min_value=1,
            is_algo_method_global=False,
            activate_spark=activate_spark,
            radius=1000,
            neighborhood_method=OPT_USING_BRUTE_FORCE)

        # neighborhood_method=OPT_USING_BRUTE_FORCE
        result = search_info.motif_neighborhood_iterative(30, recognition_info)

        # There are no similar sequences
        self.assertEqual(len(result), 0)

        # neighborhood_method=OPT_USING_COLLISIONS
        recognition_info.neighborhood_method = OPT_USING_COLLISIONS
        result = search_info.motif_neighborhood_iterative(30, recognition_info)

        # There are no similar sequences
        self.assertEqual(len(result), 0)
Example #6
    def __init__(self, tdm, ts_load_split_size=10):
        """
        init the spark distance class

        :param tdm: the temporal data manager client
        :type tdm: TemporalDataMgr

        :param ts_load_split_size: size of TS packet to load from TDM
        :type ts_load_split_size: int

        """

        self.tdm = tdm
        self.ts_load_split_size = ts_load_split_size
        self.spark_context = ScManager.get()

        self.logger = logging.getLogger(__name__)
Example #7
    def _run_all_in_master_memory(self, method):
        """
        Run the Spark Pearson correlation by loading all the TS content (i.e. values) into master memory

        Each coefficient will be computed by a worker (Spark decides the best choice to apply)
        """

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Get TS content
        rdd_content = self._get_ts(spark_context)

        # Job distribution is made by Statistics.corr (Spark correlation matrix calculation)
        self.results = Statistics.corr(rdd_content, method=method)

        ScManager.stop()
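
Outside the IKATS wrapper, the underlying Spark call is pyspark.mllib.stat.Statistics.corr, which computes the full pairwise correlation matrix from an RDD of observation vectors. A minimal standalone usage sketch (the _get_ts helper above is IKATS-specific and not reproduced here):

from pyspark import SparkContext
from pyspark.mllib.stat import Statistics

sc = SparkContext.getOrCreate()

# each RDD element is one observation vector; columns are the variables to correlate
rdd = sc.parallelize([[1.0, 2.0, 3.1],
                      [2.0, 4.1, 6.0],
                      [3.0, 5.9, 9.2]])

# returns a matrix of pairwise Pearson coefficients
print(Statistics.corr(rdd, method="pearson"))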
Example #8
    def test_coll_near_same_words(self):
        """
        The words share 1, 2, 3 or 4 letter positions, but they are not exactly the same because each word has
        five letters
        """
        nb_paa = 5
        nb_index = 2
        sc = ScManager.get()
        sax_result = SaxResult(
            paa=sc.parallelize([]),
            breakpoints=[],
            sax_word=''.join(['aaaaa', 'abbbb', 'abccc', 'abcdd', 'abcde']))

        sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=int(
                                               binom(nb_paa, nb_index)),
                                           index_selected=nb_index,
                                           word_len=sequences_size,
                                           spark_ctx=sc)

        # sorted result list
        result = result.data
        result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]),
                                                    int(x[1][1])))
        print(result)

        # sorted list expected:
        expected_result = [(1.0, (2, 1)), (1.0, (3, 1)), (3.0, (3, 2)),
                           (1.0, (4, 1)), (3.0, (4, 2)), (6.0, (4, 3))]
        expected_result.sort(key=lambda x: "{}-{}-{}".format(
            int(x[0]), int(x[1][0]), int(x[1][1])))

        self.assertEqual(len(result), len(expected_result))
        for expected_item, res_item in zip(expected_result, result):
            self.assertEqual(expected_item[0], res_item[0], 'nb collisions')
            self.assertEqual(expected_item[1][0], res_item[1][0],
                             'seq index left-side')
            self.assertEqual(expected_item[1][1], res_item[1][1],
                             'seq index right-side')
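
The expected counts follow from the number of letter positions two words share: when every combination of nb_index=2 positions is drawn, a pair of words agreeing on k positions scores C(k, 2). A small combinatorial sketch of that reasoning (the IKATS implementation samples the combinations through Spark; only the counting idea is shown):

from itertools import combinations

def collisions(word_a, word_b, nb_index=2):
    # count the index combinations on which both words carry the same letters
    return sum(1 for idx in combinations(range(len(word_a)), nb_index)
               if all(word_a[i] == word_b[i] for i in idx))

print(collisions('abcde', 'abcdd'))  # 4 shared positions -> C(4, 2) = 6
print(collisions('abccc', 'abbbb'))  # 2 shared positions -> C(2, 2) = 1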
Example #9
    def test_sliding_window_filter(self):
        """
        Testing linear filter.
        """
        sax_info = ConfigSax(paa=3,
                             sequences_size=6,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=True,
                             recovery=0.5,
                             coefficients=[1, 0.5],
                             alphabet_size=6)

        spark_ctx = ScManager.get()
        # Test for linear sequences
        result, _ = sliding_windows(ts_list=["linear_time_serie"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)

        result = result.collect()
        # all sequences are linear => no sequence is kept
        self.assertEqual(len(result), 0)

        # Test for constant sequences without recovery (recovery = 0 => no overlap between sequences)
        sax_info.coefficients = [0, 1]
        sax_info.recovery = 0
        result, _ = sliding_windows(ts_list=["ts_with_constant_pattern"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)
        result = result.collect()
        LOGGER.info("result=%s", result)
        LOGGER.info("ts_init=%s", get_ts_mock("ts_with_constant_pattern"))
        # Sequence of 12 pts, recovery = 0 (no recovery) -> 2 sequences
        self.assertEqual(len(result), 2)
Example #10
    def test_collision_different_words(self):
        """
        The words are all different
        """
        nb_paa = 5
        nb_index = 2
        sc = ScManager.get()
        sax_result = SaxResult(
            paa=sc.parallelize([]),
            breakpoints=[],
            sax_word=''.join(['abcde', 'fghij', 'klmno', 'pqrst', 'uvwxy']))

        sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=int(
                                               binom(nb_paa, nb_index)),
                                           index_selected=nb_index,
                                           word_len=sequences_size,
                                           spark_ctx=sc)
        result = result.data

        # different words => only zero cells in the matrix
        self.assertEqual(len(result), 0)
Example #11
    def _apply_motif_global_coll_ex1(self, activate_spark):
        """
        Test
          - with the global method to search the neighborhood motif,
          - with/without spark according to activate_spark
          - exploring similarities with collisions heuristic
          - with input: the words differ by only one letter, and every sequence
            Si has collisions with Sj in that matrix.

         Note: results ought to be equal to test_global_brute_no_spark_ex1
        """

        # Build the SAX result where the words have only one different letter (words: 5 letters)
        sequences = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"]
        tested_sax_word = ''.join(sequences)
        spark_context = ScManager.get()
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-1.1, -1, 0, 1.501],
                               sax_word=tested_sax_word)
        sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        breakpoint = sax_result.build_mindist_lookup_table(5)

        # Build a collision matrix (the real collision matrix is different, but we take this one for the test)
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0, 0],
                      [30, 0, 0, 0, 0],
                      [2, 40, 0, 0, 0],
                      [4, 8, 50, 0, 0],
                      [6, 10, 20, 60, 0]]))

        self._print_matrix("test_global_coll_no_spark_ex1",
                           collision_matrix.data, nb_seq)

        # mindist distances:
        # [[ 0.     0.     3.002  5.002  5.202]
        #  [ 0.     0.     0.     2.     2.2  ]
        #  [ 3.002  0.     0.     0.     0.2  ]
        #  [ 5.002  2.     0.     0.     0.   ]
        #  [ 5.202  2.2    0.2    0.     0.   ]]

        # Using neighborhood_method=OPT_USING_COLLISIONS
        #
        #  for collisions (0,1) (1,2) (2,3) (3,4) greater than min_value==25
        #  and with the collisions heuristic: only sequences having collisions with Si or Sj are examined
        #
        # for radius 1.9  => global result is [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]
        #
        # for radius 2.5  => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
        #                                      => reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #
        # for radius 3.5  => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #                                      => reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #
        # for radius 6    => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]
        #                                      => reduced to [[0, 1, 2, 3, 4]]
        #
        for radius, expected_res in [
                [2.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]],
                [1.9, [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]],
                [3.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]],
                [6, [[0, 1, 2, 3, 4]]]]:

            # Build the class for motif search where the min_value is 25
            search_info = NeighborhoodSearch(size_sequence=20,
                                             mindist_lookup_table=breakpoint,
                                             alphabet_size=5,
                                             sax=np.transpose(sax),
                                             radius=radius,
                                             collision_matrix=collision_matrix)

            # for info: here is the mindist:
            #  (see _print_mindist_mat doc: in order to activate print)
            self._print_mindist_mat(search_info)

            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=0,
                min_value=25,
                is_algo_method_global=True,
                activate_spark=activate_spark,
                radius=radius,
                neighborhood_method=OPT_USING_COLLISIONS)

            print("radius {}:expected:                 {}".format(
                radius, expected_res))
            result = search_info.motif_neighborhood_global(
                recognition_info.min_value, recognition_info)

            print("radius {}:->global with collisions: {}".format(
                radius, result))

            self.assertEqual(len(result), len(expected_res))
            for group in result:
                self.assertTrue(group in expected_res)
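
The mindist matrix quoted in the comments can be reproduced with the standard SAX MINDIST lower bound, using the breakpoints passed to SaxResult above. This is a standalone sketch of that formula; NeighborhoodSearch precomputes an equivalent lookup table rather than calling a function like this:

import numpy as np

def mindist(word_a, word_b, breakpoints, size_sequence):
    # distance between two letters is 0 when they are identical or adjacent,
    # otherwise the gap between the enclosing breakpoints
    def cell(r, c):
        return 0.0 if abs(r - c) <= 1 else breakpoints[max(r, c) - 1] - breakpoints[min(r, c)]

    idx_a = [ord(ch) - ord('a') for ch in word_a]
    idx_b = [ord(ch) - ord('a') for ch in word_b]
    return np.sqrt(size_sequence / len(word_a)) * np.sqrt(
        sum(cell(r, c) ** 2 for r, c in zip(idx_a, idx_b)))

breakpoints = [-1.1, -1, 0, 1.501]
print(mindist('abcde', 'abcdd', breakpoints, 20))  # 0.0
print(mindist('abcde', 'abcdc', breakpoints, 20))  # ~3.002, as in the matrix above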
Example #12
def run_paa_from_ts_list(tdm,
                         ts_list,
                         paa_size,
                         out_ts=True,
                         save=False,
                         activate_spark=None):
    """
    Compute the Piecewise Aggregation Approximation (PAA) on the **ts_list** provided
    Use spark if necessary

    :param tdm: temporal data manager object
    :type tdm: TemporalDataMgr

    :param ts_list: tsuid list of the TS to calculate the PAA timeseries
    :type ts_list: list

    :param paa_size: number of segments
    :type paa_size: int

    :param out_ts: True means the result will be a TS, False will return only the means
    :type out_ts: bool

    :param save: True means the new TS will be saved in addition of the return
    :type save: bool

    :param activate_spark: True to force spark, False to force local, None to let the algorithm decide
    :type activate_spark: bool or None

    :return: a dict (keyed by tsuid) giving the new TS resulting from the PAA approximation, or the list of means (with len = paa_size)
    :rtype: dict
    """

    results = {}

    # Define if spark is necessary
    if activate_spark is None:

        md = tdm.get_meta_data(ts_list)
        sum_points = 0
        for tsuid in md:
            if 'qual_nb_points' in md[tsuid]:
                sum_points += float(md[tsuid]['qual_nb_points'])
            else:
                # No information on number of points, consider using spark
                sum_points = 0
                break
        spark_nb_points_trigger = 1E5
        if sum_points == 0 or sum_points / len(
                ts_list) > spark_nb_points_trigger:
            # Spark is active if the average number of points per TS is greater than spark_nb_points_trigger points
            activate_spark = True

    if activate_spark:
        LOGGER.info("Running PAA using Spark")

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Build the RDD with TSUIDS
        rdd = spark_context.parallelize(ts_list)

        # Create a broadcast for spark jobs
        broadcast = spark_context.broadcast({
            "host": tdm.host,
            "port": tdm.port,
            "paa_size": paa_size,
            "out_ts": out_ts,
            "save": save,
        })

        # Create an accumulator to store the results of the spark workers
        accumulator = spark_context.accumulator(dict(), ListAccumulatorParam())

        def run_paa_spark(working_tsuid):
            """
            Method called by spark job

            :param working_tsuid: rdd item
            """

            spark_tdm = TemporalDataMgr(host=broadcast.value['host'],
                                        port=broadcast.value['port'])

            # noinspection PyBroadException
            try:
                results = run_paa_from_tsuid(
                    tdm=spark_tdm,
                    tsuid=working_tsuid,
                    paa_size=broadcast.value['paa_size'],
                    out_ts=broadcast.value['out_ts'],
                    save=broadcast.value['save'])[:]
            except Exception:
                results = []

            accumulator.add({working_tsuid: results})

        # Get TS content using spark distribution to increase performance
        # noinspection PyBroadException
        try:
            rdd.foreach(run_paa_spark)
        except Exception:
            LOGGER.warning(
                'Something went wrong with Spark, using local computation')
            activate_spark = False

        for ts in ts_list:
            if ts in accumulator.value:
                results[ts] = accumulator.value[ts]
            else:
                LOGGER.warning(
                    "TS %s has encountered an issue during the spark distribution",
                    ts)

        ScManager.stop()

    if not activate_spark:
        LOGGER.info("Running PAA on single instance")
        for ts in ts_list:
            results[ts] = run_paa_from_tsuid(tdm=tdm,
                                             tsuid=ts,
                                             paa_size=paa_size,
                                             out_ts=out_ts,
                                             save=save)

    return results
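
run_paa_from_tsuid is not shown in this example; at its core, PAA reduces a sequence to paa_size segment means. A simplified standalone sketch of that reduction (the IKATS version also handles timestamps and optional saving):

import numpy as np

def paa_means(values, paa_size):
    # Piecewise Aggregate Approximation: mean of each of paa_size (near-)equal segments
    segments = np.array_split(np.asarray(values, dtype=float), paa_size)
    return [segment.mean() for segment in segments]

# a 12-point series reduced to 4 segment means
print(paa_means([1, 1, 1, 4, 4, 4, -2, -2, -2, 1, 1, 1], 4))  # [1.0, 4.0, -2.0, 1.0]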
Example #13
    def test_sax(self):
        """
        Test without PAA computation (4 PAA segments for 4 points per sequence) where the PAA values are equidistant
        """
        sax_info = ConfigSax(paa=4,
                             sequences_size=4,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[0.1, 0.9],
                             alphabet_size=4)
        spark_ctx = ScManager.get()
        result, _ = sliding_windows(
            ts_list=["simple_sequences_ts0", "simple_sequences_ts1"],
            sax_info=sax_info,
            spark_ctx=spark_ctx)

        LOGGER.info("sliding_windows done!")

        sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

        result = result.collect()
        LOGGER.info("sax_result=%s", sax_result)
        LOGGER.info("result=%s", result)

        # the PAA : [[4, 4, 0, 2], [-2, 2, -2, 0]]
        self.assertEqual(sax_result.paa.collect(), [4, 4, 0, 2, -2, 2, -2, 0])
        # the result expected : 'ddbc acab'
        self.assertEqual(sax_result.sax_word, 'ddbcacab')

        # Test with PAA computation
        sax_info = ConfigSax(paa=4,
                             sequences_size=12,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[0.1, 0.9],
                             alphabet_size=4)

        result, _ = sliding_windows(
            ts_list=["sequences_1_ts0", "sequences_1_ts1"],
            sax_info=sax_info,
            spark_ctx=spark_ctx)

        sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

        # the PAA : [[1, 4, -2, 1], [4, -2, -3, -3]]
        self.assertEqual(sax_result.paa.collect(),
                         [1, 4, -2, 1, 4, -2, -3, -3])
        # the result expected : 'cdbc dbaa'
        self.assertEqual(sax_result.sax_word, 'cdbcdbaa')
Example #14
def run_sax_from_ts_list(ts_list,
                         alphabet_size,
                         word_size,
                         normalize=False,
                         activate_spark=None):
    """
    Perform the Symbolic Aggregate Approximation (SAX) on the TSUID list provided in **ts_list**

    Use spark if necessary

    .. note::
        If Spark fails, the local computation will be performed

    :param ts_list: tsuid list of the TS on which to compute SAX
    :type ts_list: list

    :param alphabet_size: number of characters in result word
    :type alphabet_size: int

    :param word_size: number of segments
    :type word_size: int

    :param activate_spark: True to force spark, False to force local, None to let the algorithm decide
    :type activate_spark: bool or None

    :param normalize: Apply the normalization of the TS if True (False:default)
    :type normalize: bool

    :return: a dict (keyed by tsuid) composed of the PAA result, the SAX breakpoints, the SAX string and the points for each TSUID
    :rtype: dict
    """

    results = {}

    # Define if spark is necessary
    if activate_spark is None:

        md = IkatsApi.md.read(ts_list=ts_list)
        sum_points = 0
        for tsuid in md:
            if 'qual_nb_points' in md[tsuid]:
                sum_points += float(md[tsuid]['qual_nb_points'])
            else:
                # No information on number of points, consider using spark
                sum_points = 0
                break
        spark_nb_points_trigger = 1E5
        if sum_points == 0 or sum_points / len(
                ts_list) > spark_nb_points_trigger:
            # Spark is active if the average number of points per TS is greater than spark_nb_points_trigger points
            activate_spark = True

    if activate_spark:
        LOGGER.info("Running SAX using Spark")

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Build the RDD with TSUIDS
        rdd = spark_context.parallelize(ts_list)

        # Create a broadcast for spark jobs
        broadcast = spark_context.broadcast({
            "alphabet_size": alphabet_size,
            "word_size": word_size,
            "normalize": normalize,
        })

        # Create an accumulator to store the results of the spark workers
        accumulator = spark_context.accumulator(dict(), ListAccumulatorParam())

        def run_sax_spark(working_tsuid):
            """
            Method called by spark job

            :param working_tsuid: rdd item
            """

            results = run_sax_from_tsuid(
                tsuid=working_tsuid,
                alphabet_size=broadcast.value['alphabet_size'],
                word_size=broadcast.value['word_size'],
                normalize=broadcast.value['normalize'])

            accumulator.add({working_tsuid: results})

        # Get TS content using spark distribution to increase performance
        # noinspection PyBroadException
        try:
            rdd.foreach(run_sax_spark)
        except Exception:
            LOGGER.warning(
                'Something went wrong with Spark, using local computation')
            activate_spark = False

        for ts in ts_list:
            if ts in accumulator.value:
                results[ts] = accumulator.value[ts]
            else:
                LOGGER.warning(
                    "TS %s has encountered an issue during the spark distribution",
                    ts)

        ScManager.stop()

    if not activate_spark:
        LOGGER.info("Running SAX on single instance")

        for ts in ts_list:
            results[ts] = run_sax_from_tsuid(tsuid=ts,
                                             alphabet_size=alphabet_size,
                                             word_size=word_size,
                                             normalize=normalize)

            # print("TS=%s\nnorm=%s\nr=%s\n\n"%(ts,normalize,results[ts]['sax_breakpoints'][0]))

    return results
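
The ListAccumulatorParam used above is an IKATS helper whose code is not shown; the general pattern is a custom Spark AccumulatorParam that merges per-worker dicts into one result. A plausible minimal equivalent (an assumption about its behaviour, not the actual IKATS class):

from pyspark import SparkContext
from pyspark.accumulators import AccumulatorParam

class DictAccumulatorParam(AccumulatorParam):
    """Merge dicts produced on the workers into a single dict on the driver."""

    def zero(self, initial_value):
        return dict(initial_value)

    def addInPlace(self, value1, value2):
        value1.update(value2)
        return value1

sc = SparkContext.getOrCreate()
accumulator = sc.accumulator(dict(), DictAccumulatorParam())
sc.parallelize(["tsuid_a", "tsuid_b"]).foreach(lambda tsuid: accumulator.add({tsuid: len(tsuid)}))
print(accumulator.value)  # {'tsuid_a': 7, 'tsuid_b': 7}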
Example #15
    def test_sliding_window_norm(self):
        """
        Testing global and local norm.
        """
        epsilon = 1.0e-10
        # recovery = 0 (no overlap) -> 3 seq of 4 points (nb_points = 12)
        sax_info = ConfigSax(paa=3,
                             sequences_size=4,
                             with_mean=True,
                             with_std=True,
                             global_norm=True,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0,
                             coefficients=[0.1, 1],
                             alphabet_size=6)

        spark_ctx = ScManager.get()
        # Test with global normalization : the timeseries is normalized
        result, coeff = sliding_windows(ts_list=["linear_time_serie"],
                                        sax_info=sax_info,
                                        spark_ctx=spark_ctx)

        result = result.collect()
        coeff = coeff.collect()
        # Check coeff : coeff is the mean and variance of each sequence

        # 12 points no recovery (recovery=0) -> 3 seq of 4 points
        self.assertEqual(len(coeff), 3)

        # ts_value is an array with the sequences values
        ts_value = np.array([])
        for i, _ in enumerate(result):
            # result[i] = (key, list([timestamps, values],[,],...))
            ts_value = np.concatenate((result[i][1][:, 1], ts_value))

        LOGGER.info("result=%s", result)
        # no recovery => 3 seq * 4 points = 12 values = nb_points
        self.assertEqual(len(ts_value), 12)

        LOGGER.info("ts_std=%s", (ts_value.std()))
        LOGGER.info("ts_mean=%s", np.mean(ts_value))
        # global normalisation => ts_value has a standard deviation of 1 and a mean of 0
        self.assertTrue(1 - epsilon < np.std(ts_value) < 1 + epsilon)
        self.assertTrue(-epsilon < np.mean(ts_value) < epsilon)

        # Test with local normalization : all the sequences are normalized
        sax_info.global_norm = False
        sax_info.local_norm = True
        sax_info.linear_filter = True

        # Recovery = 1 : maximum recovery
        sax_info.recovery = 1
        result, coeff = sliding_windows(ts_list=["ts_with_constant_pattern"],
                                        sax_info=sax_info,
                                        spark_ctx=spark_ctx)
        result = result.collect()

        # Verify that each sequence is normalized
        for i, _ in enumerate(result):
            # result[i] = (key, list([timestamps, values],[,],...))
            seq_value = result[i][1][:, 1]
            self.assertTrue(1 - epsilon < np.std(seq_value) < 1 + epsilon)
            self.assertTrue(-epsilon < np.mean(seq_value) < epsilon)
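
The normalisation asserted here is a plain z-normalisation (zero mean, unit standard deviation), applied to the whole series for global_norm and per sequence for local_norm. A minimal sketch of that transform, independent of sliding_windows:

import numpy as np

def znorm(values, epsilon=1.0e-10):
    # zero-mean, unit-standard-deviation normalisation; constant inputs are only centred
    values = np.asarray(values, dtype=float)
    centred = values - values.mean()
    std = values.std()
    return centred / std if std > epsilon else centred

normalised = znorm([1, 2, 3, 4])
print(np.mean(normalised), np.std(normalised))  # ~0.0 and ~1.0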
Example #16
def discretize_dataset(ds_name,
                       nb_buckets,
                       table_name,
                       operators_list=None,
                       nb_points_by_chunk=100000):
    """
    This function discretizes each time series provided through the dataset name input:
    1. The interval between the start date and end date of each time series is divided into nb_buckets intervals of the same size.
    2. Each operator from the input list is processed on each bucket previously defined
    3. The result is formatted as a table whose entries are:
                - each time series processed in rows
                - combinations of (each operator) X (each bucket number) in columns
    The result table also contains the bucket definitions (bucket_number, start date, end date)

    :param ds_name: name of the dataset processed
    :type ds_name:  str

    :param nb_buckets: number of buckets wanted for each time series of dataset
    :type nb_buckets: int

    :param table_name: name of the table
    :type table_name:  str

    :param operators_list: list of operators to be calculated on buckets from Operators class (see above)
    :type operators_list: list

    :param nb_points_by_chunk: size of chunks in number of points (assuming time series are periodic and without holes)
    :type nb_points_by_chunk: int


    :returns: a dict containing all data awaited by functional ikats type table
    :rtype: dict

    :raises TypeError: if ds_name is not a string or is None
    :raises TypeError: if nb_buckets is not an integer
    :raises ValueError: if nb_buckets is zero
    :raises ValueError: if operators_list is None
    :raises ValueError: if operators_list items are not in Operators class
    :raises TypeError: if operators_list items are not all string
    :raises ValueError: if number of buckets exceeds number of points for one time series
    """

    # Check inputs validity
    if ds_name is None or type(ds_name) is not str:
        raise TypeError('valid dataset name must be defined (got %s, type: %s)' % (ds_name, type(ds_name)))
    try:
        nb_buckets = int(nb_buckets)
    except (TypeError, ValueError):
        raise TypeError('Number of buckets must be an integer (got value %s)' % nb_buckets)
    if nb_buckets == 0:
        raise ValueError("Number of buckets must not be zero")
    if operators_list is None:
        raise ValueError("operators list must not be None")
    elif type(operators_list) is not list:
        raise ValueError("operators list must be a list")
    elif not operators_list:
        raise ValueError("operators list must not be empty")
    if table_name is None or re.match('^[a-zA-Z0-9-_]+$', table_name) is None:
        raise ValueError("Error in table name")

    # Check content of operators list provided
    for operator in operators_list:
        if type(operator) is not str:
            raise TypeError('Operator must be a string (got %s)' % (type(operator)))
        if operator not in [op.name for op in Operators]:
            raise ValueError("Operators (string) must be in the following values list : %s"
                             % [op.name for op in Operators])

    # Extract tsuid list from inputs
    tsuid_list = IkatsApi.ds.read(ds_name)['ts_list']

    # Get list of metadata for all TS
    meta_dict = IkatsApi.md.read(tsuid_list)

    # Initialize result
    result = {}

    try:
        LOGGER.info("Running discretization using Spark")
        # Create or get a spark Context
        sc = ScManager.get()

        # running discretization time series by time series
        for index, tsuid in enumerate(tsuid_list):
            result[tsuid] = {}
            LOGGER.info('Processing Discretization for TS %s (%s/%s)', tsuid, index + 1, len(tsuid_list))

            sd = int(meta_dict[tsuid]['ikats_start_date'])
            ed = int(meta_dict[tsuid]['ikats_end_date'])
            nb_points = int(meta_dict[tsuid]['qual_nb_points'])

            # using qual_ref_period if defined, extrapolating otherwise
            if 'qual_ref_period' in meta_dict[tsuid]:
                period = int(float(meta_dict[tsuid]['qual_ref_period']))
            else:
                period = int(float((ed - sd) / nb_points))

            # checking buckets size regarding time series size
            if nb_buckets > nb_points:
                msg = "Number of buckets exceeds number of points for ts (%s, %s)" % (tsuid, IkatsApi.ts.fid(tsuid))
                LOGGER.error(msg)
                raise ValueError(msg)

            # definition of buckets size in ms
            bucket_size_ms = ceil((ed - sd) / nb_buckets)

            # definition of spark chunks size in ms
            chunks_size_ms = nb_points_by_chunk * period

            # definition of buckets start/end dates
            buckets_timestamps = np.hstack((np.arange(sd, ed, bucket_size_ms, dtype=int), ed))
            buckets = [(buckets_timestamps[i] + 1, buckets_timestamps[i + 1]) for i in
                       range(len(buckets_timestamps) - 1)]

            # start date of first bucket is decreased by 1 ms to catch the first time series value
            buckets[0] = (buckets[0][0] - 1, buckets[0][1])

            # add bucket number
            data_to_compute = [(a, b[0], b[1]) for a, b in enumerate(buckets)]

            # store buckets definition in results
            result[tsuid]['buckets'] = data_to_compute

            # starting spark process
            # OUTPUT : [(nb_bucket, sd_bucket, ed_bucket), ...]
            inputs = sc.parallelize(data_to_compute, len(data_to_compute))

            # INPUT :  [(nb_bucket, sd_bucket, ed_bucket), ...]
            # OUTPUT : [(nb_bucket, sd_chunk, ed_chunk), ...]
            # PROCESS : cut buckets into smaller chunks of data when needed and repartition the rdd
            rdd_chunks_timestamps = inputs \
                .flatMap(lambda x: (_spark_chunk(x[0], x[1], x[2], chunks_size_ms)))

            # INPUT : [(nb_bucket, sd_chunk, ed_chunk), ...]
            # OUTPUT : [(nb_bucket, data_array), ...]
            # PROCESS : extract data within buckets
            rdd_chunks_data = rdd_chunks_timestamps \
                .map(lambda x: (x[0], IkatsApi.ts.read(tsuid_list=[tsuid], sd=int(x[1]), ed=int(x[2]))[0])) \
                .filter(lambda x: len(x[1]) > 0)

            # INPUT : [(nb_bucket, data_array), ...]
            # OUTPUT : [(nb_bucket, {info1: , info2:, ..., infon:}),...]
            # PROCESS : calculate operators on data chunks
            rdd_chunks_calc = rdd_chunks_data \
                .map(lambda x: _spark_calc_op_on_chunks(x[0], x[1], operators_list)) \
                .filter(lambda x: x is not None)

            # INPUT : [(nb_bucket, {info1: , info2:, ..., infon:}),...]
            # OUTPUT : [(nb_bucket, {info1: , info2:, ..., infon:}),...] reduced by number of bucket
            # PROCESS : reduce operators results on data buckets
            result_by_bucket = rdd_chunks_calc.reduceByKey(lambda x, y: _spark_reduce_op_chunk(x, y)).collect()

            # extract and calculate final results by bucket
            for bucket in result_by_bucket:
                bucket_nb = bucket[0]
                infos = bucket[1]

                result[tsuid][bucket_nb] = {}
                for operator in operators_list:
                    if operator == 'MIN':
                        result[tsuid][bucket_nb]['MIN'] = float(infos["MIN"])
                    if operator == 'MAX':
                        result[tsuid][bucket_nb]['MAX'] = float(infos["MAX"])
                    if operator == 'AVG':
                        # Computation of the final mean
                        avg_value = float(infos["SUM"]) / float(infos["NB_POINTS"])
                        result[tsuid][bucket_nb]['AVG'] = avg_value
                    if operator == 'STD':
                        # Computation of the final mean and standard deviation
                        avg_value = float(infos["SUM"]) / float(infos["NB_POINTS"])
                        # variance is capped at 0 because it could be slightly negative
                        # (but very close to zero) due to the subtraction of
                        # very close floating point values
                        variance = max(float(float(infos["SQR_SUM"]) / int(infos["NB_POINTS"]) - avg_value ** 2), 0)
                        std_deviation = variance ** 0.5
                        result[tsuid][bucket_nb]['STD'] = std_deviation

        # format result to fit to table type

        description = "Result of Discretize operator with %s buckets for %s" % (nb_buckets, operators_list)
        table = _fill_table_structure_to_store(json_result=result,
                                               operators_list=operators_list,
                                               nb_buckets=nb_buckets,
                                               tsuid_list=tsuid_list,
                                               table_name=table_name,
                                               table_desc=description)

        # Save the table
        IkatsApi.table.create(data=dict(table))
    except Exception as error:
        msg = "Exception raised while discretizing with Spark"
        LOGGER.error(msg + ": %s " % error)
        raise IkatsException(msg)

    finally:
        # Stop spark Context
        ScManager.stop()

    # Return the name of the table saved
    return table_name
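
The AVG/STD reduction above relies on keeping only SUM, SQR_SUM and NB_POINTS per chunk, which can be merged associatively across chunks before deriving the mean and standard deviation. A small standalone sketch of that trick (the Spark reducers _spark_calc_op_on_chunks and _spark_reduce_op_chunk are not reproduced here):

import numpy as np

chunks = [np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0])]

# per-chunk partial aggregates, merged exactly as the reduceByKey step does
totals = {"SUM": 0.0, "SQR_SUM": 0.0, "NB_POINTS": 0}
for chunk in chunks:
    totals["SUM"] += chunk.sum()
    totals["SQR_SUM"] += (chunk ** 2).sum()
    totals["NB_POINTS"] += len(chunk)

avg = totals["SUM"] / totals["NB_POINTS"]
variance = max(totals["SQR_SUM"] / totals["NB_POINTS"] - avg ** 2, 0)  # capped at 0
print(avg, variance ** 0.5)  # 3.0 and ~1.414, i.e. np.mean / np.std of all values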
Example #17
def compute_slope(ts_list,
                  fid_suffix="_slope",
                  chunk_size=75000,
                  save_new_ts=True):
    """
    Compute the slope of a list of timeseries using spark

    This implementation computes slope for one TS at a time in a loop.
    To know the details of the computation, see the corresponding method

    :param ts_list: list of TS. Each item is a dict composed of a TSUID and a functional id
    :param fid_suffix: Functional identifier suffix of the final timeseries
    :param chunk_size: Number of points per chunk (assuming the TS is periodic)
    :param save_new_ts: True (default) if TS must be saved to database

    :type ts_list: list of dict
    :type fid_suffix: str
    :type chunk_size: int
    :type save_new_ts: bool

    :return: the new list of derived TS (same order as input)
    :rtype: list of dict

    :raise TypeError: if ts_list type is incompatible
    """

    # Check inputs
    if not isinstance(ts_list, list):
        raise TypeError("ts_list shall be a list")
    if len(ts_list) == 0:
        raise TypeError("ts_list must have at least one element")

    LOGGER.info('Computing Slope for %s TS', len(ts_list))

    tsuid_list = ts_list
    try:
        # Extract TSUID from ts_list
        tsuid_list = [x['tsuid'] for x in ts_list]
    except Exception:
        # Already a tsuid_list.
        # Getting the functional id for each ts
        ts_list = [{
            'tsuid': x,
            'funcId': IkatsApi.fid.read(x)
        } for x in ts_list]

    # Gather all metadata for the list of TS to compute slope
    md_list = IkatsApi.md.read(tsuid_list)

    # Results will be stored here
    results = []

    try:
        # Get Spark Context
        spark_context = ScManager.get()

        for index, tsuid in enumerate(tsuid_list):
            fid = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0]
            LOGGER.info('Processing Slope for TS %s (%s/%s) (%s)', fid,
                        (index + 1), len(tsuid_list), tsuid)

            computed_tsuid, computed_fid = compute_slope_for_tsuid(
                spark_context=spark_context,
                fid=fid,
                fid_suffix=fid_suffix,
                tsuid=tsuid,
                md_list=md_list,
                chunk_size=chunk_size,
                save_new_ts=save_new_ts)

            # Append results to final results
            results.append({"tsuid": computed_tsuid, "funcId": computed_fid})
    except Exception:
        raise
    finally:
        # Stop spark context in all cases
        ScManager.stop()

    return results
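
compute_slope_for_tsuid is not shown in this example; as a rough illustration only, a point-wise slope between consecutive samples of a (timestamp, value) array is typically the ratio of the value delta to the timestamp delta, which the chunked Spark version evaluates per chunk:

import numpy as np

# illustrative data: rows of (timestamp in ms, value)
data = np.array([[1000, 1.0], [2000, 3.0], [3000, 2.0]], dtype=float)

# slope between consecutive points: delta(value) / delta(timestamp)
slopes = np.diff(data[:, 1]) / np.diff(data[:, 0])
print(slopes)  # [ 0.002 -0.001]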
Example #18
def _resample(resampling_way,
              ts_list,
              resampling_period,
              adding_method=AddingMethod.LINEAR_INTERPOLATION,
              timestamp_position=TimestampPosition.BEG,
              aggregation_method=AggregationMethod.AVG,
              nb_points_by_chunk=50000,
              generate_metadata=False):
    """
    Function that effectively resamples (UP or DOWN according to resampling_way value) using Spark

    :param resampling_way: way of resampling (UP or DOWN)
    :type resampling_way: ResamplingWay

    :param ts_list: list composing the TS information to resample [{'tsuid': xxx, 'funcId': yyy },...]
    :type ts_list: list of dict

    :param resampling_period: target period for resampling (in ms)
    :type resampling_period: int

    :param adding_method: Method to use for interpolation (see type AddingMethod for more information)
    :type adding_method: AddingMethod or str or int

    :param timestamp_position: timestamp position in the interval while downsampling
    :type timestamp_position: str ('BEG','MID','END')

    :param aggregation_method: aggregation method for downsampling
    :type aggregation_method: str ('MIN','MAX','MED','AVG','FIRST','LAST')

    :param nb_points_by_chunk: user defined number of points used for a spark chunk of data (after resampling)
    :type nb_points_by_chunk: int

    :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points)
    :type generate_metadata: boolean (default : False)

    :returns: a list of dict [{'tsuid': xxx, 'funcId': yyy },...]
    :rtype: list of dict
    """

    if ts_list == []:
        return []

    fid_dict = dict()
    for ts in ts_list:
        fid_dict[ts['funcId']] = ts['tsuid']

    # List of chunks of data and associated information to parallelize with Spark
    data_to_compute = []

    # Extract tsuid list from inputs

    tsuid_list = [x["tsuid"] for x in ts_list]

    # Checking metadata availability before starting resampling
    meta_list = IkatsApi.md.read(tsuid_list)

    # Collecting information from metadata
    for tsuid in tsuid_list:
        if tsuid not in meta_list:
            LOGGER.error("Timeseries %s : no metadata found in base", tsuid)
            raise ValueError("No ikats metadata available for resampling %s" %
                             tsuid)
        if 'ikats_start_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error(
                "Metadata 'ikats_start_date' for timeseries %s not found in base",
                tsuid)
            raise ValueError("No start date available for resampling [%s]" %
                             tsuid)
        if 'ikats_end_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error(
                "Metadata 'ikats_end_date' for timeseries %s not found in base",
                tsuid)
            raise ValueError("No end date available for resampling [%s]" %
                             tsuid)
        if 'qual_ref_period' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error(
                "Metadata 'qual_ref_period' for timeseries %s not found in base",
                tsuid)
            raise ValueError(
                "No reference period available for resampling [%s]" % tsuid)

        # Original timeseries information retrieved from metadata
        sd = int(meta_list[tsuid]['ikats_start_date'])
        ed = int(meta_list[tsuid]['ikats_end_date'])
        ref_period = int(float(meta_list[tsuid]['qual_ref_period']))

        # Get the functional identifier of the original timeseries
        fid_origin = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0]

        # Generate functional id for resulting timeseries
        if resampling_way == ResamplingWay.UP_SAMPLING:
            func_id = "%s_resampled_to_%sms_%s" % (
                fid_origin, str(resampling_period), str(adding_method))
        else:
            func_id = "%s_resampled_to_%sms_%s_%s" % (
                fid_origin, str(resampling_period), timestamp_position,
                aggregation_method)

        # Creating new reference in database for new timeseries
        IkatsApi.ts.create_ref(func_id)

        # Prepare data to compute by defining intervals of final size nb_points_by_chunk
        # Chunk intervals computation :

        # Computing elementary size which is the lowest common multiple between ref period and resampling period
        elementary_size = _lowest_common_multiple(ref_period,
                                                  resampling_period)

        # Seeking the number of elementary size which contains nb of points nearest to nb_points_by_chunk parameter
        # in order to compute the final data chunk size
        nb_points_for_elementary_size = int(elementary_size /
                                            resampling_period)
        data_chunk_size = int(nb_points_by_chunk /
                              nb_points_for_elementary_size) * elementary_size

        # Limit the size of data_chunk_size
        if data_chunk_size < elementary_size:
            data_chunk_size = elementary_size

        # Computing intervals for chunk definition
        interval_limits = np.hstack((np.arange(sd,
                                               ed,
                                               data_chunk_size,
                                               dtype=np.int64), ed))

        # from intervals we define chunk of data to compute
        # ex : intervals = [ 1, 2, 3] => 2 chunks [1, 2] and [2, 3]
        if len(interval_limits) > 2:
            # there are more than 2 limits for interval definition, i.e. there is more than one chunk to compute
            data_to_compute.extend([(tsuid, func_id, i, interval_limits[i],
                                     interval_limits[i + 1])
                                    for i in range(len(interval_limits) - 1)])
        elif len(interval_limits) > 1:
            # only one chunk to compute
            data_to_compute.append(
                (tsuid, func_id, 0, interval_limits[0], interval_limits[1]))

        # in case last original point and last downsampled point are aligned => add a supplementary chunk to compute
        # last point
        if (interval_limits[-1] - sd) % resampling_period == 0:
            data_to_compute.append((tsuid, func_id, 1, interval_limits[-1],
                                    interval_limits[-1] + resampling_period))

    LOGGER.info("Running resampling using Spark")
    # Create or get a spark Context
    spark_context = ScManager.get()

    if resampling_way == ResamplingWay.UP_SAMPLING:
        spark_function = _spark_upsample
        args = adding_method
    else:
        spark_function = _spark_downsample
        args = (timestamp_position, aggregation_method)

    try:

        # OUTPUT : [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        inputs = spark_context.parallelize(data_to_compute,
                                           len(data_to_compute))

        # INPUT :  [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        # OUTPUT : [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # PROCESS : read original data in database / filter chunk with no data
        rdd_data_with_chunk_index = inputs \
            .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0], sd=int(x[3]), ed=int(x[4]))[0])) \
            .filter(lambda x: len(x[2]) > 0)

        if resampling_way == ResamplingWay.UP_SAMPLING:
            # INPUT :  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
            # OUTPUT : [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...]
            # PROCESS : compute inter-chunks intervals / filter empty chunks
            rdd_data = _calc_inter_chunks(rdd=rdd_data_with_chunk_index) \
                .map(lambda x: (x[0], x[2])) \
                .filter(lambda x: not (len(x[1]) == 2 and (int(float(x[1][0][0])) == int(float(x[1][1][0])))))
        else:
            # INPUT :  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
            # OUTPUT : [((TSUID_origin, func_id), original_data_array), ...]
            # PROCESS : suppress useless chunk indexes
            rdd_data = rdd_data_with_chunk_index.map(lambda x: (x[0], x[2]))

        # INPUT :  [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...]
        # OUTPUT : [((TSUID_origin, func_id), data_resampled_array), ...]
        # PROCESS : resample chunks of data to resampling_period
        rdd_resampled_data = rdd_data.map(
            lambda x: (x[0], spark_function(data=x[1], period=resampling_period, args=args))) \
            .filter(lambda x: len(x[1]) > 0)

        # INPUT :  [((TSUID_origin, func_id), data_resampled_array), ...]
        # OUTPUT : [(TSUID_origin, func_id, TSUID, sd, ed), ...]
        # PROCESS : create resampled data in database / compute global start and end date
        identifiers = rdd_resampled_data \
            .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1],
                                                            data=x[1],
                                                            generate_metadata=generate_metadata))) \
            .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \
            .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \
            .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \
            .collect()

    except Exception as err:
        msg = "Exception raised while resampling with Spark: %s " % err
        LOGGER.error(msg)
        raise IkatsException(msg)

    finally:
        # Stop spark Context
        ScManager.stop()  # post-processing (metadata import and return dict building) follows

    # returns dict containing the results of the resampling
    # where key is the original TSUID and values are resampled TSUID and functional identifiers
    returned_dict = {}
    for timeseries in identifiers:
        tsuid_origin = timeseries[0]
        func_id = timeseries[1]
        tsuid = timeseries[2]
        sd = timeseries[3]
        ed = timeseries[4]

        # Import metadata in non temporal database
        _save_metadata(tsuid=tsuid,
                       md_name='qual_ref_period',
                       md_value=resampling_period,
                       data_type=DTYPE.number,
                       force_update=True)
        _save_metadata(tsuid=tsuid,
                       md_name='ikats_start_date',
                       md_value=sd,
                       data_type=DTYPE.date,
                       force_update=True)
        _save_metadata(tsuid=tsuid,
                       md_name='ikats_end_date',
                       md_value=ed,
                       data_type=DTYPE.date,
                       force_update=True)

        # Retrieve imported number of points from database
        qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid)
        IkatsApi.md.create(tsuid=tsuid,
                           name='qual_nb_points',
                           value=qual_nb_points,
                           data_type=DTYPE.number,
                           force_update=True)

        # Inherit from parent
        IkatsApi.ts.inherit(tsuid, tsuid_origin)

        # Fill returned list
        returned_dict[tsuid_origin] = {"tsuid": tsuid, 'funcId': func_id}

    return returned_dict
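
The chunk-size arithmetic above can be followed in isolation: the elementary size is the lowest common multiple of the reference and resampling periods, and the chunk size is the largest multiple of it holding at most roughly nb_points_by_chunk resampled points. A standalone sketch with illustrative numbers (_lowest_common_multiple is assumed to behave like a plain LCM):

from math import gcd

def lowest_common_multiple(a, b):
    # assumed equivalent of the _lowest_common_multiple helper used above
    return a * b // gcd(a, b)

ref_period, resampling_period, nb_points_by_chunk = 1000, 400, 50000

elementary_size = lowest_common_multiple(ref_period, resampling_period)      # 2000 ms
nb_points_for_elementary_size = elementary_size // resampling_period         # 5 points
data_chunk_size = (nb_points_by_chunk // nb_points_for_elementary_size) * elementary_size
data_chunk_size = max(data_chunk_size, elementary_size)                      # lower bound
print(elementary_size, data_chunk_size)  # 2000 20000000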
Example #19
    def _apply_motif_global_same_words(self, activate_spark):
        """
        Test
        - with the global method to search the neighborhood motif,
        - with/without spark jobs according to activate_spark
        - and where the words are all the same
        """
        spark_context = ScManager.get()
        # Build the SAX result with large breakpoints
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-300, -100, 100, 300],
                               sax_word='abcdeabcdeabcdeabcde')
        sax, _, _ = sax_result.start_sax(5, spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        breakpoint = sax_result.build_mindist_lookup_table(alphabet_size=5)

        # Build the collision matrix result
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0],
                      [100, 0, 0, 0],
                      [100, 100, 0, 0],
                      [100, 100, 100, 0]]))

        # two identical cases here: brute force / with collisions
        for method_opt in [OPT_USING_BRUTE_FORCE, OPT_USING_COLLISIONS]:
            #  mindist distances:
            #
            # [[ 0.  0.  0.  0.]
            #  [ 0.  0.  0.  0.]
            #  [ 0.  0.  0.  0.]
            #  [ 0.  0.  0.  0.]]

            # Build the class for motif search
            search_info = NeighborhoodSearch(size_sequence=20,
                                             mindist_lookup_table=breakpoint,
                                             alphabet_size=5,
                                             sax=np.transpose(sax),
                                             radius=0.01,
                                             collision_matrix=collision_matrix)

            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=0,
                min_value=1,
                is_algo_method_global=True,
                activate_spark=activate_spark,
                radius=0.01,
                neighborhood_method=method_opt)

            # neighborhood_method=OPT_USING_BRUTE_FORCE (compare with all the words)
            result = search_info.motif_neighborhood_global(
                30, recognition_info)

            self._print_mindist_mat(search_info)

            # The words corresponding to the six largest-value cells have a MINDIST < radius
            self.assertEqual(len(result), 1)
            # These results are the same: [0, 1, 2, 3]: the 6 groups have been reduced to a single one
            self.assertEqual(result, [[0, 1, 2, 3]])
Beispiel #20
0
def cut_ds_from_metric(ds_name,
                       metric,
                       criteria,
                       group_by=None,
                       fid_pattern=None,
                       chunk_size=75000):
    """
    Entry point of the method that cuts a dataset based on the criteria applied to the TS matching the metric

    The criteria expression is a python expression that will be converted to a lambda expression with 'M' used as metric
    value.
    Example: "M > 7 and M not in [1,2,6]"

    :param ds_name: name of the dataset to use
    :param metric: metric used as reference to find cut ranges
    :param criteria: criteria expression describing the value thresholds.
    :param group_by: metadata to iterate on each value (Default to None to not use this behaviour)
    :param fid_pattern: name of the generated TS.
                        Variables can be used:
                        - {fid}   : Functional identifier
                        - {M}     : metric
    :param chunk_size: Size of the ideal chunk (in number of points per chunk)

    :type ds_name: str
    :type metric: str
    :type criteria: str
    :type group_by: str or None
    :type fid_pattern: str
    :type chunk_size: int

    :return: the ts list of the generated TS. [{"funcId": "xx", "tsuid":"xx"}]
    :rtype: list

    :raises ValueError: if dataset is empty
    :raises ValueError: if metric is found several times in dataset
    :raises ValueError: if metric is not found in dataset
    :raises ValueError: if group_by doesn't have a matching reference
    :raises KeyError: if error in fid_pattern
    """

    # List of TS present in dataset
    ts_list = IkatsApi.ds.read(ds_name=ds_name)['ts_list']

    if len(ts_list) == 0:
        LOGGER.error("Dataset %s is empty", ds_name)
        raise ValueError("Dataset %s is empty" % ds_name)

    # Get all the metadata
    md_list = IkatsApi.md.read(ts_list=ts_list)

    # List of all possible values encountered for the group by
    groups_list = None
    if group_by not in [None, ""]:
        # Get all the groups for this group by criterion
        groups_list = _find_all_groups(group_by, md_list)
        LOGGER.info("%s groups found for [%s]", len(groups_list), group_by)
    else:
        # Force to None
        group_by = None

    # Find the reference TS and all TS to cut using this ref
    grouped_ts_list = _find_ts_ref_group(ds_name=ds_name,
                                         md_list=md_list,
                                         metric=metric,
                                         ts_list=ts_list,
                                         group_by=group_by,
                                         group_by_list=groups_list)

    # Get Spark Context
    # Important !!!! Use only this method in Ikats to use a spark context
    spark_context = ScManager.get()

    try:
        result = []

        # For each group (processed in alphabetic order)
        for group in sorted(grouped_ts_list):
            result_iter = _cut_from_metric_for_group(
                chunk_size=chunk_size,
                criteria=criteria,
                ds_name=ds_name,
                fid_pattern=fid_pattern,
                md_list=md_list,
                metric=metric,
                spark_context=spark_context,
                group=grouped_ts_list[group])

            # Sort functional identifiers alphabetically
            result.extend(sorted(result_iter, key=lambda x: x['funcId']))

        return result
    finally:
        ScManager.stop()
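
A minimal usage sketch of cut_ds_from_metric above, based only on its signature and docstring; the dataset name, metric and criteria values are illustrative assumptions:

# Hypothetical call: cut every TS of dataset "my_dataset" on the ranges where the
# reference TS of metric "WS1" satisfies the criteria expression.
generated_ts = cut_ds_from_metric(ds_name="my_dataset",
                                  metric="WS1",
                                  criteria="M > 7 and M not in [1, 2, 6]",
                                  group_by=None,
                                  fid_pattern="{fid}_cut_{M}",
                                  chunk_size=75000)
# Expected output format (from the docstring): [{"funcId": "xx", "tsuid": "xx"}, ...]
for item in generated_ts:
    print(item["funcId"], item["tsuid"])
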
Beispiel #21
0
def dataset_cut_spark(tsuid_list, start, end, nb_points, nb_points_by_chunk, generate_metadata, meta_list):
    """
    Cutting dataset algorithm, using spark

    :param tsuid_list: list of tsuid
    :param start: start cut date
    :param end: end cut date
    :param nb_points: number of points to cut
    :param nb_points_by_chunk: number of points per chunk
    :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points)
                              (default: False)
    :param meta_list: dict of metadata (tsuid is the key)

    :type tsuid_list: list
    :type start: int
    :type end: int or None
    :type nb_points: int or None
    :type generate_metadata: bool
    :type meta_list: dict

    :return: list of dict {"tsuid": tsuid, "funcId": func_id}
    :rtype: list of dict

    :raise ValueError: if inputs are not filled properly (see called methods description)
    """

    # List of chunks of data and associated information to parallelize with Spark
    data_to_compute = []

    # Collecting information from metadata
    for tsuid in tsuid_list:
        if tsuid not in meta_list:
            LOGGER.error("Time series %s: no metadata found in base", tsuid)
            raise ValueError("No ikats metadata available for cutting %s" % tsuid)
        if 'ikats_start_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_start_date' for time series %s not found in base", tsuid)
            raise ValueError("No start date available for cutting [%s]" % tsuid)
        if 'ikats_end_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_end_date' for time series %s not found in base", tsuid)
            raise ValueError("No end date available for cutting [%s]" % tsuid)
        if 'qual_ref_period' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'qual_ref_period' for time series %s not found in base", tsuid)
            raise ValueError("No reference period available for cutting [%s]" % tsuid)

        # Original time series information retrieved from metadata
        sd = int(meta_list[tsuid]['ikats_start_date'])
        ed = int(meta_list[tsuid]['ikats_end_date'])
        ref_period = int(float(meta_list[tsuid]['qual_ref_period']))

        # Get the functional identifier of the original time series
        fid_origin = IkatsApi.ts.fid(tsuid)

        # Generate functional id for resulting time series
        func_id = "%s_cut_%d" % (fid_origin, time.time() * 1e6)

        # Creating new reference in database for new time series
        IkatsApi.ts.create_ref(func_id)

        # Prepare data to compute by defining intervals of final size nb_points_by_chunk
        # Chunk intervals computation:

        data_chunk_size = int(nb_points_by_chunk * ref_period)

        # Computing intervals for chunk definition
        interval_limits = np.hstack(np.arange(sd, ed, data_chunk_size, dtype=np.int64))

        # from intervals we define chunk of data to compute:
        #
        # 1. defining chunks excluding last point of data within every chunk
        # ex: intervals = [10, 20, 30, 40] => 3 chunks [10, 19], [20, 29] and [30, 39] (the final chunk is added in step 2)
        data_to_compute.extend([(tsuid,
                                 func_id,
                                 i,
                                 interval_limits[i],
                                 interval_limits[i + 1] - 1) for i in range(len(interval_limits) - 1)])
        # 2. adding last interval, including last point of data
        # ex: [40, ed + 1]
        data_to_compute.append((tsuid,
                                func_id,
                                len(interval_limits) - 1,
                                interval_limits[-1],
                                ed + 1))
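        # Worked example (illustrative values): sd=0, ed=45, data_chunk_size=10
        #   interval_limits = [0, 10, 20, 30, 40]
        #   step 1 chunks: (0, 9), (10, 19), (20, 29), (30, 39)
        #   step 2 adds:   (40, 46)   # i.e. [interval_limits[-1], ed + 1]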

    LOGGER.info("Running dataset cut using Spark")
    # Create or get a spark Context
    spark_context = ScManager.get()

    try:

        # OUTPUT: [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        inputs = spark_context.parallelize(data_to_compute, len(data_to_compute))

        # INPUT:  [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        # OUTPUT: [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # PROCESS: read original data in database / filter chunk with no data
        rdd_data = inputs \
            .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0], sd=int(x[3]), ed=int(x[4]))[0])) \
            .filter(lambda x: len(x[2]) > 0)

        # INPUT:  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # OUTPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
        # PROCESS: cut chunks of data, filter empty results
        rdd_cut_chunk_data = rdd_data \
            .map(lambda x: (x[0], x[1], _spark_cut(data=x[2], min_date=start, max_date=end))) \
            .filter(lambda x: len(x[2][1]) > 0) \
            .cache()

        # no end cutting date provided => case of cutting a given number of points
        if end is None:

            # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), [(chunk_index1, nb_points1), (chunk_index2, nb_points2),...], ...]
            # PROCESS: Collect nb points associated to chunk indexes
            ts_pts_by_chunk = rdd_cut_chunk_data.map(lambda x: (x[0], (x[1], x[2][0]))) \
                .groupByKey().map(lambda x: (x[0], list(x[1]))) \
                .collect()

            # Compute for each ts from collected data:
            #   - last chunk index containing points to keep
            #   - the number of points to keep in this last chunk
            # cut_info: {(TSUID_origin1, func_id1):(last_chunk_index1, nb_points1),
            #             (TSUID_origin2, func_id2):(last_chunk_index2, nb_points2), ...}
            cut_info = {}
            for ts in ts_pts_by_chunk:
                nb_cumul = 0
                for chunk_index, points in ts[1]:
                    nb_cumul += points
                    # noinspection PyTypeChecker
                    if nb_cumul > nb_points:
                        # noinspection PyTypeChecker
                        cut_info[ts[0]] = (chunk_index, points - (nb_cumul - nb_points))
                        break
                else:
                    LOGGER.warning(
                        "Number of points cut with start cutting date provided exceeds time series %s size"
                        % IkatsApi.ts.fid(ts[0][0]))
                    # case nb_points > nb points of the time series
                    # noinspection PyTypeChecker
                    cut_info[ts[0]] = (chunk_index, points)

            # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...]
            rdd_cut_data = rdd_cut_chunk_data.filter(lambda x: x[1] <= cut_info[x[0]][0]) \
                .map(lambda x: (x[0], x[2][1][:cut_info[x[0]][1]] if x[1] == cut_info[x[0]][0] else x[2][1]))

        else:
            # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...]
            rdd_cut_data = rdd_cut_chunk_data.map(lambda x: (x[0], x[2][1]))

        # INPUT:  [((TSUID_origin, func_id), data_cut_array), ...]
        # OUTPUT: [(TSUID_origin, func_id, TSUID, sd, ed), ...]
        # PROCESS: create cut data in database / compute global start and end date
        identifiers = rdd_cut_data \
            .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1],
                                                            data=x[1],
                                                            generate_metadata=generate_metadata))) \
            .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \
            .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \
            .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \
            .collect()

    except Exception as err:
        msg = "Exception raised while cutting with Spark: %s " % err
        LOGGER.error(msg)
        raise IkatsException(msg)

    finally:
        # Stop spark Context
        ScManager.stop()  # Post-processing: metadata import and return dict building

    # Returns list of dict containing the results of the cut time series: TSUID and functional identifiers
    results = []
    for timeseries in identifiers:
        tsuid_origin = timeseries[0]
        func_id = timeseries[1]
        tsuid = timeseries[2]
        sd = timeseries[3]
        ed = timeseries[4]

        # Import metadata in non temporal database
        _save_metadata(tsuid=tsuid, md_name='ikats_start_date', md_value=sd, data_type=DTYPE.date, force_update=True)
        _save_metadata(tsuid=tsuid, md_name='ikats_end_date', md_value=ed, data_type=DTYPE.date, force_update=True)

        # Retrieve imported number of points from database
        qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid)
        IkatsApi.md.create(tsuid=tsuid, name='qual_nb_points', value=qual_nb_points, data_type=DTYPE.number,
                           force_update=True)

        # Inherit from parent
        IkatsApi.ts.inherit(tsuid, tsuid_origin)

        # Fill returned list
        results.append({"tsuid": tsuid, "funcId": func_id})

    return results
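
A minimal sketch of a call to dataset_cut_spark above, assuming the metadata dict has already been fetched; the TSUID, dates and metadata values are illustrative assumptions:

# Hypothetical input: one TSUID with the three required metadata entries (timestamps in ms).
meta_list = {
    "tsuid_1": {
        "ikats_start_date": "1000000000000",
        "ikats_end_date": "1000000600000",
        "qual_ref_period": "1000"
    }
}
cut_result = dataset_cut_spark(tsuid_list=["tsuid_1"],
                               start=1000000000000,
                               end=1000000300000,
                               nb_points=None,
                               nb_points_by_chunk=75000,
                               generate_metadata=False,
                               meta_list=meta_list)
# Expected output format: [{"tsuid": "...", "funcId": "..."}, ...]
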
Beispiel #22
0
def cut_y(original_ts_list, criterion, fid_pattern="{fid}_cutY{compl}", chunk_size=75000):
    """
    Algorithm Cut-Y

    Cut a list of timeseries along the Y-axis (values) according to a criterion defined as a python expression.
    Matching and non-matching values are separated into 2 timeseries.

    This algorithm uses spark

    From the TS list provided (used as reference), extract 2 TS lists:
    * The first one matching the value condition
    * The second one not matching the value condition

    :param original_ts_list: List of TSUID/funcID to use for filtering: [{tsuid:xxx, funcId:xxx}, ...]
    :param criterion: python expression used to define a matching pattern
    :param fid_pattern: pattern used to name the FID of the output TSUID.
           {fid} will be replaced by the FID of the original TSUID FID
           {M} will be replaced by the original TSUID metric name
           {compl} will be replaced by "" or "_compl" depending on the output type (matching/not matching).
    :param chunk_size: the number of points per chunk

    :type original_ts_list: list
    :type criterion: str
    :type fid_pattern: str
    :type chunk_size: int

    :return: 2 lists representing the "matching" and "not matching" list of TS corresponding to the input
    :rtype: list

    :raises ValueError: if ts_list is badly formatted
    :raises TypeError: if ts_list is not a list
    """

    # Check input validity
    if type(original_ts_list) is not list:
        raise TypeError("ts_list shall be a list")
    if len(original_ts_list) == 0:
        raise ValueError("ts_list shall have at least one element")
    for _, item in enumerate(original_ts_list):
        if "tsuid" not in item or "funcId" not in item:
            raise ValueError("ts_list shall have tsuid and funcId defined")

    # Get all the metadata
    md_list = IkatsApi.md.read(ts_list=[x['tsuid'] for x in original_ts_list])

    # Prepare the spark items to parallelize

    # Create and build the data that will be used in spark transformations
    ts_list_with_new_fid, fid2tsuid = _prepare_spark_data(fid_pattern=fid_pattern,
                                                          md_list=md_list,
                                                          ts_list=original_ts_list)
    # Chunks computation
    ts_info = []
    for ts_data in ts_list_with_new_fid:

        # Get the chunks raw information
        chunks = SparkUtils.get_chunks(tsuid=ts_data[0], md_list=md_list, chunk_size=chunk_size)

        # Build a new list containing only used information
        for chunk in chunks:
            ts_info.append({
                "tsuid": ts_data[0],
                "start_date": chunk[1],
                "end_date": chunk[2],
                "matching_fid": ts_data[1],
                "not_matching_fid": ts_data[2],
                "matching_tsuid": fid2tsuid[ts_data[1]],
                "not_matching_tsuid": fid2tsuid[ts_data[2]]
            })

    # Get Spark Context
    # Important !!!! Use only this method in Ikats to use a spark context
    spark_context = ScManager.get()
    try:

        # Prepare the lambda expression. Value is replaced by "Y" variable name
        lambda_criterion = eval("lambda Y : " + criterion)
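        # Example (illustrative): criterion = "Y > 7 and Y not in [1, 2, 6]"
        # builds lambda Y: Y > 7 and Y not in [1, 2, 6], applied to each point value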

        # OUTPUT : [{
        #   tsuid:x,
        #   start_date:x,
        #   end_date:x,
        #   matching_fid:x,
        #   not_matching_fid:x,
        #   matching_tsuid:x,
        #   not_matching_tsuid:x
        # }, ...]
        # PROCESS : Parallelize TS chunks information
        rdd_ts_list = spark_context.parallelize(ts_info, max(8, len(ts_info)))

        # INPUT :  [{
        #   tsuid:x,
        #   start_date:x,
        #   end_date:x,
        #   matching_fid:x,
        #   not_matching_fid:x,
        #   matching_tsuid:x,
        #   not_matching_tsuid:x
        # }, ...]
        # OUTPUT : [({
        #  start_date: "date of the first point matching the criterion in the current chunk"
        #  end_date: "date of the last point matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points matching the criterion in the current chunk"
        #  tsuid: "TSUID of the matching part"
        # },
        # {
        #  start_date: "date of the first point not matching the criterion in the current chunk"
        #  end_date: "date of the last point not matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points not matching the criterion in the current chunk"
        #  tsuid: "TSUID of the non-matching part"
        # }), ...]
        # PROCESS : Separate points matching and not-matching the criterion in every chunk. Fill the corresponding TS
        rdd_imported = rdd_ts_list.map(lambda x: _spark_cut_y_chunk(
            tsuid=x['tsuid'],
            start_date=x['start_date'],
            end_date=x['end_date'],
            match_criterion=lambda_criterion,
            result_info={
                "matching_fid": x['matching_fid'],
                "not_matching_fid": x['not_matching_fid'],
                "matching_tsuid": x['matching_tsuid'],
                "not_matching_tsuid": x['not_matching_tsuid']
            }))

        # INPUT : [({
        #  start_date: "date of the first point matching the criterion in the current chunk"
        #  end_date: "date of the last point matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points matching the criterion in the current chunk"
        #  tsuid: "TSUID of the matching part"
        # },
        # {
        #  start_date: "date of the first point not matching the criterion in the current chunk"
        #  end_date: "date of the last point not matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points not matching the criterion in the current chunk"
        #  tsuid: "TSUID of the non-matching part"
        # }), ...]
        # OUTPUT : [(TSUID, nb_points, start_date, end_date), ...]
        # PROCESS : Flat the results and simplify the format to allow quick actions on every item
        rdd_metadata_prep = rdd_imported \
            .flatMap(lambda x: x) \
            .filter(lambda x: x is not None) \
            .map(lambda x: (x['tsuid'], x['numberOfSuccess'], x['start_date'], x['end_date']))

        # Delete empty TSUID
        deleted_tsuid = rdd_metadata_prep \
            .map(lambda x: (x[0], x[1])) \
            .reduceByKey(lambda x, y: x + y) \
            .filter(lambda x: x[1] == 0) \
            .map(lambda x: (x[0], IkatsApi.ts.delete(tsuid=x[0]))) \
            .map(lambda x: x[0]) \
            .collect()

        # This RDD is reused in several branches. Caching it improves performance
        rdd_metadata_prep.cache()

        # Create metadata qual_nb_points
        rdd_metadata_prep \
            .map(lambda x: (x[0], x[1])) \
            .reduceByKey(lambda x, y: x + y) \
            .filter(lambda x: x[1] > 0) \
            .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="qual_nb_points", value=x[1]))

        # Create metadata ikats_start_date
        rdd_metadata_prep \
            .map(lambda x: (x[0], x[2])) \
            .filter(lambda x: x[1] is not None) \
            .reduceByKey(lambda x, y: min(x, y)) \
            .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_start_date", value=x[1]))

        # Create metadata ikats_end_date
        rdd_metadata_prep \
            .map(lambda x: (x[0], x[3])) \
            .filter(lambda x: x[1] is not None) \
            .reduceByKey(lambda x, y: max(x, y)) \
            .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_end_date", value=x[1]))

        # Unpersist the RDD because not used anymore
        rdd_metadata_prep.unpersist()

    finally:
        ScManager.stop()

    # Inherit properties
    for item in ts_list_with_new_fid:
        if fid2tsuid[item[1]] not in deleted_tsuid:
            IkatsApi.ts.inherit(tsuid=fid2tsuid[item[1]], parent=item[0])
        if fid2tsuid[item[2]] not in deleted_tsuid:
            IkatsApi.ts.inherit(tsuid=fid2tsuid[item[2]], parent=item[0])

    # Format and sort the results
    # First output contains the matched data points TS reference
    # Second output contains the not matched (complement) points TS reference
    return (_format_output(deleted_tsuid=deleted_tsuid,
                           fid2tsuid=fid2tsuid,
                           ts_list_with_new_fid=ts_list_with_new_fid,
                           index=1),
            _format_output(deleted_tsuid=deleted_tsuid,
                           fid2tsuid=fid2tsuid,
                           ts_list_with_new_fid=ts_list_with_new_fid,
                           index=2))
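
A minimal usage sketch of cut_y above, based only on its signature and docstring; the TS list and criterion are illustrative assumptions:

# Hypothetical input: keep the points whose value exceeds 5.0 in the first output,
# the remaining points in the second output.
matching_ts, complement_ts = cut_y(
    original_ts_list=[{"tsuid": "tsuid_1", "funcId": "fid_1"}],
    criterion="Y > 5.0",
    fid_pattern="{fid}_cutY{compl}",
    chunk_size=75000)
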
    def _apply_iter_coll_no_spark_ex1(self, activate_spark):
        """
         Tests motif_neighborhood_iterative()
         - the iterative method
         - using the heuristic based upon collisions
         - to search the neighborhood motif

         Note: test where the words have only one different letter.
        """

        # Build the SAX result where the words have only one different letter (words: 5 letters)
        sequences = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"]
        tested_sax_word = ''.join(sequences)
        spark_context = ScManager.get()
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-1.1, -1, 0, 1.501],
                               sax_word=tested_sax_word)
        sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        breakpoint = sax_result.build_mindist_lookup_table(5)

        # Build a collision matrix
        # Note: this matrix is different from the one used in
        #   test_iterative__brute_no_spark_ex1:
        #    => zeros are added: coll(3,2) == coll(4,2) == 0
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0, 0],
                      [40, 0, 0, 0, 0],
                      [2, 40, 0, 0, 0],
                      [4, 8, 0, 0, 0],
                      [6, 10, 0, 50, 0]]))

        self._print_matrix("test_iterative__brute_no_spark_ex1",
                           collision_matrix.data, nb_seq)

        # mindist distances:
        # [[ 0.     0.     3.002  5.002  5.202]
        #  [ 0.     0.     0.     2.     2.2  ]
        #  [ 3.002  0.     0.     0.     0.2  ]
        #  [ 5.002  2.     0.     0.     0.   ]
        #  [ 5.202  2.2    0.2    0.     0.   ]]

        # Using neighborhood_method=OPT_USING_BRUTE_FORCE
        #
        # iterative:  examining collisions (i,j) per iteration:
        #             (3,4) then (1,2) +(0,1)
        #
        #             (collisions greater than min_value==25)
        #
        # Test with fixed radius 1.9:
        #    - iter=1    => result is [[3, 4]] considering (S3,S4) neighborhood
        #    - iter=2    => result extended with [0,1,2] considering (S0,S1), unchanged for (S1,S2)
        #    - iter=3    => result is the same as for iter=2: no more collisions available
        #    - iter=100  => result is the same as for iter=2: no more collisions available
        #
        for radius, nb_iter, expected_res in [[1.9, 1, [[3, 4]]],
                                              [1.9, 2, [[3, 4], [0, 1, 2]]],
                                              [1.9, 3, [[3, 4], [0, 1, 2]]],
                                              [1.9, 100, [[3, 4], [0, 1, 2]]]]:

            # Build the class for motif search where the min_value is 25
            search_info = NeighborhoodSearch(size_sequence=20,
                                             mindist_lookup_table=breakpoint,
                                             alphabet_size=5,
                                             sax=np.transpose(sax),
                                             radius=radius,
                                             collision_matrix=collision_matrix)

            # for info: here is the mindist:
            #  (see _print_mindist_mat doc: in order to activate print)
            self._print_mindist_mat(search_info)

            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=nb_iter,
                min_value=25,
                is_algo_method_global=False,
                activate_spark=activate_spark,
                radius=radius,
                neighborhood_method=OPT_USING_COLLISIONS)

            result = search_info.motif_neighborhood_iterative(
                recognition_info.min_value, recognition_info)

            self.assertEqual(len(result), len(expected_res))
            for group in result:
                self.assertTrue(group in expected_res)
def main_test():
    """
    Functional test entry point
    """

    logger = logging.getLogger("ikats.algo.core.correlation")
    # Log format
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s:%(levelname)s:%(funcName)s:%(message)s')
    # Create another handler that will redirect log entries to STDOUT
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    if os.getenv("PYSPARK_PYTHON") is None:
        os.putenv("PYSPARK_PYTHON",
                  "/home/ikats/tools/ikats_processing/bin/python")
    if os.getenv("SPARK_HOME") is None:
        os.putenv("SPARK_HOME", "/opt/spark")

    print('Loading Spark Context')
    # Get a spark Context
    ScManager.get()

    tdm = TemporalDataMgr()

    answer = 'n'
    tsuid_list = []
    ds_name = ''
    while answer.lower() != 'y':
        ds_name = input('\nEnter dataset Name: ')
        tsuid_list = tdm.get_data_set(ds_name)['ts_list']

        print("%s TS found in dataset %s" % (len(tsuid_list), ds_name))

        if len(tsuid_list) > 0:
            answer = input(
                "Run the correlation matrix on these dataset? [Y/n] ")

    print('Running correlation matrix on %s TS' % len(tsuid_list))

    start_time = time.time()
    sp_corr = SparkCorrelation(tdm)
    sp_corr.force_parallel_get_ts = True
    sp_corr.run(tsuid_list)

    print(
        "EXECUTION TIME (for %d TS with %d pts/ea = %d points): %.3f seconds" %
        (len(tsuid_list), sp_corr.ts_len_ref,
         (len(tsuid_list) * sp_corr.ts_len_ref), (time.time() - start_time)))

    if os.path.isfile('/tmp/spark_correlation_result_%s.csv' % ds_name):
        os.remove('/tmp/spark_correlation_result_%s.csv' % ds_name)
    with open('/tmp/spark_correlation_result_%s.csv' % ds_name,
              'w',
              newline='') as opened_file:
        opened_file.write(sp_corr.get_csv())

    print("Matrix in CSV format is saved at the following location:")
    print("   /tmp/spark_correlation_result_%s.csv" % ds_name)
    print("You can check the content by doing :")
    print("   cat /tmp/spark_correlation_result_%s.csv" % ds_name)
    print("   less /tmp/spark_correlation_result_%s.csv" % ds_name)
    print("   vi /tmp/spark_correlation_result_%s.csv" % ds_name)
Beispiel #25
0
def random_projections(ts_list, sax_info, collision_info, recognition_info):
    """
    The Random Projections Algorithm
    ================================

    This algorithm does the following (detailed for 1 TS but valid for many TS):
        * Apply the sliding window
        * Normalize the TS (global or/and local)
        * Filter the linear sequences (optional) and trivial matches
        * Apply the SAX algorithm
        * Build the collision matrix
        * Find the largest value cells in the collision matrix
        * Search the motif neighborhood

        .. note::
            The algorithm can produce "paa values" (numeric) for each sequence. The problem is the huge length of the
            results.

    **Catalogue implementation is provided**: main_random_projections() calls random_projections() once all the
    configurations ConfigSax, ConfigCollision, ConfigRecognition are initialized.

    :param ts_list: list of TSUID
    :type ts_list: list

    :param sax_info: the information to make the sliding window and the sax_algorithm
    :type sax_info: ConfigSax

    :param collision_info: the information to build the collision matrix
    :type collision_info: ConfigCollision

    :param recognition_info: the information to make the pattern recognition
    :type recognition_info: ConfigRecognition

    :return: the list of similar sequences, the sax result, the equation 9 result, and the sequences list
    :rtype: list, str, float, list
    """
    LOGGER.info("Configurations deduced from user parameters:")
    LOGGER.info("- sliding sax nb paa=%s", sax_info.paa)
    LOGGER.info("- sliding sax alphabet size=%s", sax_info.alphabet_size)
    LOGGER.info("- sliding sax sequences_size=%s", sax_info.sequences_size)
    LOGGER.info("- collision nb indexes=%s", collision_info.index)
    LOGGER.info("- collision nb iterations=%s", collision_info.nb_iterations)
    LOGGER.info("- collision accepted errors=%s", collision_info.errors)
    LOGGER.info("- recognition min_value=%s", recognition_info.min_value)
    LOGGER.info("- recognition iterations=%s", recognition_info.iterations)
    LOGGER.info("- recognition similarity radius=%s", recognition_info.radius)

    # Create or get a spark Context
    LOGGER.info("Running using Spark")
    spark_ctx = ScManager.get()

    # INPUT : all the TS { "ts_name" : [[time1, value1],...], "ts_name2": ... }
    # OUTPUT :  rdd_sequences_list = [ (key, sequence), ... ]
    # rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...]
    # PROCESS : *sliding_windows* create sequences for each TS (results are RDDs)
    rdd_sequences_list, rdd_normalization_coefficients = sliding_windows(ts_list=ts_list,
                                                                         sax_info=sax_info,
                                                                         spark_ctx=spark_ctx,
                                                                         trivial_radius=recognition_info.radius / 2)
    # INPUT : rdd_sequences_list = [ (key, sequence), ... ]
    # OUTPUT : rdd_sax_result is a SaxResult object containing
    #  * paa (rdd of flatMap) : rdd of large list of all the paa_values concatenated
    #  * breakpoints (list) : list of the breakpoints (len = sax_info.alphabet_size - 1)
    #  * sax_word (large str): large string of all the SAX words concatenated
    # PROCESS : Give the SAX form of the sequences
    rdd_sax_result = run_sax_on_sequences(rdd_sequences_data=rdd_sequences_list,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

    # INPUT : rdd_sequences_list = [ (key, sequence), ... ]
    # OUTPUT : sequences_list = { key: sequence, ...} NOT AN RDD!
    # PROCESS : transform rdd_sequences_list elements into dict
    sequences_list = rdd_sequences_list.collectAsMap()

    # INPUT : rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...]
    # OUTPUT : normalization_coefficients = { key: (un-normalized seq_mean, un-normalized seq_sd), ...} NOT AN RDD!
    # PROCESS : transform rdd_normalization_coefficients elements into dict
    normalization_coefficients = rdd_normalization_coefficients.collectAsMap()

    # Keep only necessary information of each sequence
    sequences_list = sequences_info(sequences_list, normalization_coefficients)

    # *paa_sequence* is a "conversion" of *sax* from letters to numbers (matrix with same shape)
    # (useful for post-processing the random projection algorithm).
    breakpoints = [str(i) for i in rdd_sax_result.breakpoints]

    # Build the table which give the distance between two letters (need just sax_result.breakpoints)
    mindist_lookup_table = rdd_sax_result.build_mindist_lookup_table(sax_info.alphabet_size)

    # Give the SAX result in an array (need rdd_sax_result.sax_word and sax_result.paa)
    rdd_sax, paa_result, number_of_sequences = rdd_sax_result.start_sax(sax_info.paa, spark_ctx=spark_ctx)

    LOGGER.info("- filtered number of words=%s", number_of_sequences)

    if number_of_sequences == 1:
        LOGGER.info("- sliding window find just one sequence, no collision matrix computed.")
        collision_matrix = SparseMatrix(np.array([[0]]))
    else:

        # Build the collision matrix; the number of iterations can change
        # (e.g. if the length of a sequence is too small, nb_iterations can be lower than the value specified)
        collision_matrix, collision_info.nb_iterations = final_collision_matrix(
            sax=rdd_sax,
            number_of_iterations=collision_info.nb_iterations,
            index_selected=collision_info.index,
            word_len=sax_info.paa,
            spark_ctx=spark_ctx)

    # *collision_matrix* is a sparse matrix : light in memory

    # Give the result of the Equation 9
    eq9_result = equation9(number_of_sequences=number_of_sequences,
                           size_alphabet=sax_info.alphabet_size,
                           size_word=sax_info.paa,
                           errors=collision_info.errors,
                           index_selected=collision_info.index,
                           iterations=collision_info.nb_iterations)

    sax = rdd_sax.collect()
    paa_result = np.transpose(paa_result)

    distance_info = NeighborhoodSearch(size_sequence=sax_info.sequences_size,
                                       mindist_lookup_table=mindist_lookup_table,
                                       alphabet_size=sax_info.alphabet_size,
                                       sax=sax,
                                       radius=recognition_info.radius,
                                       collision_matrix=collision_matrix)

    LOGGER.info("- theoretical Eq9 limit: min collisions = %s for accepted errors=%s", eq9_result,
                collision_info.errors)

    # Check the eq9_result with min_value
    if eq9_result < recognition_info.min_value:
        LOGGER.warning("- setting Eq9 limit to min_value=%s: because Eq9 < min_value", recognition_info.min_value)
        eq9_result = recognition_info.min_value
    if eq9_result < 1:
        LOGGER.warning("- setting Eq9 limit to 1: because Eq9 < 1")
        eq9_result = 1

    # find the motif neighborhood by using the largest value cells in the collision matrix
    if recognition_info.is_algo_method_global is True:
        algo_result = distance_info.motif_neighborhood_global(eq9_result, recognition_info)
    else:
        algo_result = distance_info.motif_neighborhood_iterative(eq9_result, recognition_info)

    # Give the results with the names of sequences and not their number in the collision matrix
    algo_result = result_on_sequences_form(algo_result, sequences_list, sax, sax_info.alphabet_size, paa_result)

    algo_result = result_on_pattern_form(algo_result)

    # Give the alphabet used in the SAX algorithm
    alphabet = start_alphabet(sax_info.alphabet_size)

    result = {'patterns': algo_result,
              'break_points': breakpoints,
              'disc_break_points': alphabet}

    if spark_ctx is not None:
        ScManager.stop()
        LOGGER.info("Ended Spark session.")

    return result
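
For reference, a minimal sketch of how the dict returned by random_projections can be read; it assumes sax_info, collision_info and recognition_info have already been built (e.g. by the catalogue entry point main_random_projections mentioned in the docstring), and the TSUIDs are illustrative:

result = random_projections(ts_list=["tsuid_1", "tsuid_2"],
                            sax_info=sax_info,
                            collision_info=collision_info,
                            recognition_info=recognition_info)
# Keys built at the end of random_projections()
print("breakpoints:", result['break_points'])
print("alphabet:", result['disc_break_points'])
print("number of pattern groups:", len(result['patterns']))
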
Beispiel #26
0
def calc_quality_stats(ts_list,
                       compute_value=True,
                       compute_time=True,
                       chunk_size=75000,
                       force_save=True):
    """
    Compute the quality statistics

    Returns a dict as follow
        {
            "TSUIDx" : {
                "MetadataX": ValueX,
                ...
            },
            ...
        }

    Don't override the default chunk_size unless you know what you are doing.
    It defines the number of points in a single chunk (assuming the TS is periodic).
    Use it only for performance purposes.

    :param ts_list: List of TSUID to work onto
    :type ts_list: list

    :param compute_value: boolean indicating to compute metadata related to value
    :type compute_value: bool

    :param compute_time: boolean indicating to compute metadata related to time
    :type compute_time: bool

    :param chunk_size: (Advanced usage) Override the chunk size
    :type chunk_size: int

    :param force_save: Save metadata even if already present (default True)
    :type force_save: bool

    :return: Tuple composed of the input ts list and a dict
             having TSUID as key and a sub-dict as value,
             where the sub-dict keys are metadata names
    :rtype: tuple
    """

    if not compute_value and not compute_time:
        LOGGER.error("You shall compute at least one set of metadata.")
        raise ValueError("You shall compute at least one set of metadata")

    try:
        # Convert tsuid_list [{tsuid:x, fid:x},...] to tsuid_list [tsuid,...]
        tsuid_list = [x['tsuid'] for x in ts_list]

    except TypeError:
        # Already a tsuid_list. No change
        tsuid_list = ts_list

    LOGGER.info('Computing Quality stats for %s TS', len(tsuid_list))

    # Get all metadata
    md_list = IkatsApi.md.read(ts_list=tsuid_list)

    # Initialize results
    results = {}
    for tsuid in tsuid_list:
        results[tsuid] = {}

    try:
        # Get Spark Context
        # Important !!!! Use only this method in Ikats to use a spark context
        spark_context = ScManager.get()

        results = {}
        for index, tsuid in enumerate(tsuid_list):

            LOGGER.info('Processing Quality stats for TS %s (%s/%s)', tsuid,
                        index, len(tsuid_list))

            # Generating information about TSUID chunks
            # ([chunk_index, sd, ed], ...)
            ts_info = []
            for chunk_index in range(
                    _ts_chunk_count(tsuid=tsuid,
                                    md_list=md_list,
                                    chunk_size=chunk_size)):
                ts_info.append(
                    _ts_chunk(tsuid=tsuid,
                              index=chunk_index,
                              md_list=md_list,
                              chunk_size=chunk_size))

            # Parallelizing information to work with spark
            # Each chunk can be computed separately, so divided into len(chunks) partitions
            rdd_ts_info = spark_context.parallelize(ts_info,
                                                    max(8, len(ts_info)))

            # RDD containing the list of points values for every chunk of a TSUID
            # (without timestamps):
            # ([chunk_index, [[timestamp, value], ...], ...)
            rdd_ts_dps = rdd_ts_info \
                .map(lambda x: (x[0], _ts_read(tsuid=tsuid, start_date=x[1], end_date=x[2])))

            # This RDD is used multiple times, caching it to speed up
            rdd_ts_dps.cache()

            if compute_value:
                # Compute metadata related to "value" information
                result = calc_qual_stats_value(tsuid,
                                               rdd_ts_dps,
                                               force_save=force_save)
                # Append to final results
                if tsuid in results:
                    results[tsuid].update(result[tsuid])
                else:
                    results.update(result)

            if compute_time:
                # Compute metadata related to "time" information
                result = calc_qual_stats_time(tsuid,
                                              rdd_ts_dps,
                                              force_save=force_save)
                # Append to final results
                if tsuid in results:
                    results[tsuid].update(result[tsuid])
                else:
                    results.update(result)

            # We don't need the cache anymore
            rdd_ts_dps.unpersist()
    except Exception as cause:
        raise IkatsException("Quality stats failure with ...", cause)
    finally:
        ScManager.stop()
    return ts_list, results
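
A minimal usage sketch of calc_quality_stats above; the TSUIDs are illustrative, the output structure comes from the docstring:

# Hypothetical call computing both value- and time-related quality metadata.
ts_list, stats = calc_quality_stats(ts_list=["tsuid_1", "tsuid_2"],
                                    compute_value=True,
                                    compute_time=True)
for tsuid, metadata in stats.items():
    for md_name, md_value in metadata.items():
        print("%s: %s = %s" % (tsuid, md_name, md_value))
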
Beispiel #27
0
def unwrap_ts_list(ts_list,
                   unit=TSUnit.Radians,
                   discontinuity=None,
                   fid_pattern="%(fid)s__unwrap",
                   use_spark=True):
    """
    Unwrap a list of TS by changing deltas between values to 2*discontinuity complement.
    Unwrap phase of each TS composing the dataset

    :param ts_list: list of TSUID to unwrap
    :param unit: TS unit : "Degrees" or "Radians" (default)
    :param discontinuity: Maximum discontinuity between values.
    :param fid_pattern: Pattern of the new FID ('%(fid)s' will be replaced by original FID)
    :param use_spark: Set to True to use spark. True is default

    :type ts_list: list
    :type unit: str or TSUnit
    :type discontinuity: float or None
    :type fid_pattern: str
    :type use_spark: bool

    :return: a new ts_list
    :rtype: list

    :raises TypeError: if input is not well formatted
    """

    if not isinstance(ts_list, list) or len(ts_list) == 0:
        raise TypeError("ts_list shall be a list having at least one TS")

    if discontinuity is None:
        raise ValueError("Discontinuity is not filled")

    results = []
    if use_spark:
        # Get Spark Context
        spark_context = ScManager.get()

        try:

            # Parallelize 1 TS = 1 partition
            rdd_ts_list = spark_context.parallelize(ts_list, len(ts_list))

            rdd_results = rdd_ts_list.map(
                lambda x: unwrap_tsuid(tsuid=x["tsuid"],
                                       fid=x["funcId"],
                                       fid_pattern=fid_pattern,
                                       discontinuity=discontinuity,
                                       unit=unit))

            # Persist data to not recompute them again
            # (Functional identifier reservation called multiple times through IkatsApi.ts.create_ref)
            rdd_results.cache()

            timings = rdd_results.map(lambda x: x[1]).reduce(
                lambda x, y: x + y)

            results = rdd_results.map(lambda x: x[0]).collect()

            rdd_results.unpersist()

            LOGGER.debug("Unwrapping %s TS using Spark: %s", len(ts_list),
                         timings.stats())
        finally:
            # Stop the context
            ScManager.stop()
    else:
        timings = Timings()
        for item in ts_list:
            tsuid = item["tsuid"]
            fid = item["funcId"]
            result, tsuid_timings = unwrap_tsuid(tsuid=tsuid,
                                                 fid=fid,
                                                 fid_pattern=fid_pattern,
                                                 discontinuity=discontinuity,
                                                 unit=unit)
            results.append(result)
            timings += tsuid_timings

        LOGGER.debug("Unwrapping %s TS: %s", len(ts_list), timings.stats())
    return results
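
A minimal usage sketch of unwrap_ts_list above; the TS reference and the discontinuity value (pi, a common choice for phase unwrapping) are illustrative assumptions:

import numpy as np

# Hypothetical call: unwrap a single TS expressed in radians, without Spark.
unwrapped = unwrap_ts_list(ts_list=[{"tsuid": "tsuid_1", "funcId": "fid_1"}],
                           unit=TSUnit.Radians,
                           discontinuity=np.pi,
                           fid_pattern="%(fid)s__unwrap",
                           use_spark=False)
# Returns a new ts_list describing the unwrapped timeseries.
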
Beispiel #28
0
def spark_ccf(tdm,
              tsuid_list_or_dataset,
              lag_max=None,
              tsuids_out=False,
              cut_ts=False):
    """
    This function calculates the maximum of the cross correlation function matrix between all ts
    in **tsuid_list_or_dataset** IN A DISTRIBUTED MODE (using spark)

    Cross correlation is a correlation between two timeseries where one is delayed by successive lag
    values. The result of CCF is a timeseries (the correlation as a function of the lag between the timeseries).
    This function keeps the maximum value of the generated CCF and puts it into the matrix for the
    corresponding timeseries couple.

    :returns: a string matrix (whose size is equal to the number of tsuids in tsuid_list_or_dataset
              plus one line and one column for headers)
    :rtype: ndarray

    :param tdm: Temporal Data Manager client
    :param tsuid_list_or_dataset: list of identifiers of the time series or dataset name
    :param lag_max: maximum lag between timeseries (cf. _ccf function for more details)
    :param tsuids_out: True to fill headers with tsuids
                       False to fill headers with functional ids
    :param cut_ts: Cut the TS list to the min-length if set to True

    :type tdm: TemporalDataMgr
    :type tsuid_list_or_dataset: list of str or str
    :type lag_max: positive int
    :type tsuids_out: boolean
    :type cut_ts: bool

    :raises TypeError: if tdm is not a TemporalDataMgr
    :raises TypeError: if tsuid_list_or_dataset is not a list nor a string
    :raises TypeError: if tsuids_out is not a boolean
    """
    if type(tdm) is not TemporalDataMgr:
        raise TypeError("tdm must be a TemporalDataMgr")

    if type(tsuid_list_or_dataset) is not list and type(
            tsuid_list_or_dataset) is not str:
        raise TypeError(
            "tsuid_list_or_dataset must be a list of string OR a string")

    if type(tsuids_out) is not bool:
        raise TypeError("tsuids_out must be a boolean")

    if type(cut_ts) is not bool:
        raise TypeError("cut_ts must be a boolean")

    if type(tsuid_list_or_dataset) is list:
        # input is a list of tsuid
        tsuid_list = tsuid_list_or_dataset
    else:
        # input is a dataset name
        dataset = tdm.get_data_set(tsuid_list_or_dataset)
        tsuid_list = dataset['ts_list']

    if tsuids_out:
        ts_list = tsuid_list
    else:
        ts_list = __retrieve_func_id(tdm, tsuid_list)

    md_list = tdm.get_meta_data(tsuid_list)

    # initialize size of time series
    min_ts_size = md_list[tsuid_list[0]]['qual_nb_points']

    if cut_ts:
        for ts in tsuid_list:
            min_ts_size = min(min_ts_size, md_list[ts]['qual_nb_points'])
    else:
        # check time series have same length
        for ts in tsuid_list:
            size_ts = md_list[ts]['qual_nb_points']
            if size_ts != min_ts_size:
                raise ValueError('time series do not have same length')

    # Create or get a spark Context
    sc = ScManager.get()

    # Build the RDD with TSUIDS
    rdd = sc.parallelize(tsuid_list)

    # Create a broadcast for spark jobs
    broadcast = sc.broadcast({
        "host": tdm.host,
        "port": tdm.port,
        "size_of_ts": min_ts_size,
        "lag_max": lag_max
    })

    # Create an accumulator to store the results of the spark workers
    accumulator = sc.accumulator(dict(), ListAccumulatorParam())

    def run_ccf_spark(working_tsuids):
        """
        Method called by spark job
        :param working_tsuids: rdd item
        :type working_tsuids: tuple
        """

        # cross correlation is equal to 1 if timeseries are the same
        if working_tsuids[0] == working_tsuids[1]:
            result = 1
        else:
            spark_tdm = TemporalDataMgr(host=broadcast.value['host'],
                                        port=broadcast.value['port'])

            result = __run_max_ccf_ts_list(tdm=spark_tdm,
                                           tsuids=list(working_tsuids),
                                           size=int(
                                               broadcast.value['size_of_ts']),
                                           lag_max=broadcast.value['lag_max'])

        accumulator.add({";".join(list(working_tsuids)): result})

    # Get TS content and perform ccf calculation using spark distribution to increase performance
    # for each element of rdd which is a couple of timeseries
    # the list of couples is first sorted then duplicates are suppressed to avoid doing same calculation
    # as for (a,b) and (b,a)
    rdd.cartesian(rdd).map(
        lambda x: tuple(sorted(list(x)))).distinct().foreach(run_ccf_spark)

    # Retrieving result from accumulator to fill matrix result
    ts_nb = len(tsuid_list)
    matrix_corr = np.zeros((ts_nb, ts_nb))
    for str_couple in accumulator.value:
        couple = str_couple.split(';')
        matrix_corr[
            tsuid_list.index(couple[0]),
            tsuid_list.index(couple[1])] = accumulator.value[str_couple]
        matrix_corr[
            tsuid_list.index(couple[1]),
            tsuid_list.index(couple[0])] = accumulator.value[str_couple]

    # fill final matrix with headers
    matrix = __fill_headers_to_final_matrix(matrix_corr, ts_list)

    return matrix
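
A minimal usage sketch of spark_ccf above, based on its signature and docstring; the dataset name is an illustrative assumption:

# Hypothetical call: compute the max-CCF matrix for all TS of a dataset, cutting every TS
# to the shortest length and labelling headers with functional ids.
tdm = TemporalDataMgr()
matrix = spark_ccf(tdm=tdm,
                   tsuid_list_or_dataset="my_dataset",
                   lag_max=None,
                   tsuids_out=False,
                   cut_ts=True)
# matrix is a string ndarray with one extra row and column for headers.
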
Beispiel #29
0
def correlation_ts_list_loop(ts_list,
                             corr_method,
                             context_meta,
                             variable_meta='metric',
                             config=ConfigCorrelationLoop(
                                 the_num_partitions=24,
                                 the_point_cache_size=50e6,
                                 the_digits_number=4)):
    """
    Computes the correlations between timeseries selected by observed variables and contexts.

    The observed contexts are defined by the context_meta argument.
    The variables are defined by variable_meta argument.

    Assumed:
      - Each context has a list of distinct variables.
      - Each timeseries is uniquely associated to one context and one variable.

    Example with Airbus data:
      - the *context* is a flight in an Airbus dataset of timeseries.
      - the *variables* could be metric 'WS1', metric 'WS2' etc.

    This algorithm is spark-distributed on the cluster.

    Spark summary
    *************

      - **step 1** The driver prepares a set of configured tuples: each tuple is configured for one context,
               and has a list of (variable, timeseries reference). Timeseries references are tsuids.

      - **step 2** An RDD is initialized from the set of cells **'configured tuples'**

      - **step 3** A new RDD is computed from step 2: each cell **'configured tuple'** is transformed into a list of
        **'correlation inputs'**: this cell is prepared to be processed by the correlation method, for a
        subpart of the correlation matrix computed for one context

        At this step, each task executes: *_spark_combine_pairs()*

      - **step 4** A new RDD is computed as a set of **'correlation result'** cells from the **'correlation inputs'**
        cells: each task reads timeseries pairs and computes the correlation result with the selected method
        (Pearson, ...)

        At this step, each task executes: *_spark_correlate_pairs()*

      - **step 5**: aggregates **'correlation result'** cells by variable pairs into an RDD of
        **'aggregated correlations'** cells. Each task will

        1. create and save the low-level results CorrelationsByContext into the IKATS database, as JSON content.

          .. seealso:: the JSON is described in the
            ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict()

        2. return **'aggregated correlation'** cells providing

          - pair of variable indexes
          - aggregated values: Mean, Variance
          - saved reference of CorrelationsByContext

        At this step, each task executes: *_spark_build_corrs_by_context()*

      - **step 6**: the driver collects the RDD of **'aggregated correlations'**, and computes the high-level result,
        which is a CorrelationDataset.

        Finally the JSON generated by CorrelationDataset is returned.

    :param ts_list: selected timeseries list on which are computed the correlations
    :type ts_list: list
    :param corr_method: the method computing the correlation between 2 timeseries.

      The value must be in CORRELATION_METHODS.

      Choose PEARSON to apply the pearson correlation.
    :type corr_method: str
    :param context_meta: name of the metadata identifying each observed context,
      where correlations are computed.

      .. note:: this metadata shall exist for each timeseries, otherwise the
        latter will be ignored.

      With Airbus example: 'FlightIdentifier' identifies the flight as observed context.

    :type context_meta: str
    :param variable_meta: Optional, with default value 'metric',
      the name of the metadata identifying the variables.

      .. note:: this metadata shall exist for each timeseries, otherwise the
        latter will be ignored.

      The metadata values will be sorted in a list providing the effective indexes of the correlation matrix:
      the N-th index is reserved for the timeseries having the N-th value of
      this metadata in alphanumeric order.

      It is advised to keep the default value: this advanced argument must provide distinct indexes for each
      timeseries under the same observed context.

    :type variable_meta: str
    :return: JSON-friendly dict grouping

      - Matrix of means of correlations (see step5)

      - Matrix of variances of correlations (see step5)

      - Matrix of references to the JSON content of CorrelationByContext (see step 5)

      .. seealso:: detailed JSON structure in
        ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict()

    :rtype: dict as json-friendly structure for json library
    :raises IkatsException: when an error occurred while processing the correlations.
    """

    sc = None

    try:
        LOGGER.info("Starting correlation loop ...")
        LOGGER.info(" - observed contexts based on: %s", context_meta)
        LOGGER.info(" - variables ordered by: %s", variable_meta)

        # Check parameters
        corr_func = CORRELATION_FUNCTIONS.get(corr_method, None)
        if corr_func is None:
            msg = "Unknown correlation method from CORRELATION_FUNCTIONS: corr_method={}"
            raise IkatsException(msg.format(corr_method))

        if type(ts_list) is not list:
            msg = "Unexpected type: list expected for ts_list={}"
            raise IkatsException(msg.format(msg.format(ts_list)))

        if type(context_meta) is not str or len(context_meta) == 0:
            msg = "Unexpected arg value: defined str is expected for context_meta={}"
            raise IkatsException(msg.format(context_meta))
        if type(variable_meta) is not str or len(variable_meta) == 0:
            msg = "Unexpected arg value: defined str is expected for variable_meta={}"
            raise IkatsException(msg.format(variable_meta))

        # Hypothesis: the metadata can be loaded on the driver side

        ts_metadata_dict = IkatsApi.md.read(ts_list)

        # Note: the algorithm discards any variable X for which no Corr(X, Y) with Y != X is computable,
        #       but when X is retained, the final result presents Corr(X, X) beside Corr(X, Y)
        corr_loop_config, sorted_contexts, sorted_variables = _initialize_config_from_meta(
            ts_metadata_dict,
            context_meta=context_meta,
            variable_meta=variable_meta)

        LOGGER.info("- sorted_contexts=%s", sorted_contexts)
        LOGGER.info("- sorted_variables=%s", sorted_variables)

        nb_contexts = len(sorted_contexts)

        if nb_contexts * len(sorted_variables) == 0:
            # The algorithm simply returns an empty result when there is no consistent variable or context:
            #
            # - case 1: there is no computable Corr(X, Y)
            #           where variables X and Y are different, for the same context
            # - case 2: missing metadata for context_meta => no context
            # - case 3: missing metadata for variable_meta => no variable
            #
            LOGGER.warning("Empty result from selection=%s", ts_list)
            obj_empty_result = CorrelationDataset()
            obj_empty_result.set_contexts(contexts=sorted_contexts,
                                          meta_identifier=context_meta)
            obj_empty_result.set_variables(labels=sorted_variables)
            obj_empty_result.add_matrix(matrix=[],
                                        desc_label="Empty Mean correlation")
            obj_empty_result.add_matrix(
                matrix=[], desc_label="Empty Variance correlation")
            obj_empty_result.add_rid_matrix(matrix=[])

            return obj_empty_result.get_json_friendly_dict()

        # Computes the number of matrix chunks
        # (one matrix chunk will be handled by one task)
        # -----------------------------------------------
        if nb_contexts < config.num_partitions:
            # Case when there are fewer contexts than recommended partitions:
            # - the computing of one matrix is split into several chunks
            nb_matrix_blocks = ceil(float(config.num_partitions) / nb_contexts)
        else:
            nb_matrix_blocks = 1

        LOGGER.info("- number of matrix blocks by context=%s",
                    nb_matrix_blocks)
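        # Illustrative example (hypothetical values): with config.num_partitions=8 and
        # nb_contexts=3, nb_matrix_blocks = ceil(8 / 3) = 3, so the correlation matrix of
        # each context is split into 3 chunks (roughly 9 tasks at step 2).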

        # Computes the timeseries LRU cache size used by one task
        # -------------------------------------------------------
        # 1/ retrieve the number of points of each TS; the default value 1e6 is assumed
        #    in order to be robust when 'qual_nb_points' is not available (should not happen ...)
        defined_nb_points = [
            int(v.get('qual_nb_points', 1e6))
            for v in ts_metadata_dict.values()
        ]
        # 2/ evaluate the number of points handled by one task carrying one matrix chunk
        total_nb_points_by_ctx = sum(
            defined_nb_points) / nb_contexts / nb_matrix_blocks
        if config.the_point_cache_size >= total_nb_points_by_ctx:
            # the best case:
            # the system can memorize in the cache every TS loaded for the same matrix
            ts_cache_size = len(sorted_variables)
        else:
            # the case when the number of TS memorized in the cache must be limited,
            # for the same row of the correlation matrix
            # Note: len(sorted_variables) == max size of a correlation row == matrix dimension
            ts_cache_size = config.the_point_cache_size / total_nb_points_by_ctx * len(
                sorted_variables)
            ts_cache_size = ceil(max(2.0, ts_cache_size))
        LOGGER.info("- ts_cache_size=%s", ts_cache_size)

        # release ts_metadata_dict from memory
        ts_metadata_dict = None

        sc = ScManager.get()

        # Spark_step_1: initialize the RDD
        # ------------
        # OUTPUT: RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] )

        rdd_initial_config = sc.parallelize(corr_loop_config,
                                            config.num_partitions)

        # Spark_step_2: combine the pairs of timeseries by contexts and by chunks
        # ------------
        # INPUT:  RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] )
        # OUTPUT: RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] )
        #
        #    where <pair X_Y> is ((<var X index>, <tsuid X> ), (<var Y index>, <tsuid Y>))
        #
        # PROCESS: computes the cartesian product and splits the list of pairs into smaller-sized lists
        #
        rdd_var_combinations = rdd_initial_config.flatMap(
            lambda x: _spark_combine_pairs(context=x[0],
                                           variables=x[1],
                                           nb_corr_matrix_blocks=
                                           nb_matrix_blocks))
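        # Illustrative example (hypothetical content): for one context with variables
        # [(0, 'tsA'), (1, 'tsB'), (2, 'tsC')], the expected pairs are
        # ((0, 'tsA'), (1, 'tsB')), ((0, 'tsA'), (2, 'tsC')), ((1, 'tsB'), (2, 'tsC')),
        # distributed over nb_matrix_blocks lists.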

        if nb_matrix_blocks > 1:
            # reshuffles all the data over the cluster ...
            rdd_var_combinations = rdd_var_combinations.repartition(
                nb_contexts * nb_matrix_blocks)

        # Spark_step_3: computes the correlations
        # ------------
        # INPUT:  RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] )
        # OUTPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> )
        #
        #  where
        #    <computed corr X_Y> is (<context>, (<tsuid X>, <tsuid Y>), correlation)
        #
        # PROCESS: computes the correlations on the timeseries associated to the variables
        #
        rdd_correlations = rdd_var_combinations.flatMap(
            lambda x: _spark_correlate_pairs(context=x[0],
                                             var_pairs=x[1],
                                             corr_method=corr_method,
                                             ts_cache_size=ts_cache_size))

        # generates the parent_id:
        #   presently this identifier may be used by the Postgres admin
        #   to group the low-level results attached to the same high-level result
        #   => at the moment a label including a timestamp is generated
        obj_result = CorrelationDataset()
        parent_id = obj_result.get_id()

        def r_append(data, computed_corr):
            """
            Appends a computed correlation to the aggregated list
            :param data: the list aggregated for one (var X, var Y) key
            :param computed_corr: one (<context index>, (tsuid_X, tsuid_Y), <correlation result>) tuple
            :return: data, extended with computed_corr
            """
            data.append(computed_corr)
            return data

        def r_merge(one, two):
            """
            Merges the list two into the list one
            :param one: first aggregated list
            :param two: second aggregated list
            :return: one, extended with the elements of two
            """
            one.extend(two)
            return one

        # Spark_step_4: aggregate the correlations by pair of variables
        # ------------
        # INPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> ) as described previously
        #
        # OUTPUT: RDD of ( (<var X index>, <var Y index>), list of tuples:
        #                                  (<context index>, (tsuid_X, tsuid_Y), <correlation result> )
        #                )
        # PROCESS: aggregates, by key=(<var X index>, <var Y index>), the correlation results
        #          enhanced with their tsuid pairs
        #
        rdd_agg_correlations = rdd_correlations.aggregateByKey(
            zeroValue=[], seqFunc=r_append, combFunc=r_merge)
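        # Illustrative behaviour of the aggregation above, starting from zeroValue=[]:
        #   r_append([], corr_1)        -> [corr_1]           (within one partition)
        #   r_merge([corr_1], [corr_2]) -> [corr_1, corr_2]   (across partitions)
        # so each (var X, var Y) key ends up with the list of its per-context correlation tuples.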

        # Spark_step_5:
        # ------------
        # INPUT: RDD of  ( (<var X index>, <var Y index>), list of tuples:
        #                                  (<context index>, (tsuid_X, tsuid_Y), <correlation result> )
        #                )
        #
        # OUTPUT: RDD of ( ( <var X index>, <var Y index>), <low-level Result ID>, <Mean correlation>, <Var correlation>
        #                )
        # PROCESS: - creates and saves aggregated low-level results as CorrelationsByContext
        #          - computes Mean and Variance of low-level results
        #          - returns summarized info: Mean+Variance+ result ID
        rdd_results_corr_by_context = \
            rdd_agg_correlations.map(lambda x: (_spark_build_corrs_by_context(variables=x[0],
                                                                              agg_ctx_ts_corr=x[1],
                                                                              desc_context=context_meta,
                                                                              sorted_variables=sorted_variables,
                                                                              sorted_contexts=sorted_contexts,
                                                                              corr_method=corr_method,
                                                                              parent_id=parent_id,
                                                                              ndigits=config.the_digits_number)))

        # Spark_step_6:
        # ------------
        #
        # 6.1: collects
        #
        # INPUT: RDD of  ( (<var X index>, <var Y index>), <low-level Result ID>, <Mean(corr)>, <Var(corr)>
        #                )
        #
        # OUTPUT: collected list
        #
        # PROCESS:  collects high-level results
        #
        collected_results_corr = rdd_results_corr_by_context.collect()

        # 6.2: prepare the result
        #
        #  - Encodes the returned json-friendly content from the collected high-level results
        #  - returns the result
        #
        matrix_mean = get_triangular_matrix(dim=len(sorted_variables),
                                            default_value_diag=1.0,
                                            default_value_other=None)

        matrix_variance = get_triangular_matrix(dim=len(sorted_variables),
                                                default_value_diag=0.0,
                                                default_value_other=None)

        matrix_id = get_triangular_matrix(dim=len(sorted_variables),
                                          default_value_diag=None,
                                          default_value_other=None)

        for var_index_pair, data_oid, mean, variance in collected_results_corr:
            var_index_row = var_index_pair[0]
            var_index_col = var_index_pair[1]
            # required: recomputes the position of the cell within its row
            # triangular matrix => cell(i, j) is stored at position j-i of the row triangular_matrix[i]
            matrix_mean[var_index_row][var_index_col - var_index_row] = mean
            matrix_variance[var_index_row][var_index_col -
                                           var_index_row] = variance
            matrix_id[var_index_row][var_index_col - var_index_row] = data_oid
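        # Illustrative example (hypothetical dim=3): assuming rows of decreasing length (3, 2, 1),
        # the pair (var 1, var 2) is stored at matrix_mean[1][2 - 1] == matrix_mean[1][1].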

        obj_result.set_contexts(contexts=sorted_contexts,
                                meta_identifier=context_meta)

        obj_result.set_variables(sorted_variables)
        obj_result.add_matrix(matrix=matrix_mean,
                              desc_label="Mean Correlation")
        obj_result.add_matrix(matrix=matrix_variance, desc_label="Variance")
        obj_result.add_rid_matrix(matrix_id)

        LOGGER.info("... ended correlation loop.")
        return obj_result.get_json_friendly_dict()

    except Exception:
        LOGGER.error("... ended correlation loop with error.")
        raise IkatsException("Failed execution: correlation_ts_loop()")
    finally:
        if sc:
            ScManager.stop()
Beispiel #30
0
    def test_sliding_window_recovery(self):
        """
        Testing the recovery parameter.
        """
        sax_info = ConfigSax(paa=3,
                             sequences_size=6,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[1, 1],
                             alphabet_size=6)
        ts_name = ["linear_time_serie"]
        spark_ctx = ScManager.get()
        # Test with recovery = 0.5
        result, _ = sliding_windows(ts_list=ts_name,
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)

        result = result.collect()
        # 2 non-overlapping sequences in the timeseries => 3 sequences with a 50 % recovery
        self.assertEqual(len(result), 3)

        # Test with MAX recovery
        # recovery = 1 (the maximum: 100 % <=> the next window starts one point to the right)
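        # Illustrative note (assuming the mocked series has 12 points, as the slices below suggest):
        # a window of size 6 advancing one point per step would typically yield sequences
        # starting at indices 0..6.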
        sax_info.recovery = 1.0
        result, _ = sliding_windows(ts_list=ts_name,
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)
        result = result.collect()

        # remember that, in the 'sliding_window' function, 'get_ts_mock(ts_name)[0]' is called
        ts = get_ts_mock(ts_name)[0]
        ts_val_0 = list(ts[0:6][:, 1])
        ts_val_1 = list(ts[6:12][:, 1])
        timestamp_0 = list(ts[0:6][:, 0])
        timestamp_1 = list(ts[6:12][:, 0])

        # Check the timestamps and the values of the sequences
        # result[i] = (key, 2D array whose rows are [timestamp, value])

        # check ts values
        # (the builtin all() is used here: np.all() applied to a generator is always True)
        condition = (all(set(result[i][1][:, 1]) <= set(ts_val_0)
                         for i in range(len(result)))
                     or all(set(result[i][1][:, 1]) <= set(ts_val_1)
                            for i in range(len(result))))

        self.assertTrue(condition)

        # check timestamps
        condition = (all(set(result[i][1][:, 0]) <= set(timestamp_0)
                         for i in range(len(result)))
                     or all(set(result[i][1][:, 0]) <= set(timestamp_1)
                            for i in range(len(result))))
        self.assertTrue(condition)

        # Test with MINIMUM recovery
        # recovery close to 0 (almost no overlap between consecutive windows)
        sax_info.recovery = 0.01
        result2, _ = sliding_windows(ts_list=ts_name,
                                     sax_info=sax_info,
                                     spark_ctx=spark_ctx)
        result2 = result2.collect()
        # 2 non-overlapping sequences in the timeseries => 2 sequences
        self.assertEqual(len(result2), 2)