def test_collision_same_words(self):
    """
    The words are all the same
    """
    sc = ScManager.get()
    sax_result = SaxResult(paa=sc.parallelize([]), breakpoints=[],
                           sax_word='abcdabcdabcdabcd')
    sax, _, _ = sax_result.start_sax(4, spark_ctx=sc)
    sequences_size = np.array(sax.collect()).shape[1]

    result, _ = final_collision_matrix(sax=sax,
                                       number_of_iterations=6,
                                       index_selected=2,
                                       word_len=sequences_size,
                                       spark_ctx=sc)
    result = result.data

    # exactly the same words => six cells of maximum of combinations
    nb_cell = 0
    for i in result:
        if i[0] == 6:
            nb_cell += 1
    self.assertEqual(nb_cell, 6)
def test_sliding_window_sax_basic(self):
    """
    Test the nominal case
    """
    sax_info = ConfigSax(paa=3,
                         sequences_size=6,
                         with_mean=True,
                         with_std=True,
                         global_norm=False,
                         local_norm=False,
                         linear_filter=False,
                         recovery=0.5,
                         coefficients=[0.1, 0.9],
                         alphabet_size=3)
    spark_ctx = ScManager.get()
    result, _ = sliding_windows(ts_list=["linear_time_serie"],
                                sax_info=sax_info,
                                spark_ctx=spark_ctx)

    sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                      paa=sax_info.paa,
                                      alphabet_size=sax_info.alphabet_size)

    # recovery = 0.5 and word_size = 3 => sax_result = 'aab abc bcc'
    self.assertEqual(sax_result.sax_word, 'aababcbcc')
def test_sw_sax_limit_constant(self):
    """
    Test sliding window and SAX on a constant timeseries with two greater values
    """
    sax_info = ConfigSax(paa=10,
                         sequences_size=10,
                         with_mean=True,
                         with_std=True,
                         global_norm=False,
                         local_norm=False,
                         linear_filter=False,
                         recovery=0.5,
                         coefficients=[0.1, 0.9],
                         alphabet_size=5)
    spark_ctx = ScManager.get()
    result, _ = sliding_windows(ts_list=["specific_time_serie"],
                                sax_info=sax_info,
                                spark_ctx=spark_ctx)
    print("result={}".format(result.collect()))

    sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                      paa=sax_info.paa,
                                      alphabet_size=sax_info.alphabet_size)
    print("sax_word={}".format(sax_result.sax_word))

    # PAA_value = 0 => 'c'
    # PAA_value = 10 => 'e' or 'd'
    # PAA_value = -10 => 'a' or 'b'
    self.assertIn(sax_result.sax_word, ['ccccccccae', 'ccccccccbd'])
def test_coll_various_words(self):
    """
    Test the collision matrix for same and different words

    The words 0 and 3 are the same, the words 1 and 2 too
    """
    nb_paa = 5
    nb_index = 2
    sc = ScManager.get()
    sax_result = SaxResult(paa=sc.parallelize([]), breakpoints=[],
                           sax_word=''.join(['ababa', 'cdcdc', 'cdcdc', 'ababa']))
    sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
    sequences_size = np.array(sax.collect()).shape[1]

    result, _ = final_collision_matrix(sax=sax,
                                       number_of_iterations=int(binom(nb_paa, nb_index)),
                                       index_selected=nb_index,
                                       word_len=sequences_size,
                                       spark_ctx=sc)
    result = result.data
    result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]), int(x[1][1])))
    print(result)

    # The maximum number of possible combinations without repetition is 10
    # Two cells of 10: one for the collisions between words 1 and 2, another for words 0 and 3
    for i in range(2):
        self.assertTrue(result[i][0] == 10)
    self.assertTrue(int(result[0][1][0]) == 2 and int(result[0][1][1]) == 1)
    self.assertTrue(int(result[1][1][0]) == 3 and int(result[1][1][1]) == 0)
def _apply_motif_iter_zero_coll(self, activate_spark):
    """
    Test
      - with the iterative method to search the neighborhood motif,
      - with/without spark jobs
      - and where the words are all different => no collisions
    """
    spark_context = ScManager.get()

    # Build the SAX result with different words, and small breakpoints
    sax_result = SaxResult(paa=spark_context.parallelize([]),
                           breakpoints=[-0.3, -0.1, 0.1, 0.3],
                           sax_word='abcdebcdeacdeabdeabceabcd')
    sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
    # sax is an RDD -> to np.array
    sax = np.transpose(sax.collect())

    breakpoint = sax_result.build_mindist_lookup_table(nb_seq)

    # Different words => only zero cells in the collision matrix
    collision_matrix = SparseMatrix(np.zeros((nb_seq, nb_seq)))

    # Build the class for motif search
    search_info = NeighborhoodSearch(size_sequence=20,
                                     mindist_lookup_table=breakpoint,
                                     alphabet_size=5,
                                     sax=np.transpose(sax),
                                     radius=1000,
                                     collision_matrix=collision_matrix)

    recognition_info = ConfigRecognition(is_stopped_by_eq9=True,
                                         iterations=100,
                                         min_value=1,
                                         is_algo_method_global=False,
                                         activate_spark=activate_spark,
                                         radius=1000,
                                         neighborhood_method=OPT_USING_BRUTE_FORCE)

    # neighborhood_method=OPT_USING_BRUTE_FORCE
    result = search_info.motif_neighborhood_iterative(30, recognition_info)
    # There are no similar sequences
    self.assertEqual(len(result), 0)

    # neighborhood_method=OPT_USING_COLLISIONS
    recognition_info.neighborhood_method = OPT_USING_COLLISIONS
    result = search_info.motif_neighborhood_iterative(30, recognition_info)
    # There are no similar sequences
    self.assertEqual(len(result), 0)
def __init__(self, tdm, ts_load_split_size=10):
    """
    Init the spark distance class

    :param tdm: the temporal data manager client
    :type tdm: TemporalDataMgr

    :param ts_load_split_size: size of TS packet to load from TDM
    :type ts_load_split_size: int
    """
    self.tdm = tdm
    self.ts_load_split_size = ts_load_split_size
    self.spark_context = ScManager.get()
    self.logger = logging.getLogger(__name__)
def _run_all_in_master_memory(self, method):
    """
    Run the spark pearson correlation by loading all the TS content (i.e. values) in master memory

    Each coefficient will be computed by a worker (Spark decides the best choice to apply)
    """
    # Create or get a spark Context
    spark_context = ScManager.get()

    # Get TS content
    rdd_content = self._get_ts(spark_context)

    # Job distribution is made by Statistics.corr (Spark correlation matrix calculation)
    self.results = Statistics.corr(rdd_content, method=method)

    ScManager.stop()
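# Illustrative sketch (not part of the class above): a self-contained example of the
# Statistics.corr call used in _run_all_in_master_memory, on a locally built RDD.
# The observation values below are made up for illustration; only pyspark is required.
def _example_statistics_corr():
    """
    Compute a Pearson correlation matrix with Spark MLlib from a small RDD of
    observation vectors (one row per timestamp, one column per timeseries).
    """
    from pyspark import SparkContext
    from pyspark.mllib.stat import Statistics

    sc = SparkContext.getOrCreate()
    # 4 observations of 3 variables: the first two grow together, the third decreases
    rdd_content = sc.parallelize([[1.0, 10.0, 100.0],
                                  [2.0, 20.0, 90.0],
                                  [3.0, 30.0, 80.0],
                                  [4.0, 40.0, 70.0]])
    # Returns a numpy.ndarray of shape (3, 3); here corr[0][1] == 1.0 and corr[0][2] == -1.0
    return Statistics.corr(rdd_content, method="pearson")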
def test_coll_near_same_words(self):
    """
    The words have 1, or 2, or 3, or 4 collisions, but they are not exactly the same
    because words have five letters
    """
    nb_paa = 5
    nb_index = 2
    sc = ScManager.get()
    sax_result = SaxResult(paa=sc.parallelize([]), breakpoints=[],
                           sax_word=''.join(['aaaaa', 'abbbb', 'abccc', 'abcdd', 'abcde']))
    sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
    sequences_size = np.array(sax.collect()).shape[1]

    result, _ = final_collision_matrix(sax=sax,
                                       number_of_iterations=int(binom(nb_paa, nb_index)),
                                       index_selected=nb_index,
                                       word_len=sequences_size,
                                       spark_ctx=sc)

    # Sorted result list
    result = result.data
    result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]), int(x[1][1])))
    print(result)

    # Sorted list expected:
    expected_result = [(1.0, (2, 1)),
                       (1.0, (3, 1)),
                       (3.0, (3, 2)),
                       (1.0, (4, 1)),
                       (3.0, (4, 2)),
                       (6.0, (4, 3))]
    expected_result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]), int(x[1][1])))

    self.assertEqual(len(result), len(expected_result))
    for expected_item, res_item in zip(expected_result, result):
        self.assertEqual(expected_item[0], res_item[0], 'nb collisions')
        self.assertEqual(expected_item[1][0], res_item[1][0], 'seq index left-side')
        self.assertEqual(expected_item[1][1], res_item[1][1], 'seq index right-side')
def test_sliding_window_filter(self):
    """
    Testing linear filter.
    """
    sax_info = ConfigSax(paa=3,
                         sequences_size=6,
                         with_mean=True,
                         with_std=True,
                         global_norm=False,
                         local_norm=False,
                         linear_filter=True,
                         recovery=0.5,
                         coefficients=[1, 0.5],
                         alphabet_size=6)
    spark_ctx = ScManager.get()

    # Test for linear sequences
    result, _ = sliding_windows(ts_list=["linear_time_serie"],
                                sax_info=sax_info,
                                spark_ctx=spark_ctx)
    result = result.collect()

    # All sequences are linear => no sequence kept
    self.assertEqual(len(result), 0)

    # Test for constant sequences with no recovery (recovery = 0 => no overlap between sequences)
    sax_info.coefficients = [0, 1]
    sax_info.recovery = 0
    result, _ = sliding_windows(ts_list=["ts_with_constant_pattern"],
                                sax_info=sax_info,
                                spark_ctx=spark_ctx)
    result = result.collect()
    LOGGER.info("result=%s", result)
    LOGGER.info("ts_init=%s", get_ts_mock("ts_with_constant_pattern"))

    # Sequence of 12 pts, recovery = 0 (no recovery) -> 2 sequences
    self.assertEqual(len(result), 2)
def test_collision_different_words(self):
    """
    The words are all different
    """
    nb_paa = 5
    nb_index = 2
    sc = ScManager.get()
    sax_result = SaxResult(paa=sc.parallelize([]), breakpoints=[],
                           sax_word=''.join(['abcde', 'fghij', 'klmno', 'pqrst', 'uvwxy']))
    sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
    sequences_size = np.array(sax.collect()).shape[1]

    result, _ = final_collision_matrix(sax=sax,
                                       number_of_iterations=int(binom(nb_paa, nb_index)),
                                       index_selected=nb_index,
                                       word_len=sequences_size,
                                       spark_ctx=sc)
    result = result.data

    # Different words => only zero cells in the matrix
    self.assertEqual(len(result), 0)
def _apply_motif_global_coll_ex1(self, activate_spark):
    """
    Test
      - with the global method to search the neighborhood motif,
      - with/without spark according to activate_spark
      - exploring similarities with the collisions heuristic
      - with input: the words have only one different letter, and every sequence Si has
        collisions with Sj with that matrix.

    Note: results ought to be equal to test_global_brute_no_spark_ex1
    """
    # Build the SAX result where the words have only one different letter (words: 5 letters)
    sequences = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"]
    tested_sax_word = ''.join(sequences)
    spark_context = ScManager.get()
    sax_result = SaxResult(paa=spark_context.parallelize([]),
                           breakpoints=[-1.1, -1, 0, 1.501],
                           sax_word=tested_sax_word)
    sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
    # sax is an RDD -> to np.array
    sax = np.transpose(sax.collect())

    breakpoint = sax_result.build_mindist_lookup_table(5)

    # Build a collision matrix (the real collision matrix is different, but we take this one for the test)
    collision_matrix = SparseMatrix(np.array([[0, 0, 0, 0, 0],
                                              [30, 0, 0, 0, 0],
                                              [2, 40, 0, 0, 0],
                                              [4, 8, 50, 0, 0],
                                              [6, 10, 20, 60, 0]]))

    self._print_matrix("test_global_coll_no_spark_ex1", collision_matrix.data, nb_seq)

    # mindist distances:
    # [[ 0.     0.     3.002  5.002  5.202]
    #  [ 0.     0.     0.     2.     2.2  ]
    #  [ 3.002  0.     0.     0.     0.2  ]
    #  [ 5.002  2.     0.     0.     0.   ]
    #  [ 5.202  2.2    0.2    0.     0.   ]]

    # Using neighborhood_method=OPT_USING_COLLISIONS
    #
    # for collisions (0,1) (1,2) (2,3) (3,4) greater than min_value==25
    # and with the collisions heuristic: only sequences having collisions with Si or Sj are examined
    #
    # for radius 1.9 => global result is [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]
    #
    # for radius 2.5 => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
    #                => reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]]
    #
    # for radius 3.5 => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4]]
    #                => reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]]
    #
    # for radius 6   => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]
    #                => reduced to [[0, 1, 2, 3, 4]]
    #
    for radius, expected_res in [[2.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]],
                                 [1.9, [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]],
                                 [3.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]],
                                 [6, [[0, 1, 2, 3, 4]]]]:

        # Build the class for motif search where the min_value is 25
        search_info = NeighborhoodSearch(size_sequence=20,
                                         mindist_lookup_table=breakpoint,
                                         alphabet_size=5,
                                         sax=np.transpose(sax),
                                         radius=radius,
                                         collision_matrix=collision_matrix)

        # For info: here is the mindist:
        # (see _print_mindist_mat doc: in order to activate print)
        self._print_mindist_mat(search_info)

        recognition_info = ConfigRecognition(is_stopped_by_eq9=True,
                                             iterations=0,
                                             min_value=25,
                                             is_algo_method_global=True,
                                             activate_spark=activate_spark,
                                             radius=radius,
                                             neighborhood_method=OPT_USING_COLLISIONS)

        print("radius {}: expected: {}".format(radius, expected_res))
        result = search_info.motif_neighborhood_global(recognition_info.min_value,
                                                       recognition_info)
        print("radius {}: -> global with collisions: {}".format(radius, result))

        self.assertEqual(len(result), len(expected_res))
        for group in result:
            self.assertTrue(group in expected_res)
def run_paa_from_ts_list(tdm, ts_list, paa_size, out_ts=True, save=False, activate_spark=None):
    """
    Compute the Piecewise Aggregation Approximation (PAA) on the **ts_list** provided

    Use spark if necessary

    :param tdm: temporal data manager object
    :type tdm: TemporalDataMgr

    :param ts_list: tsuid list of the TS to calculate the PAA timeseries
    :type ts_list: list

    :param paa_size: number of segments
    :type paa_size: int

    :param out_ts: True means the result will be a TS, False will return only the means
    :type out_ts: bool

    :param save: True means the new TS will be saved in addition of the return
    :type save: bool

    :param activate_spark: True to force spark, False to force local, None to let the algorithm decide
    :type activate_spark: bool or None

    :return: a dict, keyed by TSUID, providing the new TS resulting from the PAA approximation
             or the list of values (with len = paa_size)
    :rtype: dict
    """
    results = {}

    # Define if spark is necessary
    if activate_spark is None:
        md = tdm.get_meta_data(ts_list)
        sum_points = 0
        for tsuid in md:
            if 'qual_nb_points' in md[tsuid]:
                sum_points += float(md[tsuid]['qual_nb_points'])
            else:
                # No information on number of points, consider using spark
                sum_points = 0
                break
        spark_nb_points_trigger = 1E5
        if sum_points == 0 or sum_points / len(ts_list) > spark_nb_points_trigger:
            # Spark is active if the average number of points per TS is greater
            # than spark_nb_points_trigger points
            activate_spark = True

    if activate_spark:
        LOGGER.info("Running PAA using Spark")

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Build the RDD with TSUIDS
        rdd = spark_context.parallelize(ts_list)

        # Create a broadcast for spark jobs
        broadcast = spark_context.broadcast({
            "host": tdm.host,
            "port": tdm.port,
            "paa_size": paa_size,
            "out_ts": out_ts,
            "save": save,
        })

        # Create an accumulator to store the results of the spark workers
        accumulator = spark_context.accumulator(dict(), ListAccumulatorParam())

        def run_paa_spark(working_tsuid):
            """
            Method called by spark job

            :param working_tsuid: rdd item
            """
            spark_tdm = TemporalDataMgr(host=broadcast.value['host'],
                                        port=broadcast.value['port'])
            # noinspection PyBroadException
            try:
                results = run_paa_from_tsuid(tdm=spark_tdm,
                                             tsuid=working_tsuid,
                                             paa_size=broadcast.value['paa_size'],
                                             out_ts=broadcast.value['out_ts'],
                                             save=broadcast.value['save'])[:]
            except Exception:
                results = []

            accumulator.add({working_tsuid: results})

        # Get TS content using spark distribution to increase performance
        # noinspection PyBroadException
        try:
            rdd.foreach(run_paa_spark)
        except Exception:
            LOGGER.warning('Something wrong with spark, Using Local Computation')
            activate_spark = False

        for ts in ts_list:
            if ts in accumulator.value:
                results[ts] = accumulator.value[ts]
            else:
                LOGGER.warning("TS %s has encountered an issue during the spark distribution", ts)

        ScManager.stop()

    if not activate_spark:
        LOGGER.info("Running PAA on single instance")
        for ts in ts_list:
            results[ts] = run_paa_from_tsuid(tdm=tdm,
                                             tsuid=ts,
                                             paa_size=paa_size,
                                             out_ts=out_ts,
                                             save=save)

    return results
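# Usage sketch for run_paa_from_ts_list (assumption: the TSUIDs below are placeholders and
# a live TemporalDataMgr backend is reachable; this is an illustration, not part of the module).
def _example_run_paa(tdm):
    """
    Minimal illustration: compute a 10-segment PAA for two hypothetical TSUIDs,
    returning only the PAA means without saving new timeseries.

    :param tdm: an already configured TemporalDataMgr client
    """
    paa_results = run_paa_from_ts_list(tdm=tdm,
                                       ts_list=["tsuid_A", "tsuid_B"],  # hypothetical TSUIDs
                                       paa_size=10,
                                       out_ts=False,          # return only the means
                                       save=False,            # do not write back to the database
                                       activate_spark=None)   # let the function decide
    # paa_results is keyed by TSUID
    return paa_results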
def test_sax(self):
    """
    Test where the PAA values need no aggregation (4 PAA for 4 points in a sequence)
    and the PAA values are equidistant
    """
    sax_info = ConfigSax(paa=4,
                         sequences_size=4,
                         with_mean=True,
                         with_std=True,
                         global_norm=False,
                         local_norm=False,
                         linear_filter=False,
                         recovery=0.5,
                         coefficients=[0.1, 0.9],
                         alphabet_size=4)
    spark_ctx = ScManager.get()
    result, _ = sliding_windows(ts_list=["simple_sequences_ts0", "simple_sequences_ts1"],
                                sax_info=sax_info,
                                spark_ctx=spark_ctx)
    LOGGER.info("sliding_windows done!")

    sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                      paa=sax_info.paa,
                                      alphabet_size=sax_info.alphabet_size)
    result = result.collect()
    LOGGER.info("sax_result=%s", sax_result)
    LOGGER.info("result=%s", result)

    # The PAA: [[4, 4, 0, 2], [-2, 2, -2, 0]]
    self.assertEqual(sax_result.paa.collect(), [4, 4, 0, 2, -2, 2, -2, 0])
    # The result expected: 'ddbc acab'
    self.assertEqual(sax_result.sax_word, 'ddbcacab')

    # Test where the PAA values are actually aggregated (12 points -> 4 segments)
    sax_info = ConfigSax(paa=4,
                         sequences_size=12,
                         with_mean=True,
                         with_std=True,
                         global_norm=False,
                         local_norm=False,
                         linear_filter=False,
                         recovery=0.5,
                         coefficients=[0.1, 0.9],
                         alphabet_size=4)
    result, _ = sliding_windows(ts_list=["sequences_1_ts0", "sequences_1_ts1"],
                                sax_info=sax_info,
                                spark_ctx=spark_ctx)
    sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                      paa=sax_info.paa,
                                      alphabet_size=sax_info.alphabet_size)

    # The PAA: [[1, 4, -2, 1], [4, -2, -3, -3]]
    self.assertEqual(sax_result.paa.collect(), [1, 4, -2, 1, 4, -2, -3, -3])
    # The result expected: 'cdbc dbaa'
    self.assertEqual(sax_result.sax_word, 'cdbcdbaa')
def run_sax_from_ts_list(ts_list, alphabet_size, word_size, normalize=False, activate_spark=None):
    """
    Perform the Symbolic Aggregate Approximation (SAX) on the TSUID list provided in **ts_list**

    Use spark if necessary

    .. note::
        If spark fails, the local computation will be performed

    :param ts_list: tsuid list of the TS to calculate the PAA timeseries
    :type ts_list: list

    :param alphabet_size: number of characters in result word
    :type alphabet_size: int

    :param word_size: number of segments
    :type word_size: int

    :param activate_spark: True to force spark, False to force local, None to let the algorithm decide
    :type activate_spark: bool or None

    :param normalize: Apply the normalization of the TS if True (False: default)
    :type normalize: bool

    :return: a dict, keyed by TSUID, composed of the PAA result, the SAX breakpoints,
             the SAX string and the points for each TSUID
    :rtype: dict
    """
    results = {}

    # Define if spark is necessary
    if activate_spark is None:
        md = IkatsApi.md.read(ts_list=ts_list)
        sum_points = 0
        for tsuid in md:
            if 'qual_nb_points' in md[tsuid]:
                sum_points += float(md[tsuid]['qual_nb_points'])
            else:
                # No information on number of points, consider using spark
                sum_points = 0
                break
        spark_nb_points_trigger = 1E5
        if sum_points == 0 or sum_points / len(ts_list) > spark_nb_points_trigger:
            # Spark is active if the average number of points per TS is greater
            # than spark_nb_points_trigger points
            activate_spark = True

    if activate_spark:
        LOGGER.info("Running SAX using Spark")

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Build the RDD with TSUIDS
        rdd = spark_context.parallelize(ts_list)

        # Create a broadcast for spark jobs
        broadcast = spark_context.broadcast({
            "alphabet_size": alphabet_size,
            "word_size": word_size,
            "normalize": normalize,
        })

        # Create an accumulator to store the results of the spark workers
        accumulator = spark_context.accumulator(dict(), ListAccumulatorParam())

        def run_sax_spark(working_tsuid):
            """
            Method called by spark job

            :param working_tsuid: rdd item
            """
            results = run_sax_from_tsuid(tsuid=working_tsuid,
                                         alphabet_size=broadcast.value['alphabet_size'],
                                         word_size=broadcast.value['word_size'],
                                         normalize=broadcast.value['normalize'])

            accumulator.add({working_tsuid: results})

        # Get TS content using spark distribution to increase performance
        # noinspection PyBroadException
        try:
            rdd.foreach(run_sax_spark)
        except Exception:
            LOGGER.warning('Something wrong with spark, Using Local Computation')
            activate_spark = False

        for ts in ts_list:
            if ts in accumulator.value:
                results[ts] = accumulator.value[ts]
            else:
                LOGGER.warning("TS %s has encountered an issue during the spark distribution", ts)

        ScManager.stop()

    if not activate_spark:
        LOGGER.info("Running SAX on single instance")
        for ts in ts_list:
            results[ts] = run_sax_from_tsuid(tsuid=ts,
                                             alphabet_size=alphabet_size,
                                             word_size=word_size,
                                             normalize=normalize)

    return results
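# Usage sketch for run_sax_from_ts_list (assumption: the TSUIDs below are placeholders and a
# live ikats backend is available; this is an illustration, not part of the module).
def _example_run_sax():
    """
    Minimal illustration: build 10-letter SAX words over a 5-letter alphabet for two
    hypothetical TSUIDs, with normalization enabled, letting the function decide whether
    to distribute the work with Spark.
    """
    sax_results = run_sax_from_ts_list(ts_list=["tsuid_A", "tsuid_B"],  # hypothetical TSUIDs
                                       alphabet_size=5,
                                       word_size=10,
                                       normalize=True,
                                       activate_spark=None)
    # sax_results is keyed by TSUID; each value holds the PAA result, the SAX breakpoints
    # and the SAX string produced by run_sax_from_tsuid
    return sax_results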
def test_sliding_window_norm(self):
    """
    Testing global and local norm.
    """
    epsilon = 1.0e-10
    # recovery = 0 (no recovery) -> 3 seq of 4 points (nb_points = 12)
    sax_info = ConfigSax(paa=3,
                         sequences_size=4,
                         with_mean=True,
                         with_std=True,
                         global_norm=True,
                         local_norm=False,
                         linear_filter=False,
                         recovery=0,
                         coefficients=[0.1, 1],
                         alphabet_size=6)
    spark_ctx = ScManager.get()

    # Test with global normalization: the timeseries is normalized
    result, coeff = sliding_windows(ts_list=["linear_time_serie"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)
    result = result.collect()
    coeff = coeff.collect()

    # Check coeff: coeff holds the mean and variance of each sequence
    # 12 points, no recovery (recovery=0) -> 3 seq of 4 points
    self.assertEqual(len(coeff), 3)

    # ts_value is an array with the sequences values
    ts_value = np.array([])
    for i, _ in enumerate(result):
        # result[i] = (key, list([timestamps, values],[,],...))
        ts_value = np.concatenate((result[i][1][:, 1], ts_value))

    LOGGER.info("result=%s", result)
    # No recovery => 3 seq * 4 points = 12 values = nb_points
    self.assertEqual(len(ts_value), 12)

    LOGGER.info("ts_std=%s", (ts_value.std()))
    LOGGER.info("ts_mean=%s", np.mean(ts_value))

    # Global normalisation => ts_value has a standard deviation of 1 and a mean of 0
    self.assertTrue(1 - epsilon < np.std(ts_value) < 1 + epsilon)
    self.assertTrue(-epsilon < np.mean(ts_value) < epsilon)

    # Test with local normalization: all the sequences are normalized
    sax_info.global_norm = False
    sax_info.local_norm = True
    sax_info.linear_filter = True
    # Recovery = 1: maximum recovery
    sax_info.recovery = 1

    result, coeff = sliding_windows(ts_list=["ts_with_constant_pattern"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)
    result = result.collect()

    # Verify that each sequence is normalized
    for i, _ in enumerate(result):
        # result[i] = (key, list([timestamps, values],[,],...))
        seq_value = result[i][1][:, 1]
        self.assertTrue(1 - epsilon < np.std(seq_value) < 1 + epsilon)
        self.assertTrue(-epsilon < np.mean(seq_value) < epsilon)
def discretize_dataset(ds_name, nb_buckets, table_name, operators_list=None, nb_points_by_chunk=100000):
    """
    This function discretizes each time series provided through the dataset name input:
       1. The interval between start date and end date of each time series is divided into
          nb_buckets intervals of the same size.
       2. Each operator from the input list is processed on each bucket previously defined
       3. The result is formatted as a table whose entries are:
           - each time series processed in rows
           - combinations of (each operator) X (each bucket number) in columns

    The result table also contains the bucket definitions (bucket_number, start date, end date)

    :param ds_name: name of the dataset processed
    :type ds_name: str

    :param nb_buckets: number of buckets wanted for each time series of the dataset
    :type nb_buckets: int

    :param table_name: name of the table
    :type table_name: str

    :param operators_list: list of operators to be calculated on buckets from Operators class (see above)
    :type operators_list: list

    :param nb_points_by_chunk: size of chunks in number of points
                               (assuming time series are periodic and without holes)
    :type nb_points_by_chunk: int

    :returns: a dict containing all data awaited by the functional ikats type table
    :rtype: dict

    :raises TypeError: if ds_name is not a string or is None
    :raises TypeError: if nb_buckets is not an integer
    :raises ValueError: if nb_buckets is zero
    :raises ValueError: if operators_list is None
    :raises ValueError: if operators_list items are not in Operators class
    :raises TypeError: if operators_list items are not all string
    :raises ValueError: if number of buckets exceeds number of points for one time series
    """
    # Check inputs validity
    if ds_name is None or type(ds_name) is not str:
        raise TypeError('valid dataset name must be defined (got %s, type: %s)' % (ds_name, type(ds_name)))
    try:
        nb_buckets = int(nb_buckets)
    except (TypeError, ValueError):
        raise TypeError('Number of buckets must be an integer (got value %s)' % nb_buckets)
    if nb_buckets == 0:
        raise ValueError("Number of buckets must be not null")
    if operators_list is None:
        raise ValueError("operators list must be not null")
    elif type(operators_list) is not list:
        raise ValueError("operators list must be a list")
    elif not operators_list:
        raise ValueError("operators list must not be empty list")
    if table_name is None or re.match('^[a-zA-Z0-9-_]+$', table_name) is None:
        raise ValueError("Error in table name")

    # Check content of operators list provided
    for operator in operators_list:
        if type(operator) is not str:
            raise TypeError('Operator must be a string (got %s)' % (type(operator)))
        if operator not in [op.name for op in Operators]:
            raise ValueError("Operators (string) must be in the following values list : %s"
                             % [op.name for op in Operators])

    # Extract tsuid list from inputs
    tsuid_list = IkatsApi.ds.read(ds_name)['ts_list']

    # Get list of metadata for all TS
    meta_dict = IkatsApi.md.read(tsuid_list)

    # Initialize result
    result = {}
    try:
        LOGGER.info("Running discretization using Spark")
        # Create or get a spark Context
        sc = ScManager.get()

        # Running discretization time series by time series
        for index, tsuid in enumerate(tsuid_list):

            result[tsuid] = {}

            LOGGER.info('Processing Discretization for TS %s (%s/%s)', tsuid, index + 1, len(tsuid_list))

            sd = int(meta_dict[tsuid]['ikats_start_date'])
            ed = int(meta_dict[tsuid]['ikats_end_date'])
            nb_points = int(meta_dict[tsuid]['qual_nb_points'])

            # Using qual_ref_period if defined, extrapolating otherwise
            if 'qual_ref_period' in meta_dict[tsuid]:
                period = int(float(meta_dict[tsuid]['qual_ref_period']))
            else:
                period = int(float((ed - sd) / nb_points))

            # Checking buckets size regarding time series size
            if nb_buckets > nb_points:
                msg = "Number of buckets exceeds number of points for ts (%s, %s)" \
                      % (tsuid, IkatsApi.ts.fid(tsuid))
                LOGGER.error(msg)
                raise ValueError(msg)

            # Definition of buckets size in ms
            bucket_size_ms = ceil((ed - sd) / nb_buckets)

            # Definition of spark chunks size in ms
            chunks_size_ms = nb_points_by_chunk * period

            # Definition of buckets start/end dates
            buckets_timestamps = np.hstack((np.arange(sd, ed, bucket_size_ms, dtype=int), ed))
            buckets = [(buckets_timestamps[i] + 1, buckets_timestamps[i + 1])
                       for i in range(len(buckets_timestamps) - 1)]

            # Start date of first bucket is decreased by 1 ms to catch the first time series value
            buckets[0] = (buckets[0][0] - 1, buckets[0][1])

            # Add bucket number
            data_to_compute = [(a, b[0], b[1]) for a, b in enumerate(buckets)]

            # Store buckets definition in results
            result[tsuid]['buckets'] = data_to_compute

            # Starting spark process
            # OUTPUT : [(nb_bucket, sd_bucket, ed_bucket), ...]
            inputs = sc.parallelize(data_to_compute, len(data_to_compute))

            # INPUT :  [(nb_bucket, sd_bucket, ed_bucket), ...]
            # OUTPUT : [(nb_bucket, sd_chunk, ed_chunk), ...]
            # PROCESS : cut buckets into chunks of data if smaller and repartition rdd
            rdd_chunks_timestamps = inputs \
                .flatMap(lambda x: (_spark_chunk(x[0], x[1], x[2], chunks_size_ms)))

            # INPUT :  [(nb_bucket, sd_chunk, ed_chunk), ...]
            # OUTPUT : [(nb_bucket, data_array), ...]
            # PROCESS : extract data within buckets
            rdd_chunks_data = rdd_chunks_timestamps \
                .map(lambda x: (x[0], IkatsApi.ts.read(tsuid_list=[tsuid], sd=int(x[1]), ed=int(x[2]))[0])) \
                .filter(lambda x: len(x[1]) > 0)

            # INPUT :  [(nb_bucket, data_array), ...]
            # OUTPUT : [(nb_bucket, {info1: , info2: , ..., infon: }), ...]
            # PROCESS : calculate operators on data chunks
            rdd_chunks_calc = rdd_chunks_data \
                .map(lambda x: _spark_calc_op_on_chunks(x[0], x[1], operators_list)) \
                .filter(lambda x: x is not None)

            # INPUT :  [(nb_bucket, {info1: , info2: , ..., infon: }), ...]
            # OUTPUT : [(nb_bucket, {info1: , info2: , ..., infon: }), ...] reduced by bucket number
            # PROCESS : reduce operators results on data buckets
            result_by_bucket = rdd_chunks_calc.reduceByKey(lambda x, y: _spark_reduce_op_chunk(x, y)).collect()

            # Extract and calculate final results by bucket
            for bucket in result_by_bucket:
                bucket_nb = bucket[0]
                infos = bucket[1]

                result[tsuid][bucket_nb] = {}

                for operator in operators_list:
                    if operator == 'MIN':
                        result[tsuid][bucket_nb]['MIN'] = float(infos["MIN"])
                    if operator == 'MAX':
                        result[tsuid][bucket_nb]['MAX'] = float(infos["MAX"])
                    if operator == 'AVG':
                        # Computation of the final mean
                        avg_value = float(infos["SUM"]) / float(infos["NB_POINTS"])
                        result[tsuid][bucket_nb]['AVG'] = avg_value
                    if operator == 'STD':
                        # Computation of the final mean and standard deviation
                        avg_value = float(infos["SUM"]) / float(infos["NB_POINTS"])
                        # The variance is capped to 0 because it could be negative
                        # (but very near to zero) due to the subtraction of
                        # very close floating point values
                        variance = max(float(float(infos["SQR_SUM"]) / int(infos["NB_POINTS"]) - avg_value ** 2), 0)
                        std_deviation = variance ** 0.5
                        result[tsuid][bucket_nb]['STD'] = std_deviation

        # Format result to fit the table type
        description = "Result of Discretize operator with %s buckets for %s" % (nb_buckets, operators_list)
        table = _fill_table_structure_to_store(json_result=result,
                                               operators_list=operators_list,
                                               nb_buckets=nb_buckets,
                                               tsuid_list=tsuid_list,
                                               table_name=table_name,
                                               table_desc=description)

        # Save the table
        IkatsApi.table.create(data=dict(table))

    except Exception as error:
        msg = "Exception raised while discretizing with Spark"
        LOGGER.error(msg + ": %s " % error)
        raise IkatsException(msg)
    finally:
        # Stop spark Context
        ScManager.stop()

    # Return the name of the table saved
    return table_name
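# Usage sketch for discretize_dataset (assumption: "my_dataset" is a placeholder dataset name
# and a live ikats backend with Spark is available; operator names are taken from the handling above).
def _example_discretize():
    """
    Minimal illustration: split every timeseries of a hypothetical dataset into 20 buckets
    and compute MIN, MAX, AVG and STD on each bucket, storing the result in a new table.
    """
    table_name = discretize_dataset(ds_name="my_dataset",  # hypothetical dataset
                                    nb_buckets=20,
                                    table_name="my_dataset_discretized",
                                    operators_list=["MIN", "MAX", "AVG", "STD"],
                                    nb_points_by_chunk=100000)
    return table_name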
def compute_slope(ts_list, fid_suffix="_slope", chunk_size=75000, save_new_ts=True):
    """
    Compute the slope of a list of timeseries using spark

    This implementation computes slope for one TS at a time in a loop.
    To know the details of the computation, see the corresponding method

    :param ts_list: list of TS. Each item is a dict composed of a TSUID and a functional id
    :param fid_suffix: Functional identifier suffix of the final timeseries
    :param chunk_size: Number of points per chunk (assuming the TS is periodic)
    :param save_new_ts: True (default) if TS must be saved to database

    :type ts_list: list of dict
    :type fid_suffix: str
    :type chunk_size: int
    :type save_new_ts: bool

    :return: the new list of derived TS (same order as input)
    :rtype: list of dict

    :raise TypeError: if ts_list type is incompatible
    """
    # Check inputs
    if not isinstance(ts_list, list):
        raise TypeError("ts_list shall be a list")
    if len(ts_list) == 0:
        raise TypeError("ts_list must have at least one element")

    LOGGER.info('Computing Slope for %s TS', len(ts_list))

    tsuid_list = ts_list
    try:
        # Extract TSUID from ts_list
        tsuid_list = [x['tsuid'] for x in ts_list]
    except Exception:
        # Already a tsuid_list.
        # Getting the functional id for each ts
        ts_list = [{'tsuid': x, 'funcId': IkatsApi.fid.read(x)} for x in ts_list]

    # Gather all metadata for the list of TS to compute slope
    md_list = IkatsApi.md.read(tsuid_list)

    # Results will be stored here
    results = []

    try:
        # Get Spark Context
        spark_context = ScManager.get()

        for index, tsuid in enumerate(tsuid_list):
            fid = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0]

            LOGGER.info('Processing Slope for TS %s (%s/%s) (%s)', fid, (index + 1), len(tsuid_list), tsuid)

            computed_tsuid, computed_fid = compute_slope_for_tsuid(spark_context=spark_context,
                                                                   fid=fid,
                                                                   fid_suffix=fid_suffix,
                                                                   tsuid=tsuid,
                                                                   md_list=md_list,
                                                                   chunk_size=chunk_size,
                                                                   save_new_ts=save_new_ts)

            # Append results to final results
            results.append({"tsuid": computed_tsuid, "funcId": computed_fid})
    finally:
        # Stop spark context in all cases
        ScManager.stop()

    return results
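# Usage sketch for compute_slope (assumption: the TSUID/funcId pairs below are placeholders
# and a live ikats backend with Spark is available; this is an illustration, not part of the module).
def _example_compute_slope():
    """
    Minimal illustration: derive two hypothetical timeseries, appending "_slope" to their
    functional identifiers and saving the resulting timeseries.
    """
    derived = compute_slope(ts_list=[{"tsuid": "tsuid_A", "funcId": "fid_A"},  # hypothetical TS
                                     {"tsuid": "tsuid_B", "funcId": "fid_B"}],
                            fid_suffix="_slope",
                            chunk_size=75000,
                            save_new_ts=True)
    # Same order as the input list, each item a dict with the new tsuid and funcId
    return derived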
def _resample(resampling_way, ts_list, resampling_period,
              adding_method=AddingMethod.LINEAR_INTERPOLATION,
              timestamp_position=TimestampPosition.BEG,
              aggregation_method=AggregationMethod.AVG,
              nb_points_by_chunk=50000,
              generate_metadata=False):
    """
    Function that effectively resamples (UP or DOWN according to resampling_way value) using Spark

    :param resampling_way: way of resampling (UP or DOWN)
    :type resampling_way: ResamplingWay

    :param ts_list: list composing the TS information to resample [{'tsuid': xxx, 'funcId': yyy }, ...]
    :type ts_list: list of dict

    :param resampling_period: target period for resampling (in ms)
    :type resampling_period: int

    :param adding_method: Method to use for interpolation (see type AddingMethod for more information)
    :type adding_method: AddingMethod or str or int

    :param timestamp_position: timestamp position in the interval while downsampling
    :type timestamp_position: str ('BEG','MID','END')

    :param aggregation_method: aggregation method for downsampling
    :type aggregation_method: str ('MIN','MAX','MED','AVG','FIRST','LAST')

    :param nb_points_by_chunk: user defined number of points used for a spark chunk of data (after resampling)
    :type nb_points_by_chunk: int

    :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points)
    :type generate_metadata: boolean (default : False)

    :returns: a dict keyed by the original TSUID, whose values are {'tsuid': xxx, 'funcId': yyy}
    :rtype: dict
    """
    if not ts_list:
        return []

    fid_dict = dict()
    for ts in ts_list:
        fid_dict[ts['funcId']] = ts['tsuid']

    # List of chunks of data and associated information to parallelize with Spark
    data_to_compute = []

    # Extract tsuid list from inputs
    tsuid_list = [x["tsuid"] for x in ts_list]

    # Checking metadata availability before starting resampling
    meta_list = IkatsApi.md.read(tsuid_list)

    # Collecting information from metadata
    for tsuid in tsuid_list:
        if tsuid not in meta_list:
            LOGGER.error("Timeseries %s : no metadata found in base", tsuid)
            raise ValueError("No ikats metadata available for resampling %s" % tsuid)
        if 'ikats_start_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_start_date' for timeseries %s not found in base", tsuid)
            raise ValueError("No start date available for resampling [%s]" % tsuid)
        if 'ikats_end_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_end_date' for timeseries %s not found in base", tsuid)
            raise ValueError("No end date available for resampling [%s]" % tsuid)
        if 'qual_ref_period' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'qual_ref_period' for timeseries %s not found in base", tsuid)
            raise ValueError("No reference period available for resampling [%s]" % tsuid)

        # Original timeseries information retrieved from metadata
        sd = int(meta_list[tsuid]['ikats_start_date'])
        ed = int(meta_list[tsuid]['ikats_end_date'])
        ref_period = int(float(meta_list[tsuid]['qual_ref_period']))

        # Get the functional identifier of the original timeseries
        fid_origin = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0]

        # Generate functional id for resulting timeseries
        if resampling_way == ResamplingWay.UP_SAMPLING:
            func_id = "%s_resampled_to_%sms_%s" % (fid_origin, str(resampling_period), str(adding_method))
        else:
            func_id = "%s_resampled_to_%sms_%s_%s" % (fid_origin, str(resampling_period),
                                                      timestamp_position, aggregation_method)

        # Creating new reference in database for new timeseries
        IkatsApi.ts.create_ref(func_id)

        # Prepare data to compute by defining intervals of final size nb_points_by_chunk
        # Chunk intervals computation:

        # Computing elementary size which is the lowest common multiple between ref period and resampling period
        elementary_size = _lowest_common_multiple(ref_period, resampling_period)

        # Seeking the number of elementary sizes containing a number of points nearest to the
        # nb_points_by_chunk parameter, in order to compute the final data chunk size
        nb_points_for_elementary_size = int(elementary_size / resampling_period)
        data_chunk_size = int(nb_points_by_chunk / nb_points_for_elementary_size) * elementary_size

        # Limit the size of data_chunk_size
        if data_chunk_size < elementary_size:
            data_chunk_size = elementary_size

        # Computing intervals for chunk definition
        interval_limits = np.hstack((np.arange(sd, ed, data_chunk_size, dtype=np.int64), ed))

        # From intervals we define chunks of data to compute
        # ex : intervals = [1, 2, 3] => 2 chunks [1, 2] and [2, 3]
        if len(interval_limits) > 2:
            # There are more than 2 limits for interval definition, i.e. more than one chunk to compute
            data_to_compute.extend([(tsuid,
                                     func_id,
                                     i,
                                     interval_limits[i],
                                     interval_limits[i + 1]) for i in range(len(interval_limits) - 1)])
        elif len(interval_limits) > 1:
            # Only one chunk to compute
            data_to_compute.append((tsuid, func_id, 0, interval_limits[0], interval_limits[1]))

        # In case last original point and last downsampled point are aligned
        # => add a supplementary chunk to compute the last point
        if (interval_limits[-1] - sd) % resampling_period == 0:
            data_to_compute.append((tsuid,
                                    func_id,
                                    1,
                                    interval_limits[-1],
                                    interval_limits[-1] + resampling_period))

    LOGGER.info("Running resampling using Spark")
    # Create or get a spark Context
    spark_context = ScManager.get()

    if resampling_way == ResamplingWay.UP_SAMPLING:
        spark_function = _spark_upsample
        args = adding_method
    else:
        spark_function = _spark_downsample
        args = (timestamp_position, aggregation_method)

    try:
        # OUTPUT : [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        inputs = spark_context.parallelize(data_to_compute, len(data_to_compute))

        # INPUT :  [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        # OUTPUT : [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # PROCESS : read original data in database / filter chunk with no data
        rdd_data_with_chunk_index = inputs \
            .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0],
                                                                 sd=int(x[3]),
                                                                 ed=int(x[4]))[0])) \
            .filter(lambda x: len(x[2]) > 0)

        if resampling_way == ResamplingWay.UP_SAMPLING:
            # INPUT :  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
            # OUTPUT : [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...]
            # PROCESS : compute inter-chunks intervals / filter empty chunks
            rdd_data = _calc_inter_chunks(rdd=rdd_data_with_chunk_index) \
                .map(lambda x: (x[0], x[2])) \
                .filter(lambda x: not (len(x[1]) == 2 and (int(float(x[1][0][0])) == int(float(x[1][1][0])))))
        else:
            # INPUT :  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
            # OUTPUT : [((TSUID_origin, func_id), original_data_array), ...]
            # PROCESS : suppress useless chunk indexes
            rdd_data = rdd_data_with_chunk_index.map(lambda x: (x[0], x[2]))

        # INPUT :  [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...]
        # OUTPUT : [((TSUID_origin, func_id), data_resampled_array), ...]
        # PROCESS : resample chunks of data to resampling_period
        rdd_resampled_data = rdd_data \
            .map(lambda x: (x[0], spark_function(data=x[1], period=resampling_period, args=args))) \
            .filter(lambda x: len(x[1]) > 0)

        # INPUT :  [((TSUID_origin, func_id), data_resampled_array), ...]
        # OUTPUT : [(TSUID_origin, func_id, TSUID, sd, ed), ...]
        # PROCESS : create resampled data in database / compute global start and end date
        identifiers = rdd_resampled_data \
            .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1],
                                                            data=x[1],
                                                            generate_metadata=generate_metadata))) \
            .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \
            .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \
            .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \
            .collect()

    except Exception as err:
        msg = "Exception raised while resampling with Spark: %s " % err
        LOGGER.error(msg)
        raise IkatsException(msg)
    finally:
        # Stop spark Context
        ScManager.stop()

    # Post-processing: metadata import and return dict building
    # Returns a dict containing the results of the resampling,
    # where the key is the original TSUID and the values are the resampled TSUID and functional identifiers
    returned_dict = {}
    for timeseries in identifiers:
        tsuid_origin = timeseries[0]
        func_id = timeseries[1]
        tsuid = timeseries[2]
        sd = timeseries[3]
        ed = timeseries[4]

        # Import metadata in non temporal database
        _save_metadata(tsuid=tsuid, md_name='qual_ref_period', md_value=resampling_period,
                       data_type=DTYPE.number, force_update=True)
        _save_metadata(tsuid=tsuid, md_name='ikats_start_date', md_value=sd,
                       data_type=DTYPE.date, force_update=True)
        _save_metadata(tsuid=tsuid, md_name='ikats_end_date', md_value=ed,
                       data_type=DTYPE.date, force_update=True)

        # Retrieve imported number of points from database
        qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid)
        IkatsApi.md.create(tsuid=tsuid, name='qual_nb_points', value=qual_nb_points,
                           data_type=DTYPE.number, force_update=True)

        # Inherit from parent
        IkatsApi.ts.inherit(tsuid, tsuid_origin)

        # Fill returned dict
        returned_dict[tsuid_origin] = {"tsuid": tsuid, 'funcId': func_id}

    return returned_dict
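# Usage sketch for _resample (assumption: this private function is normally reached through a
# public resampling operator; the TS references below are placeholders and a live ikats backend
# with Spark is required).
def _example_upsample():
    """
    Minimal illustration: upsample a hypothetical timeseries to a 1000 ms period using
    linear interpolation.
    """
    resampled = _resample(resampling_way=ResamplingWay.UP_SAMPLING,
                          ts_list=[{"tsuid": "tsuid_A", "funcId": "fid_A"}],  # hypothetical TS
                          resampling_period=1000,
                          adding_method=AddingMethod.LINEAR_INTERPOLATION,
                          nb_points_by_chunk=50000,
                          generate_metadata=False)
    # Keyed by the original TSUID: {"tsuid_A": {"tsuid": ..., "funcId": ...}}
    return resampled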
def _apply_motif_global_same_words(self, activate_spark):
    """
    Test
      - with the global method to search the neighborhood motif,
      - with/without spark jobs according to activate_spark
      - and where the words are all the same
    """
    spark_context = ScManager.get()
    # Build the SAX result with large breakpoints
    sax_result = SaxResult(paa=spark_context.parallelize([]),
                           breakpoints=[-300, -100, 100, 300],
                           sax_word='abcdeabcdeabcdeabcde')
    sax, _, _ = sax_result.start_sax(5, spark_ctx=spark_context)
    # sax is an RDD -> to np.array
    sax = np.transpose(sax.collect())

    breakpoint = sax_result.build_mindist_lookup_table(alphabet_size=5)

    # Build the collision matrix result
    collision_matrix = SparseMatrix(np.array([[0, 0, 0, 0],
                                              [100, 0, 0, 0],
                                              [100, 100, 0, 0],
                                              [100, 100, 100, 0]]))

    # Two identical cases here: brute force / with collisions
    for method_opt in [OPT_USING_BRUTE_FORCE, OPT_USING_COLLISIONS]:

        # mindist distances:
        #
        # [[ 0.  0.  0.  0.]
        #  [ 0.  0.  0.  0.]
        #  [ 0.  0.  0.  0.]
        #  [ 0.  0.  0.  0.]]

        # Build the class for motif search
        search_info = NeighborhoodSearch(size_sequence=20,
                                         mindist_lookup_table=breakpoint,
                                         alphabet_size=5,
                                         sax=np.transpose(sax),
                                         radius=0.01,
                                         collision_matrix=collision_matrix)

        recognition_info = ConfigRecognition(is_stopped_by_eq9=True,
                                             iterations=0,
                                             min_value=1,
                                             is_algo_method_global=True,
                                             activate_spark=activate_spark,
                                             radius=0.01,
                                             neighborhood_method=method_opt)

        # neighborhood_method=OPT_USING_BRUTE_FORCE (compare with all the words)
        result = search_info.motif_neighborhood_global(30, recognition_info)

        self._print_mindist_mat(search_info)

        # The words corresponding to the six largest-value cells have a MINDIST < radius
        self.assertEqual(len(result), 1)
        # These results are the same: [0, 1, 2, 3]: the 6 groups have been reduced to one
        self.assertEqual(result, [[0, 1, 2, 3]])
def cut_ds_from_metric(ds_name, metric, criteria, group_by=None, fid_pattern=None, chunk_size=75000):
    """
    Entry point of the method that cuts a dataset based on the criteria applied to the TS matching the metric

    The criteria expression is a python expression that will be converted to a lambda expression
    with 'M' used as metric value.
    Example: "M > 7 and M not in [1,2,6]"

    :param ds_name: name of the dataset to use
    :param metric: metric used as reference to find cut ranges
    :param criteria: criteria expression describing the value thresholds.
    :param group_by: metadata to iterate on each value (Default to None to not use this behaviour)
    :param fid_pattern: name of the generated TS. Variables can be used:
       - {fid} : Functional identifier
       - {M}   : metric
    :param chunk_size: Size of the ideal chunk (in number of points per chunk)

    :type ds_name: str
    :type metric: str
    :type criteria: str
    :type group_by: str or None
    :type fid_pattern: str
    :type chunk_size: int

    :return: the ts list of the generated TS. [{"funcId": "xx", "tsuid": "xx"}]
    :rtype: list

    :raises ValueError: if dataset is empty
    :raises ValueError: if metric is found several times in dataset
    :raises ValueError: if metric is not found in dataset
    :raises ValueError: if group_by doesn't have a matching reference
    :raises KeyError: if error in fid_pattern
    """
    # List of TS present in dataset
    ts_list = IkatsApi.ds.read(ds_name=ds_name)['ts_list']
    if len(ts_list) == 0:
        LOGGER.error("Dataset %s is empty", ds_name)
        raise ValueError("Dataset %s is empty" % ds_name)

    # Get all the metadata
    md_list = IkatsApi.md.read(ts_list=ts_list)

    # List of all possible values encountered for the group by
    groups_list = None
    if group_by not in [None, ""]:
        # Get all the groups for this group by criterion
        groups_list = _find_all_groups(group_by, md_list)
        LOGGER.info("%s groups found for [%s]", len(groups_list), group_by)
    else:
        # Force to None
        group_by = None

    # Find the reference TS and all TS to cut using this ref
    grouped_ts_list = _find_ts_ref_group(ds_name=ds_name,
                                         md_list=md_list,
                                         metric=metric,
                                         ts_list=ts_list,
                                         group_by=group_by,
                                         group_by_list=groups_list)

    # Get Spark Context
    # Important !!!! Use only this method in Ikats to use a spark context
    spark_context = ScManager.get()

    try:
        result = []
        # For each group (processed in alphabetic order)
        for group in sorted(grouped_ts_list):
            result_iter = _cut_from_metric_for_group(chunk_size=chunk_size,
                                                     criteria=criteria,
                                                     ds_name=ds_name,
                                                     fid_pattern=fid_pattern,
                                                     md_list=md_list,
                                                     metric=metric,
                                                     spark_context=spark_context,
                                                     group=grouped_ts_list[group])

            # Sort functional identifiers alphabetically
            result.extend(sorted(result_iter, key=lambda x: x['funcId']))
        return result
    finally:
        ScManager.stop()
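# Usage sketch for cut_ds_from_metric (assumption: the dataset name, metric and group-by metadata
# below are placeholders and a live ikats backend with Spark is available; the criteria syntax
# follows the docstring example above).
def _example_cut_ds_from_metric():
    """
    Minimal illustration: cut every timeseries of a hypothetical dataset on the ranges where
    the reference metric value exceeds 7, grouping by a hypothetical metadata.
    """
    generated_ts = cut_ds_from_metric(ds_name="my_dataset",       # hypothetical dataset
                                      metric="reference_metric",  # hypothetical metric
                                      criteria="M > 7",
                                      group_by="flight_id",       # hypothetical group-by metadata
                                      fid_pattern="{fid}_cut_by_{M}",
                                      chunk_size=75000)
    # [{"funcId": ..., "tsuid": ...}, ...]
    return generated_ts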
def dataset_cut_spark(tsuid_list, start, end, nb_points, nb_points_by_chunk, generate_metadata, meta_list):
    """
    Cutting dataset algorithm, using spark

    :param tsuid_list: list of tsuid
    :param start: start cut date
    :param end: end cut date
    :param nb_points: number of points to cut
    :param nb_points_by_chunk: number of points per chunk
    :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points)
                              (default: False)
    :param meta_list: dict of metadata (tsuid is the key)

    :type tsuid_list: list
    :type start: int
    :type end: int or None
    :type nb_points: int or None
    :type nb_points_by_chunk: int
    :type generate_metadata: boolean
    :type meta_list: dict

    :return: list of dict {"tsuid": tsuid, "funcId": func_id}
    :rtype: list of dict

    :raise ValueError: if inputs are not filled properly (see called methods description)
    """
    # List of chunks of data and associated information to parallelize with Spark
    data_to_compute = []

    # Collecting information from metadata
    for tsuid in tsuid_list:
        if tsuid not in meta_list:
            LOGGER.error("Time series %s: no metadata found in base", tsuid)
            raise ValueError("No ikats metadata available for cutting %s" % tsuid)
        if 'ikats_start_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_start_date' for time series %s not found in base", tsuid)
            raise ValueError("No start date available for cutting [%s]" % tsuid)
        if 'ikats_end_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_end_date' for time series %s not found in base", tsuid)
            raise ValueError("No end date available for cutting [%s]" % tsuid)
        if 'qual_ref_period' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'qual_ref_period' for time series %s not found in base", tsuid)
            raise ValueError("No reference period available for cutting [%s]" % tsuid)

        # Original time series information retrieved from metadata
        sd = int(meta_list[tsuid]['ikats_start_date'])
        ed = int(meta_list[tsuid]['ikats_end_date'])
        ref_period = int(float(meta_list[tsuid]['qual_ref_period']))

        # Get the functional identifier of the original time series
        fid_origin = IkatsApi.ts.fid(tsuid)

        # Generate functional id for resulting time series
        func_id = "%s_cut_%d" % (fid_origin, time.time() * 1e6)

        # Creating new reference in database for new time series
        IkatsApi.ts.create_ref(func_id)

        # Prepare data to compute by defining intervals of final size nb_points_by_chunk
        # Chunk intervals computation:
        data_chunk_size = int(nb_points_by_chunk * ref_period)

        # Computing intervals for chunk definition
        interval_limits = np.hstack(np.arange(sd, ed, data_chunk_size, dtype=np.int64))

        # From intervals we define chunks of data to compute:
        #
        # 1. defining chunks excluding last point of data within every chunk
        #    ex: intervals = [10, 20, 30, 40] => 2 chunks [10, 19] and [20, 29]
        #        (last chunk added in step 2)
        data_to_compute.extend([(tsuid,
                                 func_id,
                                 i,
                                 interval_limits[i],
                                 interval_limits[i + 1] - 1) for i in range(len(interval_limits) - 1)])

        # 2. adding last interval, including last point of data
        #    ex: [30, 40]
        data_to_compute.append((tsuid,
                                func_id,
                                len(interval_limits) - 1,
                                interval_limits[-1],
                                ed + 1))

    LOGGER.info("Running dataset cut using Spark")
    # Create or get a spark Context
    spark_context = ScManager.get()

    try:
        # OUTPUT: [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        inputs = spark_context.parallelize(data_to_compute, len(data_to_compute))

        # INPUT:  [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        # OUTPUT: [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # PROCESS: read original data in database / filter chunk with no data
        rdd_data = inputs \
            .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0],
                                                                 sd=int(x[3]),
                                                                 ed=int(x[4]))[0])) \
            .filter(lambda x: len(x[2]) > 0)

        # INPUT:  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # OUTPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
        # PROCESS: cut chunks of data, filter empty results
        rdd_cut_chunk_data = rdd_data \
            .map(lambda x: (x[0], x[1], _spark_cut(data=x[2], min_date=start, max_date=end))) \
            .filter(lambda x: len(x[2][1]) > 0) \
            .cache()

        # No end cutting date provided => case of cutting a given number of points
        if end is None:
            # INPUT:  [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), [(chunk_index1, nb_points1), (chunk_index2, nb_points2), ...]), ...]
            # PROCESS: Collect nb points associated to chunk indexes
            ts_pts_by_chunk = rdd_cut_chunk_data.map(lambda x: (x[0], (x[1], x[2][0]))) \
                .groupByKey().map(lambda x: (x[0], list(x[1]))) \
                .collect()

            # Compute for each ts from collected data:
            #   - last chunk index containing points to keep
            #   - the number of points to keep in this last chunk
            # cut_info: {(TSUID_origin1, func_id1): (last_chunk_index1, nb_points1),
            #            (TSUID_origin2, func_id2): (last_chunk_index2, nb_points2), ...}
            cut_info = {}
            for ts in ts_pts_by_chunk:
                nb_cumul = 0
                for chunk_index, points in ts[1]:
                    nb_cumul += points
                    # noinspection PyTypeChecker
                    if nb_cumul > nb_points:
                        # noinspection PyTypeChecker
                        cut_info[ts[0]] = (chunk_index, points - (nb_cumul - nb_points))
                        break
                else:
                    LOGGER.warning(
                        "Number of points cut with start cutting date provided exceeds time series %s size"
                        % IkatsApi.ts.fid(ts[0][0]))
                    # Case nb_points > nb points of the time series
                    # noinspection PyTypeChecker
                    cut_info[ts[0]] = (chunk_index, points)

            # INPUT:  [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...]
            rdd_cut_data = rdd_cut_chunk_data.filter(lambda x: x[1] <= cut_info[x[0]][0]) \
                .map(lambda x: (x[0], x[2][1][:cut_info[x[0]][1]] if x[1] == cut_info[x[0]][0] else x[2][1]))
        else:
            # INPUT:  [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...]
            rdd_cut_data = rdd_cut_chunk_data.map(lambda x: (x[0], x[2][1]))

        # INPUT:  [((TSUID_origin, func_id), data_cut_array), ...]
        # OUTPUT: [(TSUID_origin, func_id, TSUID, sd, ed), ...]
        # PROCESS: create cut data in database / compute global start and end date
        identifiers = rdd_cut_data \
            .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1],
                                                            data=x[1],
                                                            generate_metadata=generate_metadata))) \
            .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \
            .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \
            .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \
            .collect()

    except Exception as err:
        msg = "Exception raised while cutting with Spark: %s " % err
        LOGGER.error(msg)
        raise IkatsException(msg)
    finally:
        # Stop spark Context
        ScManager.stop()

    # Post-processing: metadata import and return dict building
    # Returns list of dict containing the results of the cut time series: TSUID and functional identifiers
    results = []
    for timeseries in identifiers:
        tsuid_origin = timeseries[0]
        func_id = timeseries[1]
        tsuid = timeseries[2]
        sd = timeseries[3]
        ed = timeseries[4]

        # Import metadata in non temporal database
        _save_metadata(tsuid=tsuid, md_name='ikats_start_date', md_value=sd,
                       data_type=DTYPE.date, force_update=True)
        _save_metadata(tsuid=tsuid, md_name='ikats_end_date', md_value=ed,
                       data_type=DTYPE.date, force_update=True)

        # Retrieve imported number of points from database
        qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid)
        IkatsApi.md.create(tsuid=tsuid, name='qual_nb_points', value=qual_nb_points,
                           data_type=DTYPE.number, force_update=True)

        # Inherit from parent
        IkatsApi.ts.inherit(tsuid, tsuid_origin)

        # Fill returned list
        results.append({"tsuid": tsuid, "funcId": func_id})

    return results
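# Usage sketch for dataset_cut_spark (assumption: the TSUIDs below are placeholders and a live
# ikats backend with Spark is available; metadata are read the same way the function expects them).
def _example_dataset_cut():
    """
    Minimal illustration: keep the points of two hypothetical timeseries between two dates
    (timestamps in ms), with chunks of 50000 points.
    """
    tsuid_list = ["tsuid_A", "tsuid_B"]  # hypothetical TSUIDs
    meta_list = IkatsApi.md.read(tsuid_list)
    cut_ts = dataset_cut_spark(tsuid_list=tsuid_list,
                               start=1500000000000,
                               end=1500000600000,
                               nb_points=None,  # not used when an end date is provided
                               nb_points_by_chunk=50000,
                               generate_metadata=False,
                               meta_list=meta_list)
    # [{"tsuid": ..., "funcId": ...}, ...]
    return cut_ts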
def cut_y(original_ts_list, criterion, fid_pattern="{fid}_cutY{compl}", chunk_size=75000): """ Algorithm Cut-Y Cut among Y-axis (values) a list of timeseries matching a criterion defined as a python expression. Matching and non-matching values are separated into 2 timeseries This algorithm uses spark From the TS list provided (used as reference), extract 2 TS list: * The first one matching the value condition * The second one not matching the value condition :param original_ts_list: List of TSUID/funcID to use for filtering: [{tsuid:xxx, funcId:xxx}, ...] :param criterion: python expression used to define a matching pattern :param fid_pattern: pattern used to name the FID of the output TSUID. {fid} will be replaced by the FID of the original TSUID FID {M} will be replaced by the original TSUID metric name {compl} will be replaced by "" or "_compl" depending on the output type (matching/not matching). :param chunk_size: the number of points per chunk :type original_ts_list: list :type criterion: str :type fid_pattern: str :type chunk_size: int :return: 2 lists representing the "matching" and "not matching" list of TS corresponding to the input :rtype: list :raises ValueError: if ts_list is badly formatted :raises TypeError: if ts_list is not a list """ # Check input validity if type(original_ts_list) is not list: raise TypeError("ts_list shall be a list") if len(original_ts_list) == 0: raise ValueError("ts_list shall have at least one element") for _, item in enumerate(original_ts_list): if "tsuid" not in item or "funcId" not in item: raise ValueError("ts_list shall have tsuid and funcId defined") # Get all the metadata md_list = IkatsApi.md.read(ts_list=[x['tsuid'] for x in original_ts_list]) # Prepare the spark items to parallelize # Create and build the data that will be used in spark transformations ts_list_with_new_fid, fid2tsuid = _prepare_spark_data(fid_pattern=fid_pattern, md_list=md_list, ts_list=original_ts_list) # Chunks computation ts_info = [] for ts_data in ts_list_with_new_fid: # Get the chunks raw information chunks = SparkUtils.get_chunks(tsuid=ts_data[0], md_list=md_list, chunk_size=chunk_size) # Build a new list containing only used information for chunk in chunks: ts_info.append({ "tsuid": ts_data[0], "start_date": chunk[1], "end_date": chunk[2], "matching_fid": ts_data[1], "not_matching_fid": ts_data[2], "matching_tsuid": fid2tsuid[ts_data[1]], "not_matching_tsuid": fid2tsuid[ts_data[2]] }) # Get Spark Context # Important !!!! Use only this method in Ikats to use a spark context spark_context = ScManager.get() try: # Prepare the lambda expression. Value is replaced by "Y" variable name lambda_criterion = eval("lambda Y : " + criterion) # OUTPUT : [{ # tsuid:x, # start_date:x, # end_date:x, # matching_fid:x, # not_matching_fid:x, # matching_tsuid:x, # not_matching_tsuid:x # }, ...] # PROCESS : Parallelize TS chunks information rdd_ts_list = spark_context.parallelize(ts_info, max(8, len(ts_info))) # INPUT : [{ # tsuid:x, # start_date:x, # end_date:x, # matching_fid:x, # not_matching_fid:x, # matching_tsuid:x, # not_matching_tsuid:x # }, ...] 
# OUTPUT : [({ # start_date: "date of the first point matching the criterion in the current chunk" # end_date: "date of the last point matching the criterion in the current chunk" # numberOfSuccess: "number of points matching the criterion in the current chunk" # tsuid: "TSUID of the matching part" # }, # { # start_date: "date of the first point not matching the criterion in the current chunk" # end_date: "date of the last point not matching the criterion in the current chunk" # numberOfSuccess: "number of points not matching the criterion in the current chunk" # tsuid: "TSUID of the non-matching part" # }), ...] # PROCESS : Separate points matching and not-matching the criterion in every chunk. Fill the corresponding TS rdd_imported = rdd_ts_list.map(lambda x: _spark_cut_y_chunk( tsuid=x['tsuid'], start_date=x['start_date'], end_date=x['end_date'], match_criterion=lambda_criterion, result_info={ "matching_fid": x['matching_fid'], "not_matching_fid": x['not_matching_fid'], "matching_tsuid": x['matching_tsuid'], "not_matching_tsuid": x['not_matching_tsuid'] })) # INPUT : [({ # start_date: "date of the first point matching the criterion in the current chunk" # end_date: "date of the last point matching the criterion in the current chunk" # numberOfSuccess: "number of points matching the criterion in the current chunk" # tsuid: "TSUID of the matching part" # }, # { # start_date: "date of the first point not matching the criterion in the current chunk" # end_date: "date of the last point not matching the criterion in the current chunk" # numberOfSuccess: "number of points not matching the criterion in the current chunk" # tsuid: "TSUID of the non-matching part" # }), ...] # OUTPUT : [(TSUID, nb_points, start_date, end_date), ...] # PROCESS : Flat the results and simplify the format to allow quick actions on every item rdd_metadata_prep = rdd_imported \ .flatMap(lambda x: x) \ .filter(lambda x: x is not None) \ .map(lambda x: (x['tsuid'], x['numberOfSuccess'], x['start_date'], x['end_date'])) # Delete empty TSUID deleted_tsuid = rdd_metadata_prep \ .map(lambda x: (x[0], x[1])) \ .reduceByKey(lambda x, y: x + y) \ .filter(lambda x: x[1] == 0) \ .map(lambda x: (x[0], IkatsApi.ts.delete(tsuid=x[0]))) \ .map(lambda x: x[0]) \ .collect() # This RDD is reused in several branches. 
# Caching it improves performance rdd_metadata_prep.cache() # Create metadata qual_nb_points rdd_metadata_prep \ .map(lambda x: (x[0], x[1])) \ .reduceByKey(lambda x, y: x + y) \ .filter(lambda x: x[1] > 0) \ .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="qual_nb_points", value=x[1])) # Create metadata ikats_start_date rdd_metadata_prep \ .map(lambda x: (x[0], x[2])) \ .filter(lambda x: x[1] is not None) \ .reduceByKey(lambda x, y: min(x, y)) \ .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_start_date", value=x[1])) # Create metadata ikats_end_date rdd_metadata_prep \ .map(lambda x: (x[0], x[3])) \ .filter(lambda x: x[1] is not None) \ .reduceByKey(lambda x, y: max(x, y)) \ .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_end_date", value=x[1])) # Unpersist the RDD because it is not used anymore rdd_metadata_prep.unpersist() finally: ScManager.stop() # Inherit properties for item in ts_list_with_new_fid: if fid2tsuid[item[1]] not in deleted_tsuid: IkatsApi.ts.inherit(tsuid=fid2tsuid[item[1]], parent=item[0]) if fid2tsuid[item[2]] not in deleted_tsuid: IkatsApi.ts.inherit(tsuid=fid2tsuid[item[2]], parent=item[0]) # Format and sort the results # First output contains the matched data points TS reference # Second output contains the not matched (complement) points TS reference return (_format_output(deleted_tsuid=deleted_tsuid, fid2tsuid=fid2tsuid, ts_list_with_new_fid=ts_list_with_new_fid, index=1), _format_output(deleted_tsuid=deleted_tsuid, fid2tsuid=fid2tsuid, ts_list_with_new_fid=ts_list_with_new_fid, index=2))
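# Illustrative sketch, not the actual _spark_cut_y_chunk() implementation (defined
# elsewhere in this module): it shows how one chunk of [timestamp, value] points can be
# split into "matching" and "not matching" parts with a criterion evaluated on the value (Y).
import numpy as np


def _sketch_split_chunk_by_value(points, match_criterion):
    """Split [[timestamp, value], ...] points according to a value predicate."""
    points = np.asarray(points, dtype=float)
    mask = np.array([bool(match_criterion(value)) for value in points[:, 1]])
    return points[mask], points[~mask]


# Example: a criterion "Y > 3" built the same way as in cut_y() above
# matching, complement = _sketch_split_chunk_by_value(
#     [[1000, 1.0], [2000, 5.0], [3000, 2.0], [4000, 7.0]], eval("lambda Y: Y > 3"))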
def _apply_iter_coll_no_spark_ex1(self, activate_spark): """ Tests motif_neighborhood_iterative() - the iterative method - using the heuristic based upon collisions - to search the neighborhood motif Note: test where the words have only one different letter. """ # Build the SAX result where the words have only one different letter (words: 5 letters) sequences = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"] tested_sax_word = ''.join(sequences) spark_context = ScManager.get() sax_result = SaxResult(paa=spark_context.parallelize([]), breakpoints=[-1.1, -1, 0, 1.501], sax_word=tested_sax_word) sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context) # sax is an rdd -> to np.array sax = np.transpose(sax.collect()) breakpoint = sax_result.build_mindist_lookup_table(5) # Build a collision matrix # Note: this matrix is different from the one from # test test_iterative__brute_no_spark_ex1: # => see zeros are added: coll(3,2) == coll(4,2) == 0 collision_matrix = SparseMatrix( np.array([[ 0, 0, 0, 0, 0, ], [ 40, 0, 0, 0, 0, ], [ 2, 40, 0, 0, 0, ], [ 4, 8, 0, 0, 0, ], [ 6, 10, 0, 50, 0, ]])) self._print_matrix("test_iterative__brute_no_spark_ex1", collision_matrix.data, nb_seq) # mindist distances: # [[ 0. 0. 3.002 5.002 5.202] # [ 0. 0. 0. 2. 2.2 ] # [ 3.002 0. 0. 0. 0.2 ] # [ 5.002 2. 0. 0. 0. ] # [ 5.202 2.2 0.2 0. 0. ]] # Using neighborhood_method=OPT_USING_BRUTE_FORCE # # iterative: examining collisions (i,j) per iteration: # (3,4) then (1,2) +(0,1) # # (collisions greater than min_value==25) # # Test with fixed radius 1.9: # - iter=1 => result is [[3, 4]] considering (S3,S4) neighborhood # - iter=2 => result extended with [0,1,2] considering (S0,S1), unchanged for (S1,S2) # - iter=3 => result is the same than for iter=2: no more collision available # - iter=100 => result is the same than for iter=2: no more collision available # for radius, nb_iter, expected_res in [[1.9, 1, [[3, 4]]], [1.9, 2, [[3, 4], [0, 1, 2]]], [1.9, 3, [[3, 4], [0, 1, 2]]], [1.9, 100, [[3, 4], [0, 1, 2]]]]: # Build the class for motif search where the min_value is 25 search_info = NeighborhoodSearch(size_sequence=20, mindist_lookup_table=breakpoint, alphabet_size=5, sax=np.transpose(sax), radius=radius, collision_matrix=collision_matrix) # for info: here is the mindist: # (see _print_mindist_mat doc: in order to activate print) self._print_mindist_mat(search_info) recognition_info = ConfigRecognition( is_stopped_by_eq9=True, iterations=nb_iter, min_value=25, is_algo_method_global=False, activate_spark=activate_spark, radius=radius, neighborhood_method=OPT_USING_COLLISIONS) result = search_info.motif_neighborhood_iterative( recognition_info.min_value, recognition_info) self.assertEqual(len(result), len(expected_res)) for group in result: self.assertTrue(group in expected_res)
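# For reference, a minimal MINDIST sketch consistent with the distance matrix quoted in
# the comments above (assumption: build_mindist_lookup_table() follows the usual SAX
# lookup table, i.e. cell(r, c) = 0 when |r - c| <= 1, else beta[max(r,c)-1] - beta[min(r,c)]).
import math


def _sketch_mindist(word_a, word_b, breakpoints, size_sequence):
    def cell(r, c):
        return 0.0 if abs(r - c) <= 1 else breakpoints[max(r, c) - 1] - breakpoints[min(r, c)]

    letters = [(ord(a) - ord('a'), ord(b) - ord('a')) for a, b in zip(word_a, word_b)]
    squared_sum = sum(cell(r, c) ** 2 for r, c in letters)
    return math.sqrt(size_sequence / len(word_a)) * math.sqrt(squared_sum)


# _sketch_mindist("abcde", "abcda", [-1.1, -1, 0, 1.501], 20) -> 5.202 (last row/column above)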
def main_test(): """ Functional test entry point """ logger = logging.getLogger("ikats.algo.core.correlation") # Log format logger.setLevel(logging.DEBUG) formatter = logging.Formatter( '%(asctime)s:%(levelname)s:%(funcName)s:%(message)s') # Create another handler that will redirect log entries to STDOUT stream_handler = logging.StreamHandler() stream_handler.setLevel(logging.DEBUG) stream_handler.setFormatter(formatter) logger.addHandler(stream_handler) if os.getenv("PYSPARK_PYTHON") is None: os.putenv("PYSPARK_PYTHON", "/home/ikats/tools/ikats_processing/bin/python") if os.getenv("SPARK_HOME") is None: os.putenv("SPARK_HOME", "/opt/spark") print('Loading Spark Context') # Get a spark Context ScManager.get() tdm = TemporalDataMgr() answer = 'n' tsuid_list = [] ds_name = '' while answer.lower() != 'y': ds_name = input('\nEnter dataset Name: ') tsuid_list = tdm.get_data_set(ds_name)['ts_list'] print("%s TS found in dataset %s" % (len(tsuid_list), ds_name)) if len(tsuid_list) > 0: answer = input( "Run the correlation matrix on these dataset? [Y/n] ") print('Running correlation matrix on %s TS' % len(tsuid_list)) start_time = time.time() sp_corr = SparkCorrelation(tdm) sp_corr.force_parallel_get_ts = True sp_corr.run(tsuid_list) print( "EXECUTION TIME (for %d TS with %d pts/ea = %d points): %.3f seconds" % (len(tsuid_list), sp_corr.ts_len_ref, (len(tsuid_list) * sp_corr.ts_len_ref), (time.time() - start_time))) if os.path.isfile('/tmp/spark_correlation_result_%s.csv' % ds_name): os.remove('/tmp/spark_correlation_result_%s.csv' % ds_name) with open('/tmp/spark_correlation_result_%s.csv' % ds_name, 'w', newline='') as opened_file: opened_file.write(sp_corr.get_csv()) print("Matrix in CSV format is saved at the following location:") print(" /tmp/spark_correlation_result_%s.csv" % ds_name) print("You can check the content by doing :") print(" cat /tmp/spark_correlation_result_%s.csv" % ds_name) print(" less /tmp/spark_correlation_result_%s.csv" % ds_name) print(" vi /tmp/spark_correlation_result_%s.csv" % ds_name)
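# Hedged note on the environment setup above: os.putenv() does not update os.environ,
# so code that later reads os.environ in the same process (as the PySpark launcher
# typically does for SPARK_HOME) will not see values set this way. A safer equivalent:
import os

os.environ.setdefault("PYSPARK_PYTHON", "/home/ikats/tools/ikats_processing/bin/python")
os.environ.setdefault("SPARK_HOME", "/opt/spark")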
def random_projections(ts_list, sax_info, collision_info, recognition_info): """ The Random Projections Algorithm ================================ This algorithm does the following (detailed for 1 TS but valid for many TS): * Apply the sliding window * Normalize the TS (global or/and local) * Filter the linear sequences (optional) and trivial matches * Apply the SAX algorithm * Build the collision matrix * Find the largest value cells in the collision matrix * Search the motif neighborhood ..note:: The algorithm can produce "paa values" (numeric) for each sequence. The problem is the huge length of the results. **Catalogue implementation is provided**: main_random_projections() is calling random_projections() once all configurations ConfigSAX, ConfigCollision, ConfigRecognition are initialized. :param ts_list: list of TSUID :type ts_list: list :param sax_info: the information to make the sliding window and the sax_algorithm :type sax_info: ConfigSax :param collision_info: the information to build the collision matrix :type collision_info: ConfigCollision :param recognition_info: the information to made the pattern _recognition :type recognition_info: ConfigRecognition :return: the list of similar sequences, the sax result, the equation 9 result, and the sequences list :type: list, str, float, list """ LOGGER.info("Configurations deduced from user parameters:") LOGGER.info("- sliding sax nb paa=%s", sax_info.paa) LOGGER.info("- sliding sax alphabet size=%s", sax_info.alphabet_size) LOGGER.info("- sliding sax sequences_size=%s", sax_info.sequences_size) LOGGER.info("- collision nb indexes=%s", collision_info.index) LOGGER.info("- collision nb iterations=%s", collision_info.nb_iterations) LOGGER.info("- collision accepted errors=%s", collision_info.errors) LOGGER.info("- recognition min_value=%s", recognition_info.min_value) LOGGER.info("- recognition iterations=%s", recognition_info.iterations) LOGGER.info("- recognition similarity radius=%s", recognition_info.radius) # Create or get a spark Context LOGGER.info("Running using Spark") spark_ctx = ScManager.get() # INPUT : all the TS { "ts_name" : [[time1, value1],...], "ts_name2": ... } # OUTPUT : rdd_sequences_list = [ (key, sequence), ... ] # rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...] # PROCESS : *sliding_windows* create sequences for each TS (results are RDDs) rdd_sequences_list, rdd_normalization_coefficients = sliding_windows(ts_list=ts_list, sax_info=sax_info, spark_ctx=spark_ctx, trivial_radius=recognition_info.radius / 2) # INPUT : rdd_sequences_list = [ (key, sequence), ... ] # OUTPUT : rdd_sax_result is a SaxResult object containing # * paa (rdd of flatMap) : rdd of large list of all the paa_values concatenated # * breakpoints (list) : list of the breakpoints (len = sax_info.alphabet_size - 1) # * sax_word (large str): large string of all the SAX words concatenated # PROCESS : Give the SAX form of the sequences rdd_sax_result = run_sax_on_sequences(rdd_sequences_data=rdd_sequences_list, paa=sax_info.paa, alphabet_size=sax_info.alphabet_size) # INPUT : rdd_sequences_list = [ (key, sequence), ... ] # OUTPUT : sequences_list = { key: sequence, ...} NOT AN RDD! # PROCESS : transform rdd_sequences_list elements into dict sequences_list = rdd_sequences_list.collectAsMap() # INPUT : rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...] # OUTPUT : sequences_list = { key: (un-normalized seq_mean, un-normalized seq_sd), ...} NOT AN RDD! 
# PROCESS : transform rdd_normalization_coefficients elements into dict normalization_coefficients = rdd_normalization_coefficients.collectAsMap() # Keep only necessary information of each sequence sequences_list = sequences_info(sequences_list, normalization_coefficients) # *paa_sequence* is a "conversion" of *sax* from letters to numbers (matrix with same shape) # (usefull for past-processing the random projection algorithm). breakpoints = [str(i) for i in rdd_sax_result.breakpoints] # Build the table which give the distance between two letters (need just sax_result.breakpoints) mindist_lookup_table = rdd_sax_result.build_mindist_lookup_table(sax_info.alphabet_size) # Give the SAX result in a array (need rdd_sax_result.sax_word and sax_result.paa) rdd_sax, paa_result, number_of_sequences = rdd_sax_result.start_sax(sax_info.paa, spark_ctx=spark_ctx) LOGGER.info("- filtered number of words=%s", number_of_sequences) if number_of_sequences == 1: LOGGER.info("- sliding window find just one sequence, no collision matrix computed.") collision_matrix = SparseMatrix(np.array([[0]])) else: # Build the collision matrix, the number of iteration can change # (if the len of a sequence is too small for example nb_iteration can be < nb_iteration specified) collision_matrix, collision_info.nb_iterations = final_collision_matrix( sax=rdd_sax, number_of_iterations=collision_info.nb_iterations, index_selected=collision_info.index, word_len=sax_info.paa, spark_ctx=spark_ctx) # *collision_matrix* is a sparse matrix : light in memory # Give the result of the Equation 9 eq9_result = equation9(number_of_sequences=number_of_sequences, size_alphabet=sax_info.alphabet_size, size_word=sax_info.paa, errors=collision_info.errors, index_selected=collision_info.index, iterations=collision_info.nb_iterations) sax = rdd_sax.collect() paa_result = np.transpose(paa_result) distance_info = NeighborhoodSearch(size_sequence=sax_info.sequences_size, mindist_lookup_table=mindist_lookup_table, alphabet_size=sax_info.alphabet_size, sax=sax, radius=recognition_info.radius, collision_matrix=collision_matrix) LOGGER.info("- theoretical Eq9 limit: min collisions = %s for accepted errors=%s", eq9_result, collision_info.errors) # Check the eq9_result with min_value if eq9_result < recognition_info.min_value: LOGGER.warning("- setting Eq9 limit to min_value=%s: because Eq9 < min_value", recognition_info.min_value) eq9_result = recognition_info.min_value if eq9_result < 1: LOGGER.warning("- setting Eq9 limit to 1: because Eq9 < 1") eq9_result = 1 # find the motif neighborhood by using the largest value cells in the collision matrix if recognition_info.is_algo_method_global is True: algo_result = distance_info.motif_neighborhood_global(eq9_result, recognition_info) else: algo_result = distance_info.motif_neighborhood_iterative(eq9_result, recognition_info) # Give the results with the names of sequences and not their number in the collision matrix algo_result = result_on_sequences_form(algo_result, sequences_list, sax, sax_info.alphabet_size, paa_result) algo_result = result_on_pattern_form(algo_result) # Give the alphabet used in the SAX algorithm alphabet = start_alphabet(sax_info.alphabet_size) result = {'patterns': algo_result, 'break_points': breakpoints, 'disc_break_points': alphabet} if spark_ctx is not None: ScManager.stop() LOGGER.info("Ended Spark session.") return result
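# Minimal, Spark-free sketch of the random-projection idea behind final_collision_matrix()
# (assumed behaviour, for illustration only): at each iteration a random subset of word
# positions is drawn, and every pair of SAX words agreeing on that projection gets its
# collision cell incremented. The largest cells then seed the motif neighborhood search.
import itertools
import random

import numpy as np


def _sketch_collision_matrix(words, nb_iterations, nb_index, seed=0):
    rng = random.Random(seed)
    matrix = np.zeros((len(words), len(words)))
    for _ in range(nb_iterations):
        mask = rng.sample(range(len(words[0])), nb_index)
        projected = ["".join(word[i] for i in mask) for word in words]
        for i, j in itertools.combinations(range(len(words)), 2):
            if projected[i] == projected[j]:
                matrix[max(i, j), min(i, j)] += 1
    return matrix


# Identical words collide at every iteration, hence lower-triangle cells equal to nb_iterations.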
def calc_quality_stats(ts_list, compute_value=True, compute_time=True, chunk_size=75000, force_save=True): """ Compute the quality statistics Returns a dict as follow { "TSUIDx" : { "MetadataX": ValueX, ... }, ... } Don't override default chunk_size unless you know what you are doing. It defines the number of points in a single chunk (assuming th TS is periodic) Use it only for performances purposes :param ts_list: List of TSUID to work onto :type ts_list: list :param compute_value: boolean indicating to compute metadata related to value :type compute_value: bool :param compute_time: boolean indicating to compute metadata related to time :type compute_time: bool :param chunk_size: (Advanced usage) Override the chunk size :type chunk_size: int :param force_save: Save metadata even if already present (default True) :type force_save: bool :return: Tuple composed of the input ts list and a dict having TSUID as key and a value being sub-dict where key is metadata name :rtype: tuple dict """ if not compute_value and not compute_time: LOGGER.error("You shall compute at least one set of metadata.") raise ValueError("You shall compute at least one set of metadata") try: # Convert tsuid_list [{tsuid:x, fid:x},...] to tsuid_list [tsuid,...] tsuid_list = [x['tsuid'] for x in ts_list] except TypeError: # Already a tsuid_list. No change tsuid_list = ts_list LOGGER.info('Computing Quality stats for %s TS', len(tsuid_list)) # Get all metadata md_list = IkatsApi.md.read(ts_list=tsuid_list) # Initialize results results = {} for tsuid in tsuid_list: results[tsuid] = {} try: # Get Spark Context # Important !!!! Use only this method in Ikats to use a spark context spark_context = ScManager.get() results = {} for index, tsuid in enumerate(tsuid_list): LOGGER.info('Processing Quality stats for TS %s (%s/%s)', tsuid, index, len(tsuid_list)) # Generating information about TSUID chunks # ([chunk_index, sd, ed], ...) ts_info = [] for chunk_index in range( _ts_chunk_count(tsuid=tsuid, md_list=md_list, chunk_size=chunk_size)): ts_info.append( _ts_chunk(tsuid=tsuid, index=chunk_index, md_list=md_list, chunk_size=chunk_size)) # Parallelizing information to work with spark # Each chunk can be computed separately, so divided into len(chunks) partitions rdd_ts_info = spark_context.parallelize(ts_info, max(8, len(ts_info))) # RDD containing the list of points values for every chunk of a TSUID # (without timestamps): # ([chunk_index, [[timestamp, value], ...], ...) rdd_ts_dps = rdd_ts_info \ .map(lambda x: (x[0], _ts_read(tsuid=tsuid, start_date=x[1], end_date=x[2]))) # This RDD is used multiple times, caching it to speed up rdd_ts_dps.cache() if compute_value: # Compute metadata related to "value" information result = calc_qual_stats_value(tsuid, rdd_ts_dps, force_save=force_save) # Append to final results if tsuid in results: results[tsuid].update(result[tsuid]) else: results.update(result) if compute_time: # Compute metadata related to "time" information result = calc_qual_stats_time(tsuid, rdd_ts_dps, force_save=force_save) # Append to final results if tsuid in results: results[tsuid].update(result[tsuid]) else: results.update(result) # We don't need the cache anymore rdd_ts_dps.unpersist() except Exception as cause: raise IkatsException("Quality stats failure with ...", cause) finally: ScManager.stop() return ts_list, results
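# Spark-free sketch of the per-chunk aggregation pattern used above: each chunk contributes
# partial statistics (here a point count plus first/last timestamps) that a single reduction
# turns into TS-level values. The helper and the tuple layout below are illustrative only.
from functools import reduce


def _sketch_aggregate_chunk_stats(chunk_stats):
    """chunk_stats: iterable of (nb_points, start_date, end_date) tuples, one per chunk."""
    return reduce(lambda acc, item: (acc[0] + item[0],      # total number of points
                                     min(acc[1], item[1]),  # earliest timestamp
                                     max(acc[2], item[2])),  # latest timestamp
                  chunk_stats)


# _sketch_aggregate_chunk_stats([(100, 1000, 2000), (50, 2001, 2500)]) -> (150, 1000, 2500)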
def unwrap_ts_list(ts_list, unit=TSUnit.Radians, discontinuity=None, fid_pattern="%(fid)s__unwrap", use_spark=True): """ Unwrap a list of TS by changing deltas between values to 2*discontinuity complement. Unwrap phase of each TS composing the dataset :param ts_list: list of TSUID to unwrap :param unit: TS unit : "Degrees" or "Radians" (default) :param discontinuity: Maximum discontinuity between values. :param fid_pattern: Pattern of the new FID ('%(fid)s' will be replaced by original FID) :param use_spark: Set to True to use spark. True is default :type ts_list: list :type unit: str or TSUnit :type discontinuity: float or None :type fid_pattern: str :type use_spark: bool :return: a new ts_list :rtype: list :raises TypeError: if input is not well formatted """ if not isinstance(ts_list, list) or len(ts_list) == 0: raise TypeError("ts_list shall be a list having at least one TS") if discontinuity is None: raise ValueError("Discontinuity is not filled") results = [] if use_spark: # Get Spark Context spark_context = ScManager.get() try: # Parallelize 1 TS = 1 partition rdd_ts_list = spark_context.parallelize(ts_list, len(ts_list)) rdd_results = rdd_ts_list.map( lambda x: unwrap_tsuid(tsuid=x["tsuid"], fid=x["funcId"], fid_pattern=fid_pattern, discontinuity=discontinuity, unit=unit)) # Persist data to not recompute them again # (Functional identifier reservation called multiple times through IkatsApi.ts.create_ref) rdd_results.cache() timings = rdd_results.map(lambda x: x[1]).reduce( lambda x, y: x + y) results = rdd_results.map(lambda x: x[0]).collect() rdd_results.unpersist() LOGGER.debug("Unwrapping %s TS using Spark: %s", len(ts_list), timings.stats()) finally: # Stop the context ScManager.stop() else: timings = Timings() for item in ts_list: tsuid = item["tsuid"] fid = item["funcId"] result, tsuid_timings = unwrap_tsuid(tsuid=tsuid, fid=fid, fid_pattern=fid_pattern, discontinuity=discontinuity, unit=unit) results.append(result) timings += tsuid_timings LOGGER.debug("Unwrapping %s TS: %s", len(ts_list), timings.stats()) return results
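# Illustrative sketch of the unwrap operation delegated to unwrap_tsuid() (not shown here):
# on raw values, numpy.unwrap gives comparable behaviour by correcting jumps larger than
# `discont` with multiples of 2*pi; the unit handling below is an assumption.
import numpy as np


def _sketch_unwrap_values(values, discontinuity, unit="Radians"):
    values = np.asarray(values, dtype=float)
    if unit == "Degrees":
        radians = np.unwrap(np.radians(values), discont=np.radians(discontinuity))
        return np.degrees(radians)
    return np.unwrap(values, discont=discontinuity)


# _sketch_unwrap_values([0.0, 3.0, -3.0, 3.0], discontinuity=np.pi) removes the 2*pi jumps.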
def spark_ccf(tdm, tsuid_list_or_dataset, lag_max=None, tsuids_out=False, cut_ts=False): """ This function calculates the maximum of the cross correlation function matrix between all ts in **tsuid_list_or_dataset** IN A DISTRIBUTED MODE (using spark) Cross correlation is a correlation between two timeseries whose one is delayed of successive lag values. Result of CCF is a timeseries (correlation function of the lag between timeseries). This function keep the maximum value of the CCF function generated and pull it in the matrix for corresponding timeseries couple. :returns: a string matrix (whose size is equal to the number of tsuids in tsuid_list_or_dataset plus one line and one column for headers) :rtype: ndarray :param tdm: Temporal Data Manager client :param tsuid_list_or_dataset: list of identifiers of the time series or dataset name :param lag_max: maximum lag between timeseries (cf. _ccf function for more details) :param tsuids_out: True to fill headers with tsuids False to fill headers with functional ids :param cut_ts: Cut the TS list to the min-length if set to True :type tdm: TemporalDataMgr :type tsuid_list_or_dataset: list of str or str :type lag_max: positive int :type tsuids_out: boolean :type cut_ts: bool :raises TypeError: if tdm is not a TemporalDataMgr :raises TypeError: if tsuid_list_or_dataset is not a list nor a string :raises TypeError: if tsuids_out is not a boolean """ if type(tdm) is not TemporalDataMgr: raise TypeError("tdm must be a TemporalDataMgr") if type(tsuid_list_or_dataset) is not list and type( tsuid_list_or_dataset) is not str: raise TypeError( "tsuid_list_or_dataset must be a list of string OR a string") if type(tsuids_out) is not bool: raise TypeError("tsuids_out must be a boolean") if type(cut_ts) is not bool: raise TypeError("cut_ts must be a boolean") if type(tsuid_list_or_dataset) is list: # input is a list of tsuid tsuid_list = tsuid_list_or_dataset else: # input is a dataset name dataset = tdm.get_data_set(tsuid_list_or_dataset) tsuid_list = dataset['ts_list'] if tsuids_out: ts_list = tsuid_list else: ts_list = __retrieve_func_id(tdm, tsuid_list) md_list = tdm.get_meta_data(tsuid_list) # initialize size of time series min_ts_size = md_list[tsuid_list[0]]['qual_nb_points'] if cut_ts: for ts in tsuid_list: min_ts_size = min(min_ts_size, md_list[ts]['qual_nb_points']) else: # check time series have same length for ts in tsuid_list: size_ts = md_list[ts]['qual_nb_points'] if size_ts != min_ts_size: raise ValueError('time series do not have same length') # Create or get a spark Context sc = ScManager.get() # Build the RDD with TSUIDS rdd = sc.parallelize(tsuid_list) # Create a broadcast for spark jobs broadcast = sc.broadcast({ "host": tdm.host, "port": tdm.port, "size_of_ts": min_ts_size, "lag_max": lag_max }) # Create an accumulator to store the results of the spark workers accumulator = sc.accumulator(dict(), ListAccumulatorParam()) def run_ccf_spark(working_tsuids): """ Method called by spark job :param working_tsuids: rdd item :type working_tsuids: tuple """ # cross correlation is equal to 1 if timeseries are the same if working_tsuids[0] == working_tsuids[1]: result = 1 else: spark_tdm = TemporalDataMgr(host=broadcast.value['host'], port=broadcast.value['port']) result = __run_max_ccf_ts_list(tdm=spark_tdm, tsuids=list(working_tsuids), size=int( broadcast.value['size_of_ts']), lag_max=broadcast.value['lag_max']) accumulator.add({";".join(list(working_tsuids)): result}) # Get TS content and perform ccf calculation using spark distribution to 
# increase performance # for each element of the rdd, which is a couple of timeseries # the list of couples is first sorted, then duplicates are removed to avoid doing the same calculation # twice for (a,b) and (b,a) rdd.cartesian(rdd).map( lambda x: tuple(sorted(list(x)))).distinct().foreach(run_ccf_spark) # Retrieve the results from the accumulator to fill the correlation matrix ts_nb = len(tsuid_list) matrix_corr = np.zeros((ts_nb, ts_nb)) for str_couple in accumulator.value: couple = str_couple.split(';') matrix_corr[ tsuid_list.index(couple[0]), tsuid_list.index(couple[1])] = accumulator.value[str_couple] matrix_corr[ tsuid_list.index(couple[1]), tsuid_list.index(couple[0])] = accumulator.value[str_couple] # fill the final matrix with headers matrix = __fill_headers_to_final_matrix(matrix_corr, ts_list) return matrix
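# Illustrative sketch (assumption: not the actual _ccf / __run_max_ccf_ts_list code) of the
# "maximum of the cross correlation function" between two equal-length series: the
# correlation is evaluated for every lag in [-lag_max, lag_max] and the maximum is kept.
import numpy as np


def _sketch_max_ccf(x, y, lag_max):
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    best = -np.inf
    for lag in range(-lag_max, lag_max + 1):
        if lag < 0:
            a, b = x[:lag], y[-lag:]
        elif lag > 0:
            a, b = x[lag:], y[:-lag]
        else:
            a, b = x, y
        best = max(best, np.corrcoef(a, b)[0, 1])
    return best


# _sketch_max_ccf(values_a, values_b, lag_max=10) -> scalar stored in one matrix cell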
def correlation_ts_list_loop(ts_list, corr_method, context_meta, variable_meta='metric', config=ConfigCorrelationLoop( the_num_partitions=24, the_point_cache_size=50e6, the_digits_number=4)): """ Computes the correlations between timeseries selected by observed variables and contexts. The observed contexts are defined by the context_meta argument. The variables are defined by variable_meta argument. Assumed: - Each context has a list of distinct variables. - Each timeseries is uniquely associated to one context and one variable. Example with Airbus data: - the *context* is a flight in an Airbus dataset of timeseries. - the *variables* could be metric 'WS1', metric 'WS2' etc. This algorithm is spark-distributed on the cluster. Spark summary ************* - **step 1** The driver prepares a set of configured tuples: each tuple is configured for one context, and has a list of (variable, timeseries reference). Timeseries references are tsuids. - **step 2** A RDD is initialized from the set of cells **'configured tuples'** - **step 3** A new RDD is computed from step 2: each cell **'configured tuple'** is transformed into list of **'correlation inputs'**: this cell is prepared to be processed by the correlation method, for a subpart of the correlation matrice computed for one context At this step, each task task executes: *_spark_combine_pairs()* - **step 4** A new RDD is computed as set of **'correlation result'** cells from cells **'correlations inputs'**: each task will read timeseries pairs, compute the correlation result from selected method (Pearson, ...) At this step, each task task executes: *_spark_correlate_pairs()* - **step 5**: aggregates **'correlation result'** by variable pairs into RDD of **'aggregated correlations'** cells. Each task will 1. creates and saves low-level results CorrelationsByContext into IKATS database, as JSON content. .. seealso:: the JSON is described in the ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict() 2. returns **'aggregated correlation'** cells providing - pair of variable indexes - aggregated values: Mean, Variance - saved reference of CorrelationsByContext At this step, each task executes: *_spark_build_corrs_by_context()* - **step 6**: the driver collects the RDD of **'aggregated correlations'**, and computes the high-level result, which is a CorrelationDataset. Finally the JSON generated by CorrelationDataset is returned. :param ts_list: selected timeseries list on which are computed the correlations :type ts_list: list :param corr_method: the method computing the correlation between 2 timeseries. The value must be in CORRELATION_METHODS. Choose PEARSON to apply the pearson correlation. :type corr_method: str :param context_meta: name of the metadata identifying each observed context, where correlations are computed. .. note:: this metadata shall exist for each timeseries, otherwise the latter will be ignored. With Airbus example: 'FlightIdentifier' identifies the flight as observed context. :type context_meta: str :param variable_meta: Optional, with default value 'metric', the name of the metadata identifying the variables. .. note:: this metadata shall exist for each timeseries, otherwise the latter will be ignored. The metadata values will be sorted in a list providing the effective indexes of matrices: the correlation matrix: the N-th index is reserved to the timeseries having the N-th value of this metadata in alphanumeric order. 
It is advised to keep the default value: this advanced argument must provide distinct indexes for each timeseries under same observed context. :type variable_meta: str :return: JSON-friendly dict grouping - Matrix of means of correlations (see step5) - Matrix of variances of correlations (see step5) - Matrix of references to the JSON content of CorrelationByContext (see step 5) .. seealso:: detailed JSON structure in ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict() :rtype: dict as json-friendly structure for json library :raise exception: IkatsException when an error occurred while processing the correlations. """ sc = None try: LOGGER.info("Starting correlation loop ...") LOGGER.info(" - observed contexts based on: %s", context_meta) LOGGER.info(" - variables ordered by: %s", variable_meta) # Check parameters corr_func = CORRELATION_FUNCTIONS.get(corr_method, None) if corr_func is None: msg = "Unknown correlation method from CORRELATION_FUNCTIONS: corr_method={}" raise IkatsException(msg.format(corr_method)) if type(ts_list) is not list: msg = "Unexpected type: list expected for ts_list={}" raise IkatsException(msg.format(msg.format(ts_list))) if type(context_meta) is not str or len(context_meta) == 0: msg = "Unexpected arg value: defined str is expected for context_meta={}" raise IkatsException(msg.format(msg.format(context_meta))) if type(variable_meta) is not str or len(variable_meta) == 0: msg = "Unexpected arg value: defined str is expected for variable_meta={}" raise IkatsException(msg.format(msg.format(variable_meta))) # Hyp: the metadata part can be loaded from the driver ts_metadata_dict = IkatsApi.md.read(ts_list) # Note: the algorithm discards the variables X without Corr(X,Y) for Y different from X # but when X is retained, the final result will present the Corr(X,X) beside the Corr(X,Y) corr_loop_config, sorted_contexts, sorted_variables = _initialize_config_from_meta( ts_metadata_dict, context_meta=context_meta, variable_meta=variable_meta) LOGGER.info("- sorted_contexts=%s", sorted_contexts) LOGGER.info("- sorted_variables=%s", sorted_variables) nb_contexts = len(sorted_contexts) if nb_contexts * len(sorted_variables) == 0: # Algo simply return empty result when there is no variable or no context consistent # # - case 1: case when there is no computable Corr(X, Y) # where variables X and Y are different for the same context # - case 2: missing metadata for context_name => no context # - case 3: missing metadata for ordering_meta => no variable # LOGGER.warning("Empty result from selection=%s", ts_list) obj_empty_result = CorrelationDataset() obj_empty_result.set_contexts(contexts=sorted_contexts, meta_identifier=context_meta) obj_empty_result.set_variables(labels=sorted_variables) obj_empty_result.add_matrix(matrix=[], desc_label="Empty Mean correlation") obj_empty_result.add_matrix( matrix=[], desc_label="Empty Variance correlation") obj_empty_result.add_rid_matrix(matrix=[]) return obj_empty_result.get_json_friendly_dict() # Computes the number of matrix chunks # one matrix chunk will be handled by one task at # ------------------------------------- if nb_contexts < config.num_partitions: # Case when there are fewer contexts than recommended partitions: # - the computing of one matrix is split into several chunks nb_matrix_blocks = ceil(float(config.num_partitions) / nb_contexts) else: nb_matrix_blocks = 1 LOGGER.info("- number of matrix blocks by context=%s", nb_matrix_blocks) # Computes the timeseries LRU cache size used by one task # 
# ------------------------------------------------------- # 1/ retrieve nb points for each TS, the default value is assumed to be 1e6 in order to be robust # in case 'qual_nb_points' is not available (should not happen ...) defined_nb_points = [ int(v.get('qual_nb_points', 1e6)) for v in ts_metadata_dict.values() ] # 2/ evaluate the number of points handled by one task carrying one matrix chunk total_nb_points_by_ctx = sum( defined_nb_points) / nb_contexts / nb_matrix_blocks if config.the_point_cache_size >= total_nb_points_by_ctx: # the best condition: # the system will memorize in the cache every loaded TS of the same matrix ts_cache_size = len(sorted_variables) else: # the case when it is required to limit the number of TS memorized in the cache # for the same row of the correlation matrix # Note: len(sorted_variables) == max size of a correlation row == matrix dimension ts_cache_size = config.the_point_cache_size / total_nb_points_by_ctx * len( sorted_variables) ts_cache_size = ceil(max(2.0, ts_cache_size)) LOGGER.info("- ts_cache_size=%s", ts_cache_size) # release ts_metadata_dict from memory ts_metadata_dict = None sc = ScManager.get() # Spark_step_1: initialize the RDD # ------------ # OUTPUT: RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] ) rdd_initial_config = sc.parallelize(corr_loop_config, config.num_partitions) # Spark_step_2: combine the pairs of timeseries by contexts and by chunks # ------------ # INPUT: RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] ) # OUTPUT: RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] ) # # where <pair X_Y> is ((<var X index>, <tsuid X> ), (<var Y index>, <tsuid Y>)) # # PROCESS: computes the cartesian product and splits the list of pairs into smaller-sized lists # rdd_var_combinations = rdd_initial_config.flatMap( lambda x: _spark_combine_pairs(context=x[0], variables=x[1], nb_corr_matrix_blocks= nb_matrix_blocks)) if nb_matrix_blocks > 1: # reshuffles all the data over the cluster ...
rdd_var_combinations = rdd_var_combinations.repartition( nb_contexts * nb_matrix_blocks) # Spark_step_3: computes the correlations # ------------ # INPUT: RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] ) # OUTPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> ) # # where # <computed corr X_Y> is (<context>, (<tsuid X>, <tsuid Y>), correlation) # # PROCESS: computes the correlations on the timeseries associated to the variables # rdd_correlations = rdd_var_combinations.flatMap( lambda x: _spark_correlate_pairs(context=x[0], var_pairs=x[1], corr_method=corr_method, ts_cache_size=ts_cache_size)) # generates the parent_id: # presently this identifier may be used by Postgres admin, # to group the low-level results attached to the same high-level result # => at the moment a label including a timestamp is generated obj_result = CorrelationDataset() parent_id = obj_result.get_id() def r_append(data, computed_corr): """ Append computed correlation to data :param data: :param computed_corr: :return: """ data.append(computed_corr) return data def r_merge(one, two): """ Merge two to one :param one: :param two: :return: """ one.extend(two) return one # Spark_step_4: aggregate the correlations by pair of variables # ------------ # INPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> ) as described previously # # OUTPUT: RDD of ( (<var X index>, <var Y index>), list of tuples: # (<context index>, (tsuid_X, tsuid_Y), <correlation result> ) # ) # PROCESS: aggregates by key=(<var X index>, <var Y index>) the correlation information profiles, # enhanced with tsuid pairs # rdd_agg_correlations = rdd_correlations.aggregateByKey( zeroValue=[], seqFunc=r_append, combFunc=r_merge) # Spark_step_5: # ------------ # INPUT: RDD of ( (<var X index>, <var Y index>), list of tuples: # (<context index>, (tsuid_X, tsuid_Y), <correlation result> ) # ) # # OUTPUT: RDD of ( ( <var X index>, <var Y index>), <low-level Result ID>, <Mean correlation>, <Var correlation> # ) # PROCESS: - creates and saves aggregated low-level results as CorrelationsByContext # - computes Mean and Variance of low-level results # - returns summarized info: Mean+Variance+ result ID rdd_results_corr_by_context = \ rdd_agg_correlations.map(lambda x: (_spark_build_corrs_by_context(variables=x[0], agg_ctx_ts_corr=x[1], desc_context=context_meta, sorted_variables=sorted_variables, sorted_contexts=sorted_contexts, corr_method=corr_method, parent_id=parent_id, ndigits=config.the_digits_number))) # Spark_step_6: # ------------ # # 6.1: collects # # INPUT: RDD of ( [ <var X index>, <var Y index>], <processdata ID>, <Mean(corr)>, <Var(corr)> # ) # # OUTPUT: collected list # # PROCESS: collects high-level results # collected_results_corr = rdd_results_corr_by_context.collect() # 6.2: prepare the result # # - Encodes the returned json-friendly content from the collected high-level results # - returns the result # matrix_mean = get_triangular_matrix(dim=len(sorted_variables), default_value_diag=1.0, default_value_other=None) matrix_variance = get_triangular_matrix(dim=len(sorted_variables), default_value_diag=0.0, default_value_other=None) matrix_id = get_triangular_matrix(dim=len(sorted_variables), default_value_diag=None, default_value_other=None) for var_index_pair, data_oid, mean, variance in collected_results_corr: var_index_row = var_index_pair[0] var_index_col = var_index_pair[1] # required: recomputes the range of cell in its row # triangular matrix => cell(i,j) is at position j-i of the row 
# triangular_matrix[i] matrix_mean[var_index_row][var_index_col - var_index_row] = mean matrix_variance[var_index_row][var_index_col - var_index_row] = variance matrix_id[var_index_row][var_index_col - var_index_row] = data_oid obj_result.set_contexts(contexts=sorted_contexts, meta_identifier=context_meta) obj_result.set_variables(sorted_variables) obj_result.add_matrix(matrix=matrix_mean, desc_label="Mean Correlation") obj_result.add_matrix(matrix=matrix_variance, desc_label="Variance") obj_result.add_rid_matrix(matrix_id) LOGGER.info("... ended correlation loop.") return obj_result.get_json_friendly_dict() except Exception: LOGGER.error("... ended correlation loop with error.") raise IkatsException("Failed execution: correlation_ts_list_loop()") finally: if sc: ScManager.stop()
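# Small sketch of the triangular-matrix indexing used when filling the result matrices
# above: row i only stores the columns j >= i, so cell (i, j) lives at offset j - i of
# row i. get_triangular_matrix() itself is defined elsewhere; these helpers are hypothetical.
def _sketch_triangular_matrix(dim, diag_value, other_value=None):
    return [[diag_value if offset == 0 else other_value for offset in range(dim - row)]
            for row in range(dim)]


def _sketch_set_cell(matrix, row, col, value):
    matrix[row][col - row] = value


# sketch_matrix = _sketch_triangular_matrix(dim=3, diag_value=1.0)
# _sketch_set_cell(sketch_matrix, 0, 2, 0.75)  # Corr(var0, var2) stored at sketch_matrix[0][2]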
def test_sliding_window_recovery(self): """ Testing the recovery parameter. """ sax_info = ConfigSax(paa=3, sequences_size=6, with_mean=True, with_std=True, global_norm=False, local_norm=False, linear_filter=False, recovery=0.5, coefficients=[1, 1], alphabet_size=6) ts_name = ["linear_time_serie"] spark_ctx = ScManager.get() # Test with recovery = 0.5 result, _ = sliding_windows(ts_list=ts_name, sax_info=sax_info, spark_ctx=spark_ctx) result = result.collect() # 2 sequences in the timeseries => 3 sequences at the end self.assertEqual(len(result), 3) # Test with MAX recovery # recovery = 1 (the maximum : 100 % <=> the next window start one point to the right) sax_info.recovery = 1.0 result, _ = sliding_windows(ts_list=ts_name, sax_info=sax_info, spark_ctx=spark_ctx) result = result.collect() # remember that in 'sliding_window' function, we call 'get_ts_mock(ts_name)[0]' ts = get_ts_mock(ts_name)[0] ts_val_0 = list(ts[0:6][:, 1]) ts_val_1 = list(ts[6:12][:, 1]) timestamp_0 = list(ts[0:6][:, 0]) timestamp_1 = list(ts[6:12][:, 0]) # Check the timestamp and the values of the two sequences # result[i] = (key, list([timestamps, values],[,],...)) # check ts value condition = (np.all(result[i][1][:, 1] in ts_val_0 for i in range(len(result))) or np.all(result[i][1][:, 1] in ts_val_1 for i in range(len(result)))) self.assertTrue(condition) # check timestamps condition = (np.all(result[i][1][:, 0] in timestamp_0 for i in range(len(result))) or np.all(result[i][1][:, 0] in timestamp_1 for i in range(len(result)))) self.assertTrue(condition) # Test with MINIMUM recovery # recovery = 0 (no recovery) sax_info.recovery = 0.01 result2, _ = sliding_windows(ts_list=ts_name, sax_info=sax_info, spark_ctx=spark_ctx) result2 = result2.collect() # 2 sequences in the timeseries => 2 sequences self.assertEqual(len(result2), 2)
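# A hedged reading of the 'recovery' parameter exercised above (the real computation lives
# in sliding_windows()): the window step is the non-overlapping share of the sequence size,
# with a minimum step of one point. This reading is consistent with the assertions above.
def _sketch_window_starts(nb_points, sequences_size, recovery):
    step = max(1, int(sequences_size * (1 - recovery)))
    return list(range(0, nb_points - sequences_size + 1, step))


# For a 12-point TS with sequences_size=6:
#   recovery=0.5  -> starts [0, 3, 6]  (3 sequences)
#   recovery=1.0  -> step of one point (maximum overlap)
#   recovery=0.01 -> starts [0, 5]     (2 sequences, i.e. almost no overlap)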