Example #1
    def _run_all_in_master_memory(self, method):
        """
        Run the spark pearson correlation by loading all the TS content (ie. values) in master memory

        Each coefficient will be computed by a worker (Spark decides the best choice to apply)
        """

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Get TS content
        rdd_content = self._get_ts(spark_context)

        # Job distribution is made by Statistics.corr (Spark correlation matrix calculation)
        self.results = Statistics.corr(rdd_content, method=method)

        ScManager.stop()
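
As a standalone illustration of the building block used above, here is a minimal sketch (hypothetical data, outside IKATS) of how pyspark.mllib's Statistics.corr turns an RDD of value vectors into a correlation matrix:

# Minimal standalone sketch (hypothetical data, not part of IKATS):
# Statistics.corr expects an RDD of same-length vectors (one vector per observation)
# and returns the correlation matrix as a numpy array.
from pyspark import SparkContext
from pyspark.mllib.stat import Statistics

sc = SparkContext.getOrCreate()

# three observations of two variables
rdd_content = sc.parallelize([
    [1.0, 10.0],
    [2.0, 19.5],
    [3.0, 31.0],
])

corr_matrix = Statistics.corr(rdd_content, method="pearson")
print(corr_matrix)  # 2x2 symmetric matrix with 1.0 on the diagonal

sc.stop()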
Example #2
def correlation_ts_list_loop(ts_list,
                             corr_method,
                             context_meta,
                             variable_meta='metric',
                             config=ConfigCorrelationLoop(
                                 the_num_partitions=24,
                                 the_point_cache_size=50e6,
                                 the_digits_number=4)):
    """
    Computes the correlations between timeseries selected by observed variables and contexts.

    The observed contexts are defined by the context_meta argument.
    The variables are defined by variable_meta argument.

    Assumed:
      - Each context has a list of distinct variables.
      - Each timeseries is uniquely associated to one context and one variable.

    Example with Airbus data:
      - the *context* is a flight in an Airbus dataset of timeseries.
      - the *variables* could be metric 'WS1', metric 'WS2' etc.

    This algorithm is spark-distributed on the cluster.

    Spark summary
    *************

      - **step 1** The driver prepares a set of configured tuples: each tuple is configured for one context,
               and has a list of (variable, timeseries reference). Timeseries references are tsuids.

      - **step 2** A RDD is initialized from the set of cells **'configured tuples'**

      - **step 3** A new RDD is computed from step 2: each **'configured tuple'** cell is transformed into a list of
        **'correlation inputs'** cells: each of these cells is prepared to be processed by the correlation method, for a
        sub-part of the correlation matrix computed for one context

        At this step, each task executes: *_spark_combine_pairs()*

      - **step 4** A new RDD is computed as a set of **'correlation result'** cells from the **'correlation inputs'** cells:
        each task will read the timeseries pairs and compute the correlation result with the selected method (Pearson, ...)

        At this step, each task executes: *_spark_correlate_pairs()*

      - **step 5**: aggregates the **'correlation result'** cells by variable pairs into an RDD of
        **'aggregated correlations'** cells. Each task will

        1. create and save low-level results CorrelationsByContext into the IKATS database, as JSON content.

          .. seealso:: the JSON is described in the
            ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict()

        2. return **'aggregated correlation'** cells providing

          - pair of variable indexes
          - aggregated values: Mean, Variance
          - saved reference of CorrelationsByContext

        At this step, each task executes: *_spark_build_corrs_by_context()*

      - **step 6**: the driver collects the RDD of **'aggregated correlations'**, and computes the high-level result,
        which is a CorrelationDataset.

        Finally the JSON generated by CorrelationDataset is returned.

    :param ts_list: the selected list of timeseries on which the correlations are computed
    :type ts_list: list
    :param corr_method: the method computing the correlation between 2 timeseries.

      The value must be in CORRELATION_METHODS.

      Choose PEARSON to apply the pearson correlation.
    :type corr_method: str
    :param context_meta: name of the metadata identifying each observed context,
      where correlations are computed.

      .. note:: this metadata shall exist for each timeseries, otherwise the
        latter will be ignored.

      With Airbus example: 'FlightIdentifier' identifies the flight as observed context.

    :type context_meta: str
    :param variable_meta: Optional, with default value 'metric',
      the name of the metadata identifying the variables.

      .. note:: this metadata shall exist for each timeseries, otherwise the
        latter will be ignored.

      The metadata values will be sorted in a list providing the effective indexes of the correlation
      matrices: the N-th index is reserved for the timeseries having the N-th value of
      this metadata in alphanumeric order.

      It is advised to keep the default value: this advanced argument must provide distinct indexes for each
      timeseries under the same observed context.

    :type variable_meta: str
    :return: JSON-friendly dict grouping

      - Matrix of means of correlations (see step5)

      - Matrix of variances of correlations (see step5)

      - Matrix of references to the JSON content of CorrelationByContext (see step 5)

      .. seealso:: detailed JSON structure in
        ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict()

    :rtype: dict as json-friendly structure for json library
    :raise IkatsException: when an error occurs while processing the correlations.
    """

    sc = None

    try:
        LOGGER.info("Starting correlation loop ...")
        LOGGER.info(" - observed contexts based on: %s", context_meta)
        LOGGER.info(" - variables ordered by: %s", variable_meta)

        # Check parameters
        corr_func = CORRELATION_FUNCTIONS.get(corr_method, None)
        if corr_func is None:
            msg = "Unknown correlation method from CORRELATION_FUNCTIONS: corr_method={}"
            raise IkatsException(msg.format(corr_method))

        if type(ts_list) is not list:
            msg = "Unexpected type: list expected for ts_list={}"
            raise IkatsException(msg.format(ts_list))

        if type(context_meta) is not str or len(context_meta) == 0:
            msg = "Unexpected arg value: defined str is expected for context_meta={}"
            raise IkatsException(msg.format(context_meta))
        if type(variable_meta) is not str or len(variable_meta) == 0:
            msg = "Unexpected arg value: defined str is expected for variable_meta={}"
            raise IkatsException(msg.format(variable_meta))

        # Hyp: the metadata part can be loaded from the driver

        ts_metadata_dict = IkatsApi.md.read(ts_list)

        # Note: the algorithm discards the variables X without Corr(X,Y) for Y different from X
        #       but when X is retained, the final result will present the Corr(X,X) beside the Corr(X,Y)
        corr_loop_config, sorted_contexts, sorted_variables = _initialize_config_from_meta(
            ts_metadata_dict,
            context_meta=context_meta,
            variable_meta=variable_meta)

        LOGGER.info("- sorted_contexts=%s", sorted_contexts)
        LOGGER.info("- sorted_variables=%s", sorted_variables)

        nb_contexts = len(sorted_contexts)

        if nb_contexts * len(sorted_variables) == 0:
            # The algorithm simply returns an empty result when there is no consistent variable or context
            #
            # - case 1: case when there is no computable Corr(X, Y)
            #           where variables X and Y are different for the same context
            # - case 2: missing metadata for context_name => no context
            # - case 3: missing metadata for ordering_meta => no variable
            #
            LOGGER.warning("Empty result from selection=%s", ts_list)
            obj_empty_result = CorrelationDataset()
            obj_empty_result.set_contexts(contexts=sorted_contexts,
                                          meta_identifier=context_meta)
            obj_empty_result.set_variables(labels=sorted_variables)
            obj_empty_result.add_matrix(matrix=[],
                                        desc_label="Empty Mean correlation")
            obj_empty_result.add_matrix(
                matrix=[], desc_label="Empty Variance correlation")
            obj_empty_result.add_rid_matrix(matrix=[])

            return obj_empty_result.get_json_friendly_dict()

        # Computes the number of matrix chunks
        # one matrix chunk will be handled by one task
        # -------------------------------------
        if nb_contexts < config.num_partitions:
            # Case when there are fewer contexts than recommended partitions:
            # - the computing of one matrix is split into several chunks
            nb_matrix_blocks = ceil(float(config.num_partitions) / nb_contexts)
        else:
            nb_matrix_blocks = 1

        LOGGER.info("- number of matrix blocks by context=%s",
                    nb_matrix_blocks)

        # Computes the timeseries LRU cache size used by one task
        # -------------------------------------------------------
        # 1/ retrieve the number of points for each TS; the default value is assumed to be 1e6 in order to be robust
        # in case 'qual_nb_points' is not available (should not happen ...)
        defined_nb_points = [
            int(v.get('qual_nb_points', 1e6))
            for v in ts_metadata_dict.values()
        ]
        # 2/ evaluate the number of points handled by one task carrying one matrix chunk
        total_nb_points_by_ctx = sum(
            defined_nb_points) / nb_contexts / nb_matrix_blocks
        if config.the_point_cache_size >= total_nb_points_by_ctx:
            # the best case:
            # the system will keep in the cache every TS loaded for the same matrix
            ts_cache_size = len(sorted_variables)
        else:
            # the case when it is required to limit the number of TS kept in the cache
            # for the same row of the correlation matrix
            # Note: len(sorted_variables) == max size of a correlation row == matrix dimension
            ts_cache_size = config.the_point_cache_size / total_nb_points_by_ctx * len(
                sorted_variables)
            ts_cache_size = ceil(max(2.0, ts_cache_size))
        LOGGER.info("- ts_cache_size=%s", ts_cache_size)

        # release ts_metadata_dict from memory
        ts_metadata_dict = None

        sc = ScManager.get()

        # Spark_step_1: initialize the RDD
        # ------------
        # OUTPUT: RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] )

        rdd_initial_config = sc.parallelize(corr_loop_config,
                                            config.num_partitions)

        # Spark_step_2: combine the pairs of timeseries by contexts and by chunks
        # ------------
        # INPUT:  RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] )
        # OUTPUT: RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] )
        #
        #    where <pair X_Y> is ((<var X index>, <tsuid X> ), (<var Y index>, <tsuid Y>))
        #
        # PROCESS: computes the cartesian product and splits the list of pairs into smaller-sized lists
        #
        rdd_var_combinations = rdd_initial_config.flatMap(
            lambda x: _spark_combine_pairs(context=x[0],
                                           variables=x[1],
                                           nb_corr_matrix_blocks=
                                           nb_matrix_blocks))

        if nb_matrix_blocks > 1:
            # reshuffles all the data over the cluster ...
            rdd_var_combinations = rdd_var_combinations.repartition(
                nb_contexts * nb_matrix_blocks)

        # Spark_step_3: computes the correlations
        # ------------
        # INPUT:  RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] )
        # OUTPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> )
        #
        #  where
        #    <computed corr X_Y> is (<context>, (<tsuid X>, <tsuid Y>), correlation)
        #
        # PROCESS: computes the correlations on the timeseries associated to the variables
        #
        rdd_correlations = rdd_var_combinations.flatMap(
            lambda x: _spark_correlate_pairs(context=x[0],
                                             var_pairs=x[1],
                                             corr_method=corr_method,
                                             ts_cache_size=ts_cache_size))

        # generates the parent_id:
        #   presently this identifier may be used by Postgres admin,
        #   to group the low-level results attached to the same high-level result
        #   => at the moment a label including a timestamp is generated
        obj_result = CorrelationDataset()
        parent_id = obj_result.get_id()

        def r_append(data, computed_corr):
            """
            Append a computed correlation to the accumulated list
            :param data: list accumulating the correlation results for one key
            :param computed_corr: the computed correlation to append
            :return: the updated list
            """
            data.append(computed_corr)
            return data

        def r_merge(one, two):
            """
            Merge the list two into the list one
            :param one: first accumulated list
            :param two: second accumulated list
            :return: the merged list
            """
            one.extend(two)
            return one

        # Spark_step_4: aggregate the correlations by pair of variables
        # ------------
        # INPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> ) as described previously
        #
        # OUTPUT: RDD of ( (<var X index>, <var Y index>), list of tuples:
        #                                  (<context index>, (tsuid_X, tsuid_Y), <correlation result> )
        #                )
        # PROCESS: aggregates by key=(<var X index>, <var Y index>) the correlation information profiles,
        #          enhanced with tsuid pairs
        #
        rdd_agg_correlations = rdd_correlations.aggregateByKey(
            zeroValue=[], seqFunc=r_append, combFunc=r_merge)

        # Spark_step_5:
        # ------------
        # INPUT: RDD of  ( (<var X index>, <var Y index>), list of tuples:
        #                                  (<context index>, (tsuid_X, tsuid_Y), <correlation result> )
        #                )
        #
        # OUTPUT: RDD of ( ( <var X index>, <var Y index>), <low-level Result ID>, <Mean correlation>, <Var correlation>
        #                )
        # PROCESS: - creates and saves aggregated low-level results as CorrelationsByContext
        #          - computes Mean and Variance of low-level results
        #          - returns summarized info: Mean+Variance+ result ID
        rdd_results_corr_by_context = \
            rdd_agg_correlations.map(lambda x: (_spark_build_corrs_by_context(variables=x[0],
                                                                              agg_ctx_ts_corr=x[1],
                                                                              desc_context=context_meta,
                                                                              sorted_variables=sorted_variables,
                                                                              sorted_contexts=sorted_contexts,
                                                                              corr_method=corr_method,
                                                                              parent_id=parent_id,
                                                                              ndigits=config.the_digits_number)))

        # Spark_step_6:
        # ------------
        #
        # 6.1: collects
        #
        # INPUT: RDD of  ( [ <var X index>, <var Y index>], <processdata ID>, <Mean(corr)>, <Var(corr)>
        #                )
        #
        # OUTPUT: collected list
        #
        # PROCESS:  collects high-level results
        #
        collected_results_corr = rdd_results_corr_by_context.collect()

        # 6.2: prepare the result
        #
        #  - Encodes the returned json-friendly content from the collected high-level results
        #  - returns the result
        #
        matrix_mean = get_triangular_matrix(dim=len(sorted_variables),
                                            default_value_diag=1.0,
                                            default_value_other=None)

        matrix_variance = get_triangular_matrix(dim=len(sorted_variables),
                                                default_value_diag=0.0,
                                                default_value_other=None)

        matrix_id = get_triangular_matrix(dim=len(sorted_variables),
                                          default_value_diag=None,
                                          default_value_other=None)

        for var_index_pair, data_oid, mean, variance in collected_results_corr:
            var_index_row = var_index_pair[0]
            var_index_col = var_index_pair[1]
            # required: recompute the position of the cell within its row
            # triangular matrix => cell(i,j) is at position j-i of the row triangular_matrix[i]
            matrix_mean[var_index_row][var_index_col - var_index_row] = mean
            matrix_variance[var_index_row][var_index_col -
                                           var_index_row] = variance
            matrix_id[var_index_row][var_index_col - var_index_row] = data_oid

        obj_result.set_contexts(contexts=sorted_contexts,
                                meta_identifier=context_meta)

        obj_result.set_variables(sorted_variables)
        obj_result.add_matrix(matrix=matrix_mean,
                              desc_label="Mean Correlation")
        obj_result.add_matrix(matrix=matrix_variance, desc_label="Variance")
        obj_result.add_rid_matrix(matrix_id)

        LOGGER.info("... ended correlation loop.")
        return obj_result.get_json_friendly_dict()

    except Exception:
        LOGGER.error("... ended correlation loop with error.")
        raise IkatsException("Failed execution: correlation_ts_loop()")
    finally:
        if sc:
            ScManager.stop()
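
Two patterns from the example above can be illustrated in isolation: the aggregateByKey() grouping of step 4 and the triangular-matrix indexing of step 6.2, where cell (i, j) is stored at offset j - i of row i. The sketch below is purely illustrative; get_triangular_matrix and the RDD content are simplified stand-ins for the real IKATS helpers:

# Illustrative sketch only: get_triangular_matrix and the RDD content are simplified stand-ins.
from pyspark import SparkContext

def get_triangular_matrix(dim, default_value_diag, default_value_other):
    # row i has dim - i cells: offsets j - i for j in [i, dim - 1]
    return [[default_value_diag if offset == 0 else default_value_other
             for offset in range(dim - i)] for i in range(dim)]

sc = SparkContext.getOrCreate()

# RDD of ((var_x, var_y), (context, (tsuid_x, tsuid_y), correlation))
rdd_correlations = sc.parallelize([
    ((0, 1), ("ctx1", ("tsA", "tsB"), 0.8)),
    ((0, 1), ("ctx2", ("tsC", "tsD"), 0.6)),
    ((1, 2), ("ctx1", ("tsB", "tsE"), -0.1)),
])

# aggregateByKey: seqFunc appends one value to a partial list, combFunc merges two partial lists
rdd_agg = rdd_correlations.aggregateByKey(
    zeroValue=[],
    seqFunc=lambda acc, corr: acc + [corr],
    combFunc=lambda one, two: one + two)

matrix_mean = get_triangular_matrix(dim=3, default_value_diag=1.0, default_value_other=None)
for (var_x, var_y), corrs in rdd_agg.collect():
    values = [c[2] for c in corrs]
    # triangular storage: cell (i, j) lives at position j - i of row i
    matrix_mean[var_x][var_y - var_x] = sum(values) / len(values)

print(matrix_mean)
sc.stop()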
Example #3
def compute_slope(ts_list,
                  fid_suffix="_slope",
                  chunk_size=75000,
                  save_new_ts=True):
    """
    Compute the slope of a list of timeseries using Spark

    This implementation computes the slope for one TS at a time, in a loop.
    For the details of the computation, see compute_slope_for_tsuid()

    :param ts_list: list of TS. Each item is a dict composed of a TSUID and a functional id
    :param fid_suffix: Functional identifier suffix of the final timeseries
    :param chunk_size: Number of points per chunk (assuming the TS is periodic)
    :param save_new_ts: True (default) if TS must be saved to database

    :type ts_list: list of dict
    :type fid_suffix: str
    :type chunk_size: int
    :type save_new_ts: bool

    :return: the new list of derived TS (same order as input)
    :rtype: list of dict

    :raise TypeError: if ts_list type is incompatible
    """

    # Check inputs
    if not isinstance(ts_list, list):
        raise TypeError("ts_list shall be a list")
    if len(ts_list) == 0:
        raise TypeError("ts_list must have at least one element")

    LOGGER.info('Computing Slope for %s TS', len(ts_list))

    tsuid_list = ts_list
    try:
        # Extract TSUID from ts_list
        tsuid_list = [x['tsuid'] for x in ts_list]
    except Exception:
        # Already a tsuid_list.
        # Getting the functional id for each ts
        ts_list = [{
            'tsuid': x,
            'funcId': IkatsApi.fid.read(x)
        } for x in ts_list]

    # Gather all metadata for the list of TS to compute slope
    md_list = IkatsApi.md.read(tsuid_list)

    # Results will be stored here
    results = []

    try:
        # Get Spark Context
        spark_context = ScManager.get()

        for index, tsuid in enumerate(tsuid_list):
            fid = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0]
            LOGGER.info('Processing Slope for TS %s (%s/%s) (%s)', fid,
                        (index + 1), len(tsuid_list), tsuid)

            computed_tsuid, computed_fid = compute_slope_for_tsuid(
                spark_context=spark_context,
                fid=fid,
                fid_suffix=fid_suffix,
                tsuid=tsuid,
                md_list=md_list,
                chunk_size=chunk_size,
                save_new_ts=save_new_ts)

            # Append results to final results
            results.append({"tsuid": computed_tsuid, "funcId": computed_fid})
    finally:
        # Stop spark context in all cases
        ScManager.stop()

    return results
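
compute_slope_for_tsuid() is not shown here; as a rough, hypothetical illustration of the underlying idea, the slope between consecutive points of a (timestamp, value) array can be computed with numpy as follows:

# Hypothetical illustration only: the actual computation is done by compute_slope_for_tsuid().
import numpy as np

def slope_of_points(data_points):
    """Slope between consecutive points of a [[timestamp_ms, value], ...] array."""
    data_points = np.asarray(data_points, dtype=float)
    timestamps = data_points[:, 0]
    values = data_points[:, 1]
    # slope between point i and point i+1, associated here with the timestamp of point i
    slopes = np.diff(values) / np.diff(timestamps)
    return np.column_stack((timestamps[:-1], slopes))

print(slope_of_points([[1000, 2.0], [2000, 4.0], [3000, 3.0]]))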
Example #4
def unwrap_ts_list(ts_list,
                   unit=TSUnit.Radians,
                   discontinuity=None,
                   fid_pattern="%(fid)s__unwrap",
                   use_spark=True):
    """
    Unwrap a list of TS by changing deltas between values to their 2*discontinuity complement.
    Unwraps the phase of each TS composing the dataset

    :param ts_list: list of TSUID to unwrap
    :param unit: TS unit : "Degrees" or "Radians" (default)
    :param discontinuity: Maximum discontinuity between values.
    :param fid_pattern: Pattern of the new FID ('%(fid)s' will be replaced by original FID)
    :param use_spark: Set to True (default) to use Spark

    :type ts_list: list
    :type unit: str or TSUnit
    :type discontinuity: float or None
    :type fid_pattern: str
    :type use_spark: bool

    :return: a new ts_list
    :rtype: list

    :raises TypeError: if input is not well formatted
    """

    if not isinstance(ts_list, list) or len(ts_list) == 0:
        raise TypeError("ts_list shall be a list having at least one TS")

    if discontinuity is None:
        raise ValueError("Discontinuity is not filled")

    results = []
    if use_spark:
        # Get Spark Context
        spark_context = ScManager.get()

        try:

            # Parallelize 1 TS = 1 partition
            rdd_ts_list = spark_context.parallelize(ts_list, len(ts_list))

            rdd_results = rdd_ts_list.map(
                lambda x: unwrap_tsuid(tsuid=x["tsuid"],
                                       fid=x["funcId"],
                                       fid_pattern=fid_pattern,
                                       discontinuity=discontinuity,
                                       unit=unit))

            # Persist the data to avoid recomputing it
            # (Functional identifier reservation called multiple times through IkatsApi.ts.create_ref)
            rdd_results.cache()

            timings = rdd_results.map(lambda x: x[1]).reduce(
                lambda x, y: x + y)

            results = rdd_results.map(lambda x: x[0]).collect()

            rdd_results.unpersist()

            LOGGER.debug("Unwrapping %s TS using Spark: %s", len(ts_list),
                         timings.stats())
        finally:
            # Stop the context
            ScManager.stop()
    else:
        timings = Timings()
        for item in ts_list:
            tsuid = item["tsuid"]
            fid = item["funcId"]
            result, tsuid_timings = unwrap_tsuid(tsuid=tsuid,
                                                 fid=fid,
                                                 fid_pattern=fid_pattern,
                                                 discontinuity=discontinuity,
                                                 unit=unit)
            results.append(result)
            timings += tsuid_timings

        LOGGER.debug("Unwrapping %s TS: %s", len(ts_list), timings.stats())
    return results
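
unwrap_tsuid() is not shown either; the unwrapping principle (restore continuity whenever the delta between consecutive values exceeds the discontinuity threshold) can be sketched with numpy alone. This is only an illustration, not the IKATS implementation, which works on chunked TS data and handles the Degrees/Radians unit:

# Sketch of the unwrapping principle with numpy only (not the IKATS implementation).
import numpy as np

phase = np.array([0.0, 2.0, 4.0, 6.0, 1.8, 3.8])  # radians, with an artificial 2*pi jump
discontinuity = np.pi

# whenever the delta between consecutive values exceeds `discontinuity`,
# np.unwrap adds/subtracts multiples of 2*pi to restore continuity
unwrapped = np.unwrap(phase, discont=discontinuity)
print(unwrapped)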
Example #5
def _resample(resampling_way,
              ts_list,
              resampling_period,
              adding_method=AddingMethod.LINEAR_INTERPOLATION,
              timestamp_position=TimestampPosition.BEG,
              aggregation_method=AggregationMethod.AVG,
              nb_points_by_chunk=50000,
              generate_metadata=False):
    """
    Function that effectively resamples (UP or DOWN according to resampling_way value) using Spark

    :param resampling_way: way of resampling (UP or DOWN)
    :type resampling_way: ResamplingWay

    :param ts_list: list composing the TS information to resample [{'tsuid': xxx, 'funcId': yyy },...]
    :type ts_list: list of dict

    :param resampling_period: target period for resampling (in ms)
    :type resampling_period: int

    :param adding_method: Method to use for interpolation (see type AddingMethod for more information)
    :type adding_method: AddingMethod or str or int

    :param timestamp_position: timestamp position in the interval while downsampling
    :type timestamp_position: str ('BEG','MID','END')

    :param aggregation_method: aggregation method for downsampling
    :type aggregation_method: str ('MIN','MAX','MED','AVG','FIRST','LAST')

    :param nb_points_by_chunk: user defined number of points used for a spark chunk of data (after resampling)
    :type nb_points_by_chunk: int

    :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points)
    :type generate_metadata: bool (default: False)

    :returns: a list of dict [{'tsuid': xxx, 'funcId': yyy },...]
    :rtype: list of dict
    """

    if ts_list == []:
        return []

    fid_dict = dict()
    for ts in ts_list:
        fid_dict[ts['funcId']] = ts['tsuid']

    # List of chunks of data and associated information to parallelize with Spark
    data_to_compute = []

    # Extract tsuid list from inputs

    tsuid_list = [x["tsuid"] for x in ts_list]

    # Checking metadata availability before starting resampling
    meta_list = IkatsApi.md.read(tsuid_list)

    # Collecting information from metadata
    for tsuid in tsuid_list:
        if tsuid not in meta_list:
            LOGGER.error("Timeseries %s : no metadata found in base", tsuid)
            raise ValueError("No ikats metadata available for resampling %s" %
                             tsuid)
        if 'ikats_start_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error(
                "Metadata 'ikats_start_date' for timeseries %s not found in base",
                tsuid)
            raise ValueError("No start date available for resampling [%s]" %
                             tsuid)
        if 'ikats_end_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error(
                "Metadata 'ikats_end_date' for timeseries %s not found in base",
                tsuid)
            raise ValueError("No end date available for resampling [%s]" %
                             tsuid)
        if 'qual_ref_period' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error(
                "Metadata 'qual_ref_period' for timeseries %s not found in base",
                tsuid)
            raise ValueError(
                "No reference period available for resampling [%s]" % tsuid)

        # Original timeseries information retrieved from metadata
        sd = int(meta_list[tsuid]['ikats_start_date'])
        ed = int(meta_list[tsuid]['ikats_end_date'])
        ref_period = int(float(meta_list[tsuid]['qual_ref_period']))

        # Get the functional identifier of the original timeseries
        fid_origin = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0]

        # Generate functional id for resulting timeseries
        if resampling_way == ResamplingWay.UP_SAMPLING:
            func_id = "%s_resampled_to_%sms_%s" % (
                fid_origin, str(resampling_period), str(adding_method))
        else:
            func_id = "%s_resampled_to_%sms_%s_%s" % (
                fid_origin, str(resampling_period), timestamp_position,
                aggregation_method)

        # Creating new reference in database for new timeseries
        IkatsApi.ts.create_ref(func_id)

        # Prepare data to compute by defining intervals of final size nb_points_by_chunk
        # Chunk intervals computation :

        # Computing elementary size which is the lowest common multiple between ref period and resampling period
        elementary_size = _lowest_common_multiple(ref_period,
                                                  resampling_period)

        # Seek the number of elementary sizes whose point count is closest to the nb_points_by_chunk parameter,
        # in order to compute the final data chunk size
        nb_points_for_elementary_size = int(elementary_size /
                                            resampling_period)
        data_chunk_size = int(nb_points_by_chunk /
                              nb_points_for_elementary_size) * elementary_size

        # Limit the size of data_chunk_size
        if data_chunk_size < elementary_size:
            data_chunk_size = elementary_size

        # Computing intervals for chunk definition
        interval_limits = np.hstack((np.arange(sd,
                                               ed,
                                               data_chunk_size,
                                               dtype=np.int64), ed))

        # from intervals we define chunk of data to compute
        # ex : intervals = [ 1, 2, 3] => 2 chunks [1, 2] and [2, 3]
        if len(interval_limits) > 2:
            # there are more than 2 interval limits, i.e. there is more than one chunk to compute
            data_to_compute.extend([(tsuid, func_id, i, interval_limits[i],
                                     interval_limits[i + 1])
                                    for i in range(len(interval_limits) - 1)])
        elif len(interval_limits) > 1:
            # only one chunk to compute
            data_to_compute.append(
                (tsuid, func_id, 0, interval_limits[0], interval_limits[1]))

        # in case last original point and last downsampled point are aligned => add a supplementary chunk to compute
        # last point
        if (interval_limits[-1] - sd) % resampling_period == 0:
            data_to_compute.append((tsuid, func_id, 1, interval_limits[-1],
                                    interval_limits[-1] + resampling_period))

    LOGGER.info("Running resampling using Spark")
    # Create or get a spark Context
    spark_context = ScManager.get()

    if resampling_way == ResamplingWay.UP_SAMPLING:
        spark_function = _spark_upsample
        args = adding_method
    else:
        spark_function = _spark_downsample
        args = (timestamp_position, aggregation_method)

    try:

        # OUTPUT : [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        inputs = spark_context.parallelize(data_to_compute,
                                           len(data_to_compute))

        # INPUT :  [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        # OUTPUT : [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # PROCESS : read original data in database / filter chunk with no data
        rdd_data_with_chunk_index = inputs \
            .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0], sd=int(x[3]), ed=int(x[4]))[0])) \
            .filter(lambda x: len(x[2]) > 0)

        if resampling_way == ResamplingWay.UP_SAMPLING:
            # INPUT :  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
            # OUTPUT : [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...]
            # PROCESS : compute inter-chunks intervals / filter empty chunks
            rdd_data = _calc_inter_chunks(rdd=rdd_data_with_chunk_index) \
                .map(lambda x: (x[0], x[2])) \
                .filter(lambda x: not (len(x[1]) == 2 and (int(float(x[1][0][0])) == int(float(x[1][1][0])))))
        else:
            # INPUT :  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
            # OUTPUT : [((TSUID_origin, func_id), original_data_array), ...]
            # PROCESS : suppress useless chunk indexes
            rdd_data = rdd_data_with_chunk_index.map(lambda x: (x[0], x[2]))

        # INPUT :  [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...]
        # OUTPUT : [((TSUID_origin, func_id), data_resampled_array), ...]
        # PROCESS : resample chunks of data to resampling_period
        rdd_resampled_data = rdd_data.map(
            lambda x: (x[0], spark_function(data=x[1], period=resampling_period, args=args))) \
            .filter(lambda x: len(x[1]) > 0)

        # INPUT :  [((TSUID_origin, func_id), data_resampled_array), ...]
        # OUTPUT : [(TSUID_origin, func_id, TSUID, sd, ed), ...]
        # PROCESS : create resampled data in database / compute global start and end date
        identifiers = rdd_resampled_data \
            .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1],
                                                            data=x[1],
                                                            generate_metadata=generate_metadata))) \
            .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \
            .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \
            .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \
            .collect()

    except Exception as err:
        msg = "Exception raised while resampling with Spark: %s " % err
        LOGGER.error(msg)
        raise IkatsException(msg)

    finally:
        # Stop spark Context
        ScManager.stop()

    # Post-processing: metadata import and return dict building

    # returns dict containing the results of the resampling
    # where key is the original TSUID and values are resampled TSUID and functional identifiers
    returned_dict = {}
    for timeseries in identifiers:
        tsuid_origin = timeseries[0]
        func_id = timeseries[1]
        tsuid = timeseries[2]
        sd = timeseries[3]
        ed = timeseries[4]

        # Import metadata in non temporal database
        _save_metadata(tsuid=tsuid,
                       md_name='qual_ref_period',
                       md_value=resampling_period,
                       data_type=DTYPE.number,
                       force_update=True)
        _save_metadata(tsuid=tsuid,
                       md_name='ikats_start_date',
                       md_value=sd,
                       data_type=DTYPE.date,
                       force_update=True)
        _save_metadata(tsuid=tsuid,
                       md_name='ikats_end_date',
                       md_value=ed,
                       data_type=DTYPE.date,
                       force_update=True)

        # Retrieve imported number of points from database
        qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid)
        IkatsApi.md.create(tsuid=tsuid,
                           name='qual_nb_points',
                           value=qual_nb_points,
                           data_type=DTYPE.number,
                           force_update=True)

        # Inherit from parent
        IkatsApi.ts.inherit(tsuid, tsuid_origin)

        # Fill returned list
        returned_dict[tsuid_origin] = {"tsuid": tsuid, 'funcId': func_id}

    return returned_dict
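
The chunk-interval computation above can be reproduced in isolation. The sketch below uses illustrative values and assumes _lowest_common_multiple() behaves like a standard lcm:

# Standalone sketch of the chunk interval computation used above (illustrative values).
from math import gcd
import numpy as np

def _lowest_common_multiple(a, b):
    # assumed behaviour: standard lcm of the two periods
    return a * b // gcd(a, b)

ref_period = 2000          # ms, original period
resampling_period = 1500   # ms, target period
nb_points_by_chunk = 50000
sd, ed = 1000000, 301000000  # start/end dates in ms

elementary_size = _lowest_common_multiple(ref_period, resampling_period)   # 6000 ms
nb_points_for_elementary_size = int(elementary_size / resampling_period)   # 4 resampled points
data_chunk_size = int(nb_points_by_chunk / nb_points_for_elementary_size) * elementary_size
data_chunk_size = max(data_chunk_size, elementary_size)

# chunk limits: regular steps of data_chunk_size, then the end date appended
interval_limits = np.hstack((np.arange(sd, ed, data_chunk_size, dtype=np.int64), ed))
chunks = [(i, interval_limits[i], interval_limits[i + 1])
          for i in range(len(interval_limits) - 1)]
print(len(chunks), chunks[0], chunks[-1])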
Example #6
def random_projections(ts_list, sax_info, collision_info, recognition_info):
    """
    The Random Projections Algorithm
    ================================

    This algorithm does the following (detailed for 1 TS but valid for many TS):
        * Apply the sliding window
        * Normalize the TS (global or/and local)
        * Filter the linear sequences (optional) and trivial matches
        * Apply the SAX algorithm
        * Build the collision matrix
        * Find the largest value cells in the collision matrix
        * Search the motif neighborhood

        .. note::
            The algorithm can produce "paa values" (numeric) for each sequence. The problem is the huge length of the
            results.

    **Catalogue implementation is provided**: main_random_projections() calls random_projections() once all
    configurations ConfigSAX, ConfigCollision, ConfigRecognition are initialized.

    :param ts_list: list of TSUID
    :type ts_list: list

    :param sax_info: the information to make the sliding window and the sax_algorithm
    :type sax_info: ConfigSax

    :param collision_info: the information to build the collision matrix
    :type collision_info: ConfigCollision

    :param recognition_info: the information to make the pattern recognition
    :type recognition_info: ConfigRecognition

    :return: the list of similar sequences, the sax result, the equation 9 result, and the sequences list
    :rtype: list, str, float, list
    """
    LOGGER.info("Configurations deduced from user parameters:")
    LOGGER.info("- sliding sax nb paa=%s", sax_info.paa)
    LOGGER.info("- sliding sax alphabet size=%s", sax_info.alphabet_size)
    LOGGER.info("- sliding sax sequences_size=%s", sax_info.sequences_size)
    LOGGER.info("- collision nb indexes=%s", collision_info.index)
    LOGGER.info("- collision nb iterations=%s", collision_info.nb_iterations)
    LOGGER.info("- collision accepted errors=%s", collision_info.errors)
    LOGGER.info("- recognition min_value=%s", recognition_info.min_value)
    LOGGER.info("- recognition iterations=%s", recognition_info.iterations)
    LOGGER.info("- recognition similarity radius=%s", recognition_info.radius)

    # Create or get a spark Context
    LOGGER.info("Running using Spark")
    spark_ctx = ScManager.get()

    # INPUT : all the TS { "ts_name" : [[time1, value1],...], "ts_name2": ... }
    # OUTPUT :  rdd_sequences_list = [ (key, sequence), ... ]
    # rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...]
    # PROCESS : *sliding_windows* create sequences for each TS (results are RDDs)
    rdd_sequences_list, rdd_normalization_coefficients = sliding_windows(ts_list=ts_list,
                                                                         sax_info=sax_info,
                                                                         spark_ctx=spark_ctx,
                                                                         trivial_radius=recognition_info.radius / 2)
    # INPUT : rdd_sequences_list = [ (key, sequence), ... ]
    # OUTPUT : rdd_sax_result is a SaxResult object containing
    #  * paa (rdd of flatMap) : rdd of large list of all the paa_values concatenated
    #  * breakpoints (list) : list of the breakpoints (len = sax_info.alphabet_size - 1)
    #  * sax_word (large str): large string of all the SAX words concatenated
    # PROCESS : Give the SAX form of the sequences
    rdd_sax_result = run_sax_on_sequences(rdd_sequences_data=rdd_sequences_list,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

    # INPUT : rdd_sequences_list = [ (key, sequence), ... ]
    # OUTPUT : sequences_list = { key: sequence, ...} NOT AN RDD!
    # PROCESS : transform rdd_sequences_list elements into dict
    sequences_list = rdd_sequences_list.collectAsMap()

    # INPUT : rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...]
    # OUTPUT : sequences_list = { key: (un-normalized seq_mean, un-normalized seq_sd), ...} NOT AN RDD!
    # PROCESS : transform rdd_normalization_coefficients elements into dict
    normalization_coefficients = rdd_normalization_coefficients.collectAsMap()

    # Keep only necessary information of each sequence
    sequences_list = sequences_info(sequences_list, normalization_coefficients)

    # *paa_sequence* is a "conversion" of *sax* from letters to numbers (matrix with same shape)
    # (useful for post-processing the random projection algorithm).
    breakpoints = [str(i) for i in rdd_sax_result.breakpoints]

    # Build the table which give the distance between two letters (need just sax_result.breakpoints)
    mindist_lookup_table = rdd_sax_result.build_mindist_lookup_table(sax_info.alphabet_size)

    # Give the SAX result in a array (need rdd_sax_result.sax_word and sax_result.paa)
    rdd_sax, paa_result, number_of_sequences = rdd_sax_result.start_sax(sax_info.paa, spark_ctx=spark_ctx)

    LOGGER.info("- filtered number of words=%s", number_of_sequences)

    if number_of_sequences == 1:
        LOGGER.info("- sliding window find just one sequence, no collision matrix computed.")
        collision_matrix = SparseMatrix(np.array([[0]]))
    else:

        # Build the collision matrix; the number of iterations can change
        # (for example, if the sequence length is too small, nb_iterations can be lower than specified)
        collision_matrix, collision_info.nb_iterations = final_collision_matrix(
            sax=rdd_sax,
            number_of_iterations=collision_info.nb_iterations,
            index_selected=collision_info.index,
            word_len=sax_info.paa,
            spark_ctx=spark_ctx)

    # *collision_matrix* is a sparse matrix : light in memory

    # Give the result of the Equation 9
    eq9_result = equation9(number_of_sequences=number_of_sequences,
                           size_alphabet=sax_info.alphabet_size,
                           size_word=sax_info.paa,
                           errors=collision_info.errors,
                           index_selected=collision_info.index,
                           iterations=collision_info.nb_iterations)

    sax = rdd_sax.collect()
    paa_result = np.transpose(paa_result)

    distance_info = NeighborhoodSearch(size_sequence=sax_info.sequences_size,
                                       mindist_lookup_table=mindist_lookup_table,
                                       alphabet_size=sax_info.alphabet_size,
                                       sax=sax,
                                       radius=recognition_info.radius,
                                       collision_matrix=collision_matrix)

    LOGGER.info("- theoretical Eq9 limit: min collisions = %s for accepted errors=%s", eq9_result,
                collision_info.errors)

    # Check the eq9_result with min_value
    if eq9_result < recognition_info.min_value:
        LOGGER.warning("- setting Eq9 limit to min_value=%s: because Eq9 < min_value", recognition_info.min_value)
        eq9_result = recognition_info.min_value
    if eq9_result < 1:
        LOGGER.warning("- setting Eq9 limit to 1: because Eq9 < 1")
        eq9_result = 1

    # find the motif neighborhood by using the largest value cells in the collision matrix
    if recognition_info.is_algo_method_global is True:
        algo_result = distance_info.motif_neighborhood_global(eq9_result, recognition_info)
    else:
        algo_result = distance_info.motif_neighborhood_iterative(eq9_result, recognition_info)

    # Give the results with the names of sequences and not their number in the collision matrix
    algo_result = result_on_sequences_form(algo_result, sequences_list, sax, sax_info.alphabet_size, paa_result)

    algo_result = result_on_pattern_form(algo_result)

    # Give the alphabet used in the SAX algorithm
    alphabet = start_alphabet(sax_info.alphabet_size)

    result = {'patterns': algo_result,
              'break_points': breakpoints,
              'disc_break_points': alphabet}

    if spark_ctx is not None:
        ScManager.stop()
        LOGGER.info("Ended Spark session.")

    return result
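
The core of the random-projection step is hidden in final_collision_matrix(), which is not shown above. As a sketch of the principle only (plain Python, no Spark, no SparseMatrix): at each iteration a few random word positions are drawn, and every pair of SAX words that agree on those positions gets its collision-matrix cell incremented:

# Illustration of the random-projection principle behind the collision matrix
# (simplified; the real implementation is Spark-distributed and uses a SparseMatrix).
import random
from collections import defaultdict
import numpy as np

def collision_matrix(sax_words, nb_iterations, nb_indexes, seed=0):
    rng = random.Random(seed)
    word_len = len(sax_words[0])
    matrix = np.zeros((len(sax_words), len(sax_words)), dtype=int)
    for _ in range(nb_iterations):
        # project every word on the same random subset of positions
        positions = rng.sample(range(word_len), nb_indexes)
        buckets = defaultdict(list)
        for idx, word in enumerate(sax_words):
            buckets[tuple(word[p] for p in positions)].append(idx)
        # words colliding in the same bucket increment their matrix cell
        for indexes in buckets.values():
            for i in indexes:
                for j in indexes:
                    if i < j:
                        matrix[i, j] += 1
    return matrix

print(collision_matrix(["abba", "abca", "bbca"], nb_iterations=10, nb_indexes=2))

Example #7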
def calc_quality_stats(ts_list,
                       compute_value=True,
                       compute_time=True,
                       chunk_size=75000,
                       force_save=True):
    """
    Compute the quality statistics

    Returns a dict as follow
        {
            "TSUIDx" : {
                "MetadataX": ValueX,
                ...
            },
            ...
        }

    Don't override the default chunk_size unless you know what you are doing.
    It defines the number of points in a single chunk (assuming the TS is periodic).
    Use it only for performance purposes.

    :param ts_list: List of TSUID to work onto
    :type ts_list: list

    :param compute_value: boolean indicating to compute metadata related to value
    :type compute_value: bool

    :param compute_time: boolean indicating to compute metadata related to time
    :type compute_time: bool

    :param chunk_size: (Advanced usage) Override the chunk size
    :type chunk_size: int

    :param force_save: Save metadata even if already present (default True)
    :type force_save: bool

    :return: Tuple composed of the input ts list and a dict
             having TSUID as key and, as value, a sub-dict
             whose keys are metadata names
    :rtype: tuple
    """

    if not compute_value and not compute_time:
        LOGGER.error("You shall compute at least one set of metadata.")
        raise ValueError("You shall compute at least one set of metadata")

    try:
        # Convert tsuid_list [{tsuid:x, fid:x},...] to tsuid_list [tsuid,...]
        tsuid_list = [x['tsuid'] for x in ts_list]

    except TypeError:
        # Already a tsuid_list. No change
        tsuid_list = ts_list

    LOGGER.info('Computing Quality stats for %s TS', len(tsuid_list))

    # Get all metadata
    md_list = IkatsApi.md.read(ts_list=tsuid_list)

    # Initialize results
    results = {}
    for tsuid in tsuid_list:
        results[tsuid] = {}

    try:
        # Get Spark Context
        # Important !!!! Use only this method in Ikats to use a spark context
        spark_context = ScManager.get()

        results = {}
        for index, tsuid in enumerate(tsuid_list):

            LOGGER.info('Processing Quality stats for TS %s (%s/%s)', tsuid,
                        index, len(tsuid_list))

            # Generating information about TSUID chunks
            # ([chunk_index, sd, ed], ...)
            ts_info = []
            for chunk_index in range(
                    _ts_chunk_count(tsuid=tsuid,
                                    md_list=md_list,
                                    chunk_size=chunk_size)):
                ts_info.append(
                    _ts_chunk(tsuid=tsuid,
                              index=chunk_index,
                              md_list=md_list,
                              chunk_size=chunk_size))

            # Parallelizing information to work with spark
            # Each chunk can be computed separately, so divided into len(chunks) partitions
            rdd_ts_info = spark_context.parallelize(ts_info,
                                                    max(8, len(ts_info)))

            # RDD containing the list of points values for every chunk of a TSUID
            # (without timestamps):
            # ([chunk_index, [[timestamp, value], ...], ...)
            rdd_ts_dps = rdd_ts_info \
                .map(lambda x: (x[0], _ts_read(tsuid=tsuid, start_date=x[1], end_date=x[2])))

            # This RDD is used multiple times, caching it to speed up
            rdd_ts_dps.cache()

            if compute_value:
                # Compute metadata related to "value" information
                result = calc_qual_stats_value(tsuid,
                                               rdd_ts_dps,
                                               force_save=force_save)
                # Append to final results
                if tsuid in results:
                    results[tsuid].update(result[tsuid])
                else:
                    results.update(result)

            if compute_time:
                # Compute metadata related to "time" information
                result = calc_qual_stats_time(tsuid,
                                              rdd_ts_dps,
                                              force_save=force_save)
                # Append to final results
                if tsuid in results:
                    results[tsuid].update(result[tsuid])
                else:
                    results.update(result)

            # We don't need the cache anymore
            rdd_ts_dps.unpersist()
    except Exception as cause:
        raise IkatsException("Quality stats failure with ...", cause)
    finally:
        ScManager.stop()
    return ts_list, results
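
_ts_chunk_count() and _ts_chunk() are not shown above; the following is a hypothetical sketch of what such chunking helpers could look like, based only on the ikats_start_date, ikats_end_date, qual_ref_period and qual_nb_points metadata:

# Hypothetical sketch of the chunking helpers used above (the real ones live in the IKATS code base).
from math import ceil

def _ts_chunk_count(tsuid, md_list, chunk_size):
    """Number of chunks needed to cover the TS, assuming it is roughly periodic."""
    nb_points = int(md_list[tsuid]["qual_nb_points"])
    return int(ceil(nb_points / float(chunk_size)))

def _ts_chunk(tsuid, index, md_list, chunk_size):
    """[chunk_index, start_date, end_date] of the index-th chunk."""
    sd = int(md_list[tsuid]["ikats_start_date"])
    ed = int(md_list[tsuid]["ikats_end_date"])
    period = int(float(md_list[tsuid]["qual_ref_period"]))
    chunk_duration = chunk_size * period
    chunk_sd = sd + index * chunk_duration
    chunk_ed = min(chunk_sd + chunk_duration, ed)
    return [index, chunk_sd, chunk_ed]

md = {"TS1": {"qual_nb_points": "1000", "ikats_start_date": "0",
              "ikats_end_date": "1000000", "qual_ref_period": "1000"}}
print(_ts_chunk_count("TS1", md, chunk_size=400))   # 3 chunks
print(_ts_chunk("TS1", 1, md, chunk_size=400))      # [1, 400000, 800000]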
Example #8
    def run(self, tsuids):
        """
        Run the Spark Distance calculation

        create the RDD for each tsuid,
        load the TS from tdm into a broadcast dictionary (i.e. shared by all workers),
        map the RDD with a cartesian product (i.e. get RDD1,RDD1 RDD1,RDD2 RDD2,RDD1 RDD2,RDD2 with 2 RDDs)
        to get the comparison couples,
        then reduce by applying the distance function and adding the result into an Accumulator (i.e. shared by all workers).
        The distance function takes the two TS from the broadcast dictionary, shrinks the longer one and applies the
        euclidean distance.

        Usage: run([tsuid1, tsuid2, ..., tsuidn])

        example : tsuids = ['0000110000030003F30000040003F1',
                            '0000110000030003F40000040003F1',
                            '0000110000030003F50000040003F1',
                            '0000110000030003F60000040003F1',
                            '0000110000030003F70000040003F1']

        :param tsuids: a list of tsuids (str)
        :type tsuids: list

        """

        # creation of the RDD
        rdd = self.spark_context.parallelize(tsuids)

        self.logger.info("rdd parallelized")
        self.logger.info("loading TS")
        start_time = time.time()

        j = len(tsuids) // self.ts_load_split_size
        self.logger.debug(type(tsuids))
        self.logger.info("Number of TS: %i ", len(tsuids))
        ts = list()
        for i in range(0, j + 1):
            k = (i + 1) * self.ts_load_split_size
            if k > len(tsuids):
                k = len(tsuids)
            self.logger.info("extract TS from index %i to %i ",
                             i * self.ts_load_split_size, k)
            ts.extend(self.tdm.get_ts(tsuids[i * self.ts_load_split_size:k]))

        ts_dic = dict()
        self.logger.info("Number of TS loaded : %i ", len(ts))
        for index in range(0, len(tsuids)):
            ts_dic[tsuids[index]] = ts[index]
        # broadcast var used to get the map result
        broadcast_var = self.spark_context.broadcast(ts_dic)
        loading_end_time = time.time()
        self.logger.info("Loading Time : %s ", loading_end_time - start_time)

        # create the result accumulator
        list_accum = self.spark_context.accumulator(dict(),
                                                    ListAccumulatorParam())

        def calculate_distance(tsuid_list):
            """
               :param tsuid_list: a pair of tsuids
               :type tsuid_list: list
            """
            # use py4j logger to avoid Serialization problems.
            logger = logging.getLogger('py4j')
            logger.setLevel(logging.INFO)
            logger.removeHandler(logger.handlers[0])
            # sh = logging.StreamHandler(sys.stdout)
            stream_handler = logging.StreamHandler()
            stream_handler.setLevel(logging.INFO)
            formatter = logging.Formatter(
                '%(asctime)s:%(levelname)s:%(funcName)s:%(message)s')
            stream_handler.setFormatter(formatter)
            logger.addHandler(stream_handler)

            # start distance calculus.
            logger.debug("tsuid1= %s", tsuid_list[0])
            logger.debug("tsuid2= %s", tsuid_list[1])
            if tsuid_list[0] != tsuid_list[1]:
                first_ts = np.array(broadcast_var.value[tsuid_list[0]][:, 1])
                second_ts = np.array(broadcast_var.value[tsuid_list[1]][:, 1])
                calculus_len = min(len(first_ts), len(second_ts))

                distance = euclidean(first_ts[0:calculus_len],
                                     second_ts[0:calculus_len])

                # logger.debug("tsuid list %s and distance %f" % (tsuid_list, distance))
                list_accum.add({tsuid_list: distance})

        __import__('ikats.algo.core.distance')
        rdd.cartesian(rdd).foreach(calculate_distance)

        ScManager.stop()

        computation_end_time = time.time()
        self.logger.info("Loading Time : %s ", loading_end_time - start_time)
        self.logger.info("Compute Time : %s ",
                         computation_end_time - loading_end_time)
        return list_accum.value
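
The distance applied inside calculate_distance() can be reproduced in isolation: both value arrays are truncated to the shorter length, then the euclidean distance is applied (scipy's euclidean is assumed here, matching the euclidean() call above; the data is made up):

# Standalone sketch of the pairwise distance used in calculate_distance() (made-up data).
import numpy as np
from scipy.spatial.distance import euclidean

ts_a = np.array([[1000, 1.0], [2000, 2.0], [3000, 3.0], [4000, 4.5]])  # [timestamp, value]
ts_b = np.array([[1000, 1.5], [2000, 2.5], [3000, 2.0]])

first_values = ts_a[:, 1]
second_values = ts_b[:, 1]

# shrink the longer TS to the common length before comparing
calculus_len = min(len(first_values), len(second_values))
distance = euclidean(first_values[:calculus_len], second_values[:calculus_len])
print(distance)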
Example #9
def cut_y(original_ts_list, criterion, fid_pattern="{fid}_cutY{compl}", chunk_size=75000):
    """
    Algorithm Cut-Y

    Cut a list of timeseries along the Y-axis (values) according to a criterion defined as a python expression.
    Matching and non-matching values are separated into 2 timeseries

    This algorithm uses spark

    From the TS list provided (used as reference), extract 2 TS list:
    * The first one matching the value condition
    * The second one not matching the value condition

    :param original_ts_list: List of TSUID/funcID to use for filtering: [{tsuid:xxx, funcId:xxx}, ...]
    :param criterion: python expression used to define a matching pattern
    :param fid_pattern: pattern used to name the FID of the output TSUID.
           {fid} will be replaced by the FID of the original TSUID
           {M} will be replaced by the original TSUID metric name
           {compl} will be replaced by "" or "_compl" depending on the output type (matching/not matching).
    :param chunk_size: the number of points per chunk

    :type original_ts_list: list
    :type criterion: str
    :type fid_pattern: str
    :type chunk_size: int

    :return: 2 lists representing the "matching" and "not matching" list of TS corresponding to the input
    :rtype: list

    :raises ValueError: if ts_list is badly formatted
    :raises TypeError: if ts_list is not a list
    """

    # Check input validity
    if type(original_ts_list) is not list:
        raise TypeError("ts_list shall be a list")
    if len(original_ts_list) == 0:
        raise ValueError("ts_list shall have at least one element")
    for _, item in enumerate(original_ts_list):
        if "tsuid" not in item or "funcId" not in item:
            raise ValueError("ts_list shall have tsuid and funcId defined")

    # Get all the metadata
    md_list = IkatsApi.md.read(ts_list=[x['tsuid'] for x in original_ts_list])

    # Prepare the spark items to parallelize

    # Create and build the data that will be used in spark transformations
    ts_list_with_new_fid, fid2tsuid = _prepare_spark_data(fid_pattern=fid_pattern,
                                                          md_list=md_list,
                                                          ts_list=original_ts_list)
    # Chunks computation
    ts_info = []
    for ts_data in ts_list_with_new_fid:

        # Get the chunks raw information
        chunks = SparkUtils.get_chunks(tsuid=ts_data[0], md_list=md_list, chunk_size=chunk_size)

        # Build a new list containing only used information
        for chunk in chunks:
            ts_info.append({
                "tsuid": ts_data[0],
                "start_date": chunk[1],
                "end_date": chunk[2],
                "matching_fid": ts_data[1],
                "not_matching_fid": ts_data[2],
                "matching_tsuid": fid2tsuid[ts_data[1]],
                "not_matching_tsuid": fid2tsuid[ts_data[2]]
            })

    # Get Spark Context
    # Important !!!! Use only this method in Ikats to use a spark context
    spark_context = ScManager.get()
    try:

        # Prepare the lambda expression. Value is replaced by "Y" variable name
        lambda_criterion = eval("lambda Y : " + criterion)

        # OUTPUT : [{
        #   tsuid:x,
        #   start_date:x,
        #   end_date:x,
        #   matching_fid:x,
        #   not_matching_fid:x,
        #   matching_tsuid:x,
        #   not_matching_tsuid:x
        # }, ...]
        # PROCESS : Parallelize TS chunks information
        rdd_ts_list = spark_context.parallelize(ts_info, max(8, len(ts_info)))

        # INPUT :  [{
        #   tsuid:x,
        #   start_date:x,
        #   end_date:x,
        #   matching_fid:x,
        #   not_matching_fid:x,
        #   matching_tsuid:x,
        #   not_matching_tsuid:x
        # }, ...]
        # OUTPUT : [({
        #  start_date: "date of the first point matching the criterion in the current chunk"
        #  end_date: "date of the last point matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points matching the criterion in the current chunk"
        #  tsuid: "TSUID of the matching part"
        # },
        # {
        #  start_date: "date of the first point not matching the criterion in the current chunk"
        #  end_date: "date of the last point not matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points not matching the criterion in the current chunk"
        #  tsuid: "TSUID of the non-matching part"
        # }), ...]
        # PROCESS : Separate points matching and not-matching the criterion in every chunk. Fill the corresponding TS
        rdd_imported = rdd_ts_list.map(lambda x: _spark_cut_y_chunk(
            tsuid=x['tsuid'],
            start_date=x['start_date'],
            end_date=x['end_date'],
            match_criterion=lambda_criterion,
            result_info={
                "matching_fid": x['matching_fid'],
                "not_matching_fid": x['not_matching_fid'],
                "matching_tsuid": x['matching_tsuid'],
                "not_matching_tsuid": x['not_matching_tsuid']
            }))

        # INPUT : [({
        #  start_date: "date of the first point matching the criterion in the current chunk"
        #  end_date: "date of the last point matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points matching the criterion in the current chunk"
        #  tsuid: "TSUID of the matching part"
        # },
        # {
        #  start_date: "date of the first point not matching the criterion in the current chunk"
        #  end_date: "date of the last point not matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points not matching the criterion in the current chunk"
        #  tsuid: "TSUID of the non-matching part"
        # }), ...]
        # OUTPUT : [(TSUID, nb_points, start_date, end_date), ...]
        # PROCESS : Flatten the results and simplify the format to allow quick actions on every item
        rdd_metadata_prep = rdd_imported \
            .flatMap(lambda x: x) \
            .filter(lambda x: x is not None) \
            .map(lambda x: (x['tsuid'], x['numberOfSuccess'], x['start_date'], x['end_date']))

        # This RDD is reused in several branches. Caching it before the first action improves the performances
        rdd_metadata_prep.cache()

        # Delete empty TSUID (references whose total number of points is 0 are removed from the database)
        deleted_tsuid = rdd_metadata_prep \
            .map(lambda x: (x[0], x[1])) \
            .reduceByKey(lambda x, y: x + y) \
            .filter(lambda x: x[1] == 0) \
            .map(lambda x: (x[0], IkatsApi.ts.delete(tsuid=x[0]))) \
            .map(lambda x: x[0]) \
            .collect()

        # Create metadata qual_nb_points
        rdd_metadata_prep \
            .map(lambda x: (x[0], x[1])) \
            .reduceByKey(lambda x, y: x + y) \
            .filter(lambda x: x[1] > 0) \
            .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="qual_nb_points", value=x[1]))

        # Create metadata ikats_start_date
        rdd_metadata_prep \
            .map(lambda x: (x[0], x[2])) \
            .filter(lambda x: x[1] is not None) \
            .reduceByKey(lambda x, y: min(x, y)) \
            .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_start_date", value=x[1]))

        # Create metadata ikats_end_date
        rdd_metadata_prep \
            .map(lambda x: (x[0], x[3])) \
            .filter(lambda x: x[1] is not None) \
            .reduceByKey(lambda x, y: max(x, y)) \
            .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_end_date", value=x[1]))

        # Unpersist the RDD because it is not used anymore
        rdd_metadata_prep.unpersist()

    finally:
        ScManager.stop()

    # Inherit properties
    for item in ts_list_with_new_fid:
        if fid2tsuid[item[1]] not in deleted_tsuid:
            IkatsApi.ts.inherit(tsuid=fid2tsuid[item[1]], parent=item[0])
        if fid2tsuid[item[2]] not in deleted_tsuid:
            IkatsApi.ts.inherit(tsuid=fid2tsuid[item[2]], parent=item[0])

    # Format and sort the results
    # First output contains the matched data points TS reference
    # Second output contains the not matched (complement) points TS reference
    return (_format_output(deleted_tsuid=deleted_tsuid,
                           fid2tsuid=fid2tsuid,
                           ts_list_with_new_fid=ts_list_with_new_fid,
                           index=1),
            _format_output(deleted_tsuid=deleted_tsuid,
                           fid2tsuid=fid2tsuid,
                           ts_list_with_new_fid=ts_list_with_new_fid,
                           index=2))
Beispiel #10
0
def dataset_cut_spark(tsuid_list, start, end, nb_points, nb_points_by_chunk, generate_metadata, meta_list):
    """
    Cutting dataset algorithm, using spark

    :param tsuid_list: list of tsuid
    :param start: start cut date
    :param end: end cut date
    :param nb_points: number of points to cut
    :param nb_points_by_chunk: number of points per chunk
    :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points)
                              (default: False)
    :param meta_list: dict of metadata (tsuid is the key)

    :type tsuid_list: list
    :type start: int
    :type end: int or None
    :type nb_points: int or None
    :type nb_points_by_chunk: int
    :type generate_metadata: bool
    :type meta_list: dict

    :return: list of dict {"tsuid": tsuid, "funcId": func_id}
    :rtype: list of dict

    :raise ValueError: if inputs are not filled properly (see called methods description)
    """

    # List of chunks of data and associated information to parallelize with Spark
    data_to_compute = []

    # Collecting information from metadata
    for tsuid in tsuid_list:
        if tsuid not in meta_list:
            LOGGER.error("Time series %s: no metadata found in base", tsuid)
            raise ValueError("No ikats metadata available for cutting %s" % tsuid)
        if 'ikats_start_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_start_date' for time series %s not found in base", tsuid)
            raise ValueError("No start date available for cutting [%s]" % tsuid)
        if 'ikats_end_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_end_date' for time series %s not found in base", tsuid)
            raise ValueError("No end date available for cutting [%s]" % tsuid)
        if 'qual_ref_period' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'qual_ref_period' for time series %s not found in base", tsuid)
            raise ValueError("No reference period available for cutting [%s]" % tsuid)

        # Original time series information retrieved from metadata
        sd = int(meta_list[tsuid]['ikats_start_date'])
        ed = int(meta_list[tsuid]['ikats_end_date'])
        ref_period = int(float(meta_list[tsuid]['qual_ref_period']))

        # Get the functional identifier of the original time series
        fid_origin = IkatsApi.ts.fid(tsuid)

        # Generate functional id for resulting time series
        func_id = "%s_cut_%d" % (fid_origin, time.time() * 1e6)

        # Creating new reference in database for new time series
        IkatsApi.ts.create_ref(func_id)

        # Prepare data to compute by defining intervals of final size nb_points_by_chunk
        # Chunk intervals computation:

        data_chunk_size = int(nb_points_by_chunk * ref_period)

        # Computing intervals for chunk definition
        interval_limits = np.arange(sd, ed, data_chunk_size, dtype=np.int64)

        # From these interval limits we define the chunks of data to compute:
        #
        # 1. defining chunks excluding the last point of data within every chunk
        # ex: sd=10, ed=40, chunk size 10 => interval_limits = [10, 20, 30]
        #     => 2 chunks [10, 19] and [20, 29] (last chunk added in step 2)
        data_to_compute.extend([(tsuid,
                                 func_id,
                                 i,
                                 interval_limits[i],
                                 interval_limits[i + 1] - 1) for i in range(len(interval_limits) - 1)])
        # 2. adding the last interval, including the last point of data (read up to ed + 1 so that ed is included)
        # ex: [30, 41]
        data_to_compute.append((tsuid,
                                func_id,
                                len(interval_limits) - 1,
                                interval_limits[-1],
                                ed + 1))

    LOGGER.info("Running dataset cut using Spark")
    # Create or get a spark Context
    spark_context = ScManager.get()

    try:

        # OUTPUT: [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        inputs = spark_context.parallelize(data_to_compute, len(data_to_compute))

        # INPUT:  [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        # OUTPUT: [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # PROCESS: read original data in database / filter chunk with no data
        rdd_data = inputs \
            .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0], sd=int(x[3]), ed=int(x[4]))[0])) \
            .filter(lambda x: len(x[2]) > 0)

        # INPUT:  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # OUTPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
        # PROCESS: cut chunks of data, filter empty results
        rdd_cut_chunk_data = rdd_data \
            .map(lambda x: (x[0], x[1], _spark_cut(data=x[2], min_date=start, max_date=end))) \
            .filter(lambda x: len(x[2][1]) > 0) \
            .cache()

        # no end cutting date provided => case of cutting a given number of points
        if end is None:

            # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), [(chunk_index1, nb_points1), (chunk_index2, nb_points2),...], ...]
            # PROCESS: Collect nb points associated to chunk indexes
            ts_pts_by_chunk = rdd_cut_chunk_data.map(lambda x: (x[0], (x[1], x[2][0]))) \
                .groupByKey().map(lambda x: (x[0], list(x[1]))) \
                .collect()

            # Compute for each ts from collected data:
            #   - last chunk index containing points to keep
            #   - the number of points to keep in this last chunk
            # cut_info: {(TSUID_origin1, func_id1):(last_chunk_index1, nb_points1),
            #             (TSUID_origin2, func_id2):(last_chunk_index2, nb_points2), ...}
            cut_info = {}
            for ts in ts_pts_by_chunk:
                nb_cumul = 0
                for chunk_index, points in ts[1]:
                    nb_cumul += points
                    # noinspection PyTypeChecker
                    if nb_cumul > nb_points:
                        # noinspection PyTypeChecker
                        cut_info[ts[0]] = (chunk_index, points - (nb_cumul - nb_points))
                        break
                else:
                    LOGGER.warning(
                        "Requested number of points to cut exceeds the size of time series %s",
                        IkatsApi.ts.fid(ts[0][0]))
                    # case nb_points > nb points of the time series
                    # noinspection PyTypeChecker
                    cut_info[ts[0]] = (chunk_index, points)
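
            # Illustration: with nb_points=150 and collected chunks [(0, 100), (1, 100)] for one TS,
            # the loop stops at chunk 1 and cut_info[ts_key] = (1, 50): all points of chunk 0 are kept
            # and only the first 50 points of chunk 1 are kept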

            # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...]
            rdd_cut_data = rdd_cut_chunk_data.filter(lambda x: x[1] <= cut_info[x[0]][0]) \
                .map(lambda x: (x[0], x[2][1][:cut_info[x[0]][1]] if x[1] == cut_info[x[0]][0] else x[2][1]))

        else:
            # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...]
            rdd_cut_data = rdd_cut_chunk_data.map(lambda x: (x[0], x[2][1]))

        # INPUT:  [((TSUID_origin, func_id), data_cut_array), ...]
        # OUTPUT: [(TSUID_origin, func_id, TSUID, sd, ed), ...]
        # PROCESS: create cut data in database / compute global start and end date
        identifiers = rdd_cut_data \
            .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1],
                                                            data=x[1],
                                                            generate_metadata=generate_metadata))) \
            .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \
            .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \
            .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \
            .collect()

    except Exception as err:
        msg = "Exception raised while cutting with Spark: %s " % err
        LOGGER.error(msg)
        raise IkatsException(msg)

    finally:
        # Stop spark Context
        ScManager.stop()

    # Post-processing: import the metadata and build the returned list of dict describing each cut
    # time series (TSUID and functional identifier)
    results = []
    for timeseries in identifiers:
        tsuid_origin = timeseries[0]
        func_id = timeseries[1]
        tsuid = timeseries[2]
        sd = timeseries[3]
        ed = timeseries[4]

        # Import metadata in non temporal database
        _save_metadata(tsuid=tsuid, md_name='ikats_start_date', md_value=sd, data_type=DTYPE.date, force_update=True)
        _save_metadata(tsuid=tsuid, md_name='ikats_end_date', md_value=ed, data_type=DTYPE.date, force_update=True)

        # Retrieve imported number of points from database
        qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid)
        IkatsApi.md.create(tsuid=tsuid, name='qual_nb_points', value=qual_nb_points, data_type=DTYPE.number,
                           force_update=True)

        # Inherit from parent
        IkatsApi.ts.inherit(tsuid, tsuid_origin)

        # Fill returned list
        results.append({"tsuid": tsuid, "funcId": func_id})

    return results
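
# A minimal usage sketch of dataset_cut_spark (illustrative only: the tsuids and dates below are
# hypothetical, and the corresponding metadata must already exist in the IKATS base):
#
#     tsuids = ["tsuid_A", "tsuid_B"]
#     meta = IkatsApi.md.read(ts_list=tsuids)
#     cut_ts = dataset_cut_spark(tsuid_list=tsuids,
#                                start=1449755761000,
#                                end=1449759361000,
#                                nb_points=None,
#                                nb_points_by_chunk=75000,
#                                generate_metadata=True,
#                                meta_list=meta)
#     # => [{"tsuid": "...", "funcId": "<original_fid>_cut_<timestamp>"}, ...]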
Beispiel #11
0
def cut_ds_from_metric(ds_name,
                       metric,
                       criteria,
                       group_by=None,
                       fid_pattern=None,
                       chunk_size=75000):
    """
    Entry point of the method that cuts a dataset based on the criteria applied to the TS matching the metric

    The criteria expression is a Python expression that will be converted to a lambda expression, with 'M' standing
    for the metric value.
    Example: "M > 7 and M not in [1,2,6]"

    :param ds_name: name of the dataset to use
    :param metric: metric used as reference to find cut ranges
    :param criteria: criteria expression describing the value thresholds.
    :param group_by: name of the metadata whose values define the groups to iterate on (default: None to disable grouping)
    :param fid_pattern: name of the generated TS.
                        Variables can be used:
                        - {fid}   : Functional identifier
                        - {M}     : metric
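                        Illustrative (hypothetical) value: fid_pattern="{fid}_cut_{M}"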
    :param chunk_size: Size of the ideal chunk (in number of points per chunk)

    :type ds_name: str
    :type metric: str
    :type criteria: str
    :type group_by: str or None
    :type fid_pattern: str
    :type chunk_size: int

    :return: the ts list of the generated TS. [{"funcId": "xx", "tsuid":"xx"}]
    :rtype: list

    :raises ValueError: if dataset is empty
    :raises ValueError: if metric is found several times in dataset
    :raises ValueError: if metric is not found in dataset
    :raises ValueError: if group_by doesn't have a matching reference
    :raises KeyError: if error in fid_pattern
    """

    # List of TS present in dataset
    ts_list = IkatsApi.ds.read(ds_name=ds_name)['ts_list']

    if len(ts_list) == 0:
        LOGGER.error("Dataset %s is empty", ds_name)
        raise ValueError("Dataset %s is empty" % ds_name)

    # Get all the metadata
    md_list = IkatsApi.md.read(ts_list=ts_list)

    # List of all possible values encountered for the group by
    groups_list = None
    if group_by not in [None, ""]:
        # Get all the groups for this group by criterion
        groups_list = _find_all_groups(group_by, md_list)
        LOGGER.info("%s groups found for [%s]", len(groups_list), group_by)
    else:
        # Force to None
        group_by = None

    # Find the reference TS and all TS to cut using this ref
    grouped_ts_list = _find_ts_ref_group(ds_name=ds_name,
                                         md_list=md_list,
                                         metric=metric,
                                         ts_list=ts_list,
                                         group_by=group_by,
                                         group_by_list=groups_list)

    # Get Spark Context
    # Important: in IKATS, always use this method (and only this one) to get a Spark context
    spark_context = ScManager.get()

    try:
        result = []

        # For each group (processed in alphabetic order)
        for group in sorted(grouped_ts_list):
            result_iter = _cut_from_metric_for_group(
                chunk_size=chunk_size,
                criteria=criteria,
                ds_name=ds_name,
                fid_pattern=fid_pattern,
                md_list=md_list,
                metric=metric,
                spark_context=spark_context,
                group=grouped_ts_list[group])

            # Sort functional identifiers alphabetically
            result.extend(sorted(result_iter, key=lambda x: x['funcId']))

        return result
    finally:
        ScManager.stop()
Beispiel #12
0
def run_sax_from_ts_list(ts_list,
                         alphabet_size,
                         word_size,
                         normalize=False,
                         activate_spark=None):
    """
    Perform the Symbolic Aggregate Approximation (SAX) on the TSUID list provided in **ts_list**

    Use spark if necessary

    .. note::
        If Spark fails, the computation falls back to a local run

    :param ts_list: tsuid list of the TS on which to compute the SAX
    :type ts_list: list

    :param alphabet_size: number of characters in result word
    :type alphabet_size: int

    :param word_size: number of segments
    :type word_size: int

    :param activate_spark: True to force spark, False to force local, None to let the algorithm decide
    :type activate_spark: bool or None

    :param normalize: Apply the normalization of the TS if True (False:default)
    :type normalize: bool

    :return: a dict, keyed by TSUID, composed of the PAA result, the SAX breakpoints, the SAX string and the points
             of each TSUID
    :rtype: dict
    """

    results = {}

    # Define if spark is necessary
    if activate_spark is None:

        md = IkatsApi.md.read(ts_list=ts_list)
        sum_points = 0
        for tsuid in md:
            if 'qual_nb_points' in md[tsuid]:
                sum_points += float(md[tsuid]['qual_nb_points'])
            else:
                # No information on number of points, consider using spark
                sum_points = 0
                break
        spark_nb_points_trigger = 1E5
        if sum_points == 0 or sum_points / len(ts_list) > spark_nb_points_trigger:
            # Spark is active if the average number of points per TS is greater than spark_nb_points_trigger points
            activate_spark = True

    if activate_spark:
        LOGGER.info("Running SAX using Spark")

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Build the RDD with TSUIDS
        rdd = spark_context.parallelize(ts_list)

        # Create a broadcast for spark jobs
        broadcast = spark_context.broadcast({
            "alphabet_size": alphabet_size,
            "word_size": word_size,
            "normalize": normalize,
        })

        # Create an accumulator to store the results of the spark workers
        accumulator = spark_context.accumulator(dict(), ListAccumulatorParam())
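        # Each worker adds a {tsuid: result} entry to this accumulator; once the foreach action below
        # completes, the driver reads the merged dictionary back through accumulator.value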

        def run_sax_spark(working_tsuid):
            """
            Method called by spark job

            :param working_tsuid: rdd item
            """

            results = run_sax_from_tsuid(
                tsuid=working_tsuid,
                alphabet_size=broadcast.value['alphabet_size'],
                word_size=broadcast.value['word_size'],
                normalize=broadcast.value['normalize'])

            accumulator.add({working_tsuid: results})

        # Get TS content using spark distribution to increase performance
        # noinspection PyBroadException
        try:
            rdd.foreach(run_sax_spark)
        except Exception:
            LOGGER.warning(
                'Something went wrong with Spark, falling back to local computation')
            activate_spark = False

        for ts in ts_list:
            if ts in accumulator.value:
                results[ts] = accumulator.value[ts]
            else:
                LOGGER.warning(
                    "TS %s has encountered an issue during the spark distribution",
                    ts)

        ScManager.stop()

    if not activate_spark:
        LOGGER.info("Running SAX on single instance")

        for ts in ts_list:
            results[ts] = run_sax_from_tsuid(tsuid=ts,
                                             alphabet_size=alphabet_size,
                                             word_size=word_size,
                                             normalize=normalize)

    return results
Beispiel #13
0
def run_paa_from_ts_list(tdm,
                         ts_list,
                         paa_size,
                         out_ts=True,
                         save=False,
                         activate_spark=None):
    """
    Compute the Piecewise Aggregate Approximation (PAA) on the **ts_list** provided
    Use spark if necessary

    :param tdm: temporal data manager object
    :type tdm: TemporalDataMgr

    :param ts_list: tsuid list of the TS to calculate the PAA timeseries
    :type ts_list: list

    :param paa_size: number of segments
    :type paa_size: int

    :param out_ts: True means the result will be a TS, False will return only the means
    :type out_ts: bool

    :param save: True means the new TS will be saved in addition to being returned
    :type save: bool

    :param activate_spark: True to force spark, False to force local, None to let the algorithm decide
    :type activate_spark: bool or None

    :return: a dict, keyed by TSUID, containing for each TS the new TS resulting from the PAA approximation or the
             list of mean values (with len = paa_size)
    :rtype: dict
    """

    results = {}

    # Define if spark is necessary
    if activate_spark is None:

        md = tdm.get_meta_data(ts_list)
        sum_points = 0
        for tsuid in md:
            if 'qual_nb_points' in md[tsuid]:
                sum_points += float(md[tsuid]['qual_nb_points'])
            else:
                # No information on number of points, consider using spark
                sum_points = 0
                break
        spark_nb_points_trigger = 1E5
        if sum_points == 0 or sum_points / len(ts_list) > spark_nb_points_trigger:
            # Spark is active if the average number of points per TS is greater than spark_nb_points_trigger points
            activate_spark = True

    if activate_spark:
        LOGGER.info("Running PAA using Spark")

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Build the RDD with TSUIDS
        rdd = spark_context.parallelize(ts_list)

        # Create a broadcast for spark jobs
        broadcast = spark_context.broadcast({
            "host": tdm.host,
            "port": tdm.port,
            "paa_size": paa_size,
            "out_ts": out_ts,
            "save": save,
        })
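        # Only host/port are broadcast: each worker rebuilds its own TemporalDataMgr connection inside
        # run_paa_spark() rather than receiving the driver's tdm object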

        # Create an accumulator to store the results of the spark workers
        accumulator = spark_context.accumulator(dict(), ListAccumulatorParam())

        def run_paa_spark(working_tsuid):
            """
            Method called by spark job

            :param working_tsuid: rdd item
            """

            spark_tdm = TemporalDataMgr(host=broadcast.value['host'],
                                        port=broadcast.value['port'])

            # noinspection PyBroadException
            try:
                results = run_paa_from_tsuid(
                    tdm=spark_tdm,
                    tsuid=working_tsuid,
                    paa_size=broadcast.value['paa_size'],
                    out_ts=broadcast.value['out_ts'],
                    save=broadcast.value['save'])[:]
            except Exception:
                results = []

            accumulator.add({working_tsuid: results})

        # Get TS content using spark distribution to increase performance
        # noinspection PyBroadException
        try:
            rdd.foreach(run_paa_spark)
        except Exception:
            LOGGER.warning(
                'Something went wrong with Spark, falling back to local computation')
            activate_spark = False

        for ts in ts_list:
            if ts in accumulator.value:
                results[ts] = accumulator.value[ts]
            else:
                LOGGER.warning(
                    "TS %s has encountered an issue during the spark distribution",
                    ts)

        ScManager.stop()

    if not activate_spark:
        LOGGER.info("Running PAA on single instance")
        for ts in ts_list:
            results[ts] = run_paa_from_tsuid(tdm=tdm,
                                             tsuid=ts,
                                             paa_size=paa_size,
                                             out_ts=out_ts,
                                             save=save)

    return results
Beispiel #14
0
def discretize_dataset(ds_name,
                       nb_buckets,
                       table_name,
                       operators_list=None,
                       nb_points_by_chunk=100000):
    """
    This function discretizes each time series of the dataset provided by name:
    1. The interval between the start date and the end date of each time series is divided into nb_buckets
       intervals of the same size.
    2. Each operator from the input list is computed on each previously defined bucket.
    3. The result is formatted as a table whose entries are:
                - each processed time series in rows
                - combinations of (each operator) X (each bucket number) in columns
    The result table also contains the buckets definitions (bucket_number, start date, end date)

    :param ds_name: name of the dataset processed
    :type ds_name:  str

    :param nb_buckets: number of buckets wanted for each time series of dataset
    :type nb_buckets: int

    :param table_name: name of the table
    :type table_name:  str

    :param operators_list: list of operators to be calculated on buckets from Operators class (see above)
    :type operators_list: list

    :param nb_points_by_chunk: size of chunks in number of points (assuming time series are periodic and without holes)
    :type nb_points_by_chunk: int


    :returns: a dict containing all data awaited by functional ikats type table
    :rtype: dict

    :raises TypeError: if ds_name is not a string or is None
    :raises TypeError: if nb_buckets is not an integer
    :raises ValueError: if nb_buckets is zero
    :raises ValueError: if operators_list is None
    :raises ValueError: if operators_list items are not in Operators class
    :raises TypeError: if operators_list items are not all string
    :raises ValueError: if number of buckets exceeds number of points for one time series
    """

    # Check inputs validity
    if ds_name is None or type(ds_name) is not str:
        raise TypeError('valid dataset name must be defined (got %s, type: %s)' % (ds_name, type(ds_name)))
    try:
        nb_buckets = int(nb_buckets)
    except (TypeError, ValueError):
        raise TypeError('Number of buckets must be an integer (got value %s)' % nb_buckets)
    if nb_buckets == 0:
        raise ValueError("Number of buckets must not be zero")
    if operators_list is None:
        raise ValueError("operators list must not be None")
    elif type(operators_list) is not list:
        raise ValueError("operators list must be a list")
    elif not operators_list:
        raise ValueError("operators list must not be an empty list")
    if table_name is None or re.match('^[a-zA-Z0-9-_]+$', table_name) is None:
        raise ValueError("Error in table name")

    # Check content of operators list provided
    for operator in operators_list:
        if type(operator) is not str:
            raise TypeError('Operator must be a string (got %s)' % (type(operator)))
        if operator not in [op.name for op in Operators]:
            raise ValueError("Operators (string) must be in the following values list : %s"
                             % [op.name for op in Operators])

    # Extract tsuid list from inputs
    tsuid_list = IkatsApi.ds.read(ds_name)['ts_list']

    # Get list of metadata for all TS
    meta_dict = IkatsApi.md.read(tsuid_list)

    # Initialize result
    result = {}

    try:
        LOGGER.info("Running discretization using Spark")
        # Create or get a spark Context
        sc = ScManager.get()

        # running discretization time series by time series
        for index, tsuid in enumerate(tsuid_list):
            result[tsuid] = {}
            LOGGER.info('Processing Discretization for TS %s (%s/%s)', tsuid, index + 1, len(tsuid_list))

            sd = int(meta_dict[tsuid]['ikats_start_date'])
            ed = int(meta_dict[tsuid]['ikats_end_date'])
            nb_points = int(meta_dict[tsuid]['qual_nb_points'])

            # using qual_ref_period if defined, extrapolating otherwise
            if 'qual_ref_period' in meta_dict[tsuid]:
                period = int(float(meta_dict[tsuid]['qual_ref_period']))
            else:
                period = int(float((ed - sd) / nb_points))

            # checking buckets size regarding time series size
            if nb_buckets > nb_points:
                msg = "Number of buckets exceeds number of points for ts (%s, %s)" % (tsuid, IkatsApi.ts.fid(tsuid))
                LOGGER.error(msg)
                raise ValueError(msg)

            # definition of buckets size in ms
            bucket_size_ms = ceil((ed - sd) / nb_buckets)

            # definition of spark chunks size in ms
            chunks_size_ms = nb_points_by_chunk * period

            # definition of buckets start/end dates
            buckets_timestamps = np.hstack((np.arange(sd, ed, bucket_size_ms, dtype=int), ed))
            buckets = [(buckets_timestamps[i] + 1, buckets_timestamps[i + 1]) for i in
                       range(len(buckets_timestamps) - 1)]

            # start date of the first bucket is decreased by 1 ms to catch the first time series value
            buckets[0] = (buckets[0][0] - 1, buckets[0][1])

            # add bucket number
            data_to_compute = [(a, b[0], b[1]) for a, b in enumerate(buckets)]
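            # Illustration: with sd=0, ed=100 and nb_buckets=4, bucket_size_ms=25,
            # buckets_timestamps=[0, 25, 50, 75, 100] and
            # data_to_compute=[(0, 0, 25), (1, 26, 50), (2, 51, 75), (3, 76, 100)]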

            # store buckets definition in results
            result[tsuid]['buckets'] = data_to_compute

            # starting spark process
            # OUTPUT : [(nb_bucket, sd_bucket, ed_bucket), ...]
            inputs = sc.parallelize(data_to_compute, len(data_to_compute))

            # INPUT :  [(nb_bucket, sd_bucket, ed_bucket), ...]
            # OUTPUT : [(nb_bucket, sd_chunk, ed_chunk), ...]
            # PROCESS : split buckets into smaller chunks of data when a bucket is larger than a chunk, and repartition the RDD
            rdd_chunks_timestamps = inputs \
                .flatMap(lambda x: (_spark_chunk(x[0], x[1], x[2], chunks_size_ms)))

            # INPUT : [(nb_bucket, sd_chunk, ed_chunk), ...]
            # OUTPUT : [(nb_bucket, data_array), ...]
            # PROCESS : extract data within buckets
            rdd_chunks_data = rdd_chunks_timestamps \
                .map(lambda x: (x[0], IkatsApi.ts.read(tsuid_list=[tsuid], sd=int(x[1]), ed=int(x[2]))[0])) \
                .filter(lambda x: len(x[1]) > 0)

            # INPUT : [(nb_bucket, data_array), ...]
            # OUTPUT : [(nb_bucket, {info1: , info2:, ..., infon:}),...]
            # PROCESS : calculate operators on data chunks
            rdd_chunks_calc = rdd_chunks_data \
                .map(lambda x: _spark_calc_op_on_chunks(x[0], x[1], operators_list)) \
                .filter(lambda x: x is not None)

            # INPUT : [(nb_bucket, {info1: , info2:, ..., infon:}),...]
            # OUTPUT : [(nb_bucket, {info1: , info2:, ..., infon:}),...] reduced by number of bucket
            # PROCESS : reduce operators results on data buckets
            result_by_bucket = rdd_chunks_calc.reduceByKey(lambda x, y: _spark_reduce_op_chunk(x, y)).collect()

            # extract and calculate final results by bucket
            for bucket in result_by_bucket:
                bucket_nb = bucket[0]
                infos = bucket[1]

                result[tsuid][bucket_nb] = {}
                for operator in operators_list:
                    if operator == 'MIN':
                        result[tsuid][bucket_nb]['MIN'] = float(infos["MIN"])
                    if operator == 'MAX':
                        result[tsuid][bucket_nb]['MAX'] = float(infos["MAX"])
                    if operator == 'AVG':
                        # Computation of the final mean
                        avg_value = float(infos["SUM"]) / float(infos["NB_POINTS"])
                        result[tsuid][bucket_nb]['AVG'] = avg_value
                    if operator == 'STD':
                        # Computation of the final mean and standard deviation
                        avg_value = float(infos["SUM"]) / float(infos["NB_POINTS"])
                        # variance is capped to 0 because it could be slightly negative
                        # (but very near to zero) due to the subtraction of
                        # very close floating point values
                        variance = max(float(float(infos["SQR_SUM"]) / int(infos["NB_POINTS"]) - avg_value ** 2), 0)
                        std_deviation = variance ** 0.5
                        result[tsuid][bucket_nb]['STD'] = std_deviation
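                        # Illustration: SUM=10, SQR_SUM=30, NB_POINTS=5 gives avg_value=2.0,
                        # variance = 30/5 - 2.0**2 = 2.0 and std_deviation ~= 1.414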

        # Format the result to fit the ikats table type

        description = "Result of Discretize operator with %s buckets for %s" % (nb_buckets, operators_list)
        table = _fill_table_structure_to_store(json_result=result,
                                               operators_list=operators_list,
                                               nb_buckets=nb_buckets,
                                               tsuid_list=tsuid_list,
                                               table_name=table_name,
                                               table_desc=description)

        # Save the table
        IkatsApi.table.create(data=dict(table))
    except Exception as error:
        msg = "Exception raised while discretizing with Spark"
        LOGGER.error(msg + ": %s " % error)
        raise IkatsException(msg)

    finally:
        # Stop spark Context
        ScManager.stop()

    # Return the name of the table saved
    return table_name
Beispiel #15
0
def recognition(dist_info, largest_values, option, activate_spark):
    """
    Sparkified search of all the sequences which are similar to the two sequences given by the largest values in the
    collision matrix.
    To do that, two different methods are available, selected by the option argument:
      1. using the brute force (option == OPT_USING_BRUTE_FORCE)
      2. using the heuristic based on the collision matrix (option == OPT_USING_COLLISIONS)

    :param dist_info: information used to calculate the distance between two sequences
    :type dist_info: NeighborhoodSearch

    :param largest_values: list of [row, column] index pairs that correspond to the largest values found in the
    collision matrix
    :type largest_values: list

    :param option: the choice of the method
    :type option: int (in OPTIONS_RECOG)

    :param activate_spark: True to force spark, False to force local, None to let the algorithm decide
    :type activate_spark: bool or None

    :return: list of list of numbers that correspond to sequences
    :rtype: list

    :raises IkatsException: if an error occurs while processing the collision matrix
    """

    # spark_ctx is inspected in the finally block: it is only initialized when the algorithm runs under a Spark context
    spark_ctx = None
    try:

        if type(largest_values) is not list:
            msg = "Unexpected type : list expected for list_largest_value={}"
            raise IkatsException(msg.format(largest_values))

        if type(dist_info.radius) not in [float, int] or dist_info.radius <= 0:
            msg = "Unexpected arg value : positive float expected for radius={}"
            raise IkatsException(msg.format(dist_info.radius))

        if type(option) is not int or option not in OPTIONS_RECOG:
            msg = "Unexpected arg value : positive integer within {} expected for option={}"
            raise IkatsException(msg.format(OPTIONS_RECOG, option))

        if type(activate_spark) is not bool and activate_spark is not None:
            msg = "Unexpected type : booleen type or None expected for activate_spark={}"
            raise IkatsException(msg.format(activate_spark))

        LOGGER.info("Starting recognition ...")
        if activate_spark is None:
            activate_spark = len(dist_info.collision_matrix.data) > 1000 or len(largest_values) > 100

        if len(largest_values) == 0:
            return []

        else:

            if activate_spark is True:
                LOGGER.info("Running using Spark")

                # Create or get a spark Context
                spark_ctx = ScManager.get()

                # Build the RDD with the index list of the largest values
                rdd = spark_ctx.parallelize(largest_values)

                alphabet = _start_alphabet(dist_info.alphabet_size)

                def brute_force_spark(dist_info, largest_values, alphabet):
                    """
                    For details, see the brute force neighborhood algorithm
                    """
                    largest_1 = largest_values[0]
                    largest_2 = largest_values[1]
                    if _get_mindist(dist_info.size_sequence,
                                    dist_info.sax[largest_1],
                                    dist_info.sax[largest_2],
                                    dist_info.mindist_lookup_table,
                                    alphabet) < dist_info.radius:
                        similar_sequences = [largest_1, largest_2]

                        for j in range(len(dist_info.sax)):
                            if j != largest_1 and j != largest_2:
                                dist_val_0 = _get_mindist(
                                    dist_info.size_sequence,
                                    dist_info.sax[largest_1], dist_info.sax[j],
                                    dist_info.mindist_lookup_table, alphabet)
                                if dist_val_0 >= dist_info.radius:
                                    dist_val_1 = _get_mindist(
                                        dist_info.size_sequence,
                                        dist_info.sax[largest_2],
                                        dist_info.sax[j],
                                        dist_info.mindist_lookup_table,
                                        alphabet)
                                    if dist_val_1 < dist_info.radius:
                                        similar_sequences.append(j)
                                else:
                                    similar_sequences.append(j)

                    else:
                        similar_sequences = None
                    return similar_sequences

                def neighborhood_spark(dist_info, largest_values, alphabet):
                    """
                    For details, see the _neighborhood_option algorithm.
                    """
                    largest_1 = largest_values[0]
                    largest_2 = largest_values[1]
                    if _get_mindist(dist_info.size_sequence,
                                    dist_info.sax[largest_1],
                                    dist_info.sax[largest_2],
                                    dist_info.mindist_lookup_table,
                                    alphabet) < dist_info.radius:
                        similar_sequences_0 = [largest_1, largest_2]
                        similar_sequences_1 = [largest_1, largest_2]

                        column_0 = dist_info.collision_matrix.get_column(
                            largest_1)
                        column_1 = dist_info.collision_matrix.get_column(
                            largest_2)

                        for i in column_0:
                            if _get_mindist(dist_info.size_sequence,
                                            dist_info.sax[largest_1],
                                            dist_info.sax[i],
                                            dist_info.mindist_lookup_table,
                                            alphabet) < dist_info.radius:
                                similar_sequences_0.append(i)

                        for i in column_1:
                            if _get_mindist(dist_info.size_sequence,
                                            dist_info.sax[largest_2],
                                            dist_info.sax[i],
                                            dist_info.mindist_lookup_table,
                                            alphabet) < dist_info.radius:
                                similar_sequences_1.append(i)

                        similar_sequences = list(
                            set(similar_sequences_0)
                            | set(similar_sequences_1))

                    else:
                        similar_sequences = None
                    return similar_sequences

                if option == OPT_USING_BRUTE_FORCE:
                    LOGGER.info("- Running using the brute force")
                    rdd = rdd.map(
                        lambda x: brute_force_spark(dist_info=dist_info,
                                                    largest_values=x,
                                                    alphabet=alphabet))

                if option == OPT_USING_COLLISIONS:
                    LOGGER.info(
                        "- Running using the neighborhood of the collision matrix")
                    rdd = rdd.map(
                        lambda x: neighborhood_spark(dist_info=dist_info,
                                                     largest_values=x,
                                                     alphabet=alphabet))

                collected_data = rdd.filter(lambda x: x is not None).collect()
                LOGGER.info("... ended recognition.")
                return collected_data

            else:
                LOGGER.info("Running without using Spark")
                result = _recognition(dist_info, largest_values, option)
                LOGGER.info("... ended recognition.")
                return result

    except Exception:
        LOGGER.error("... ended recognition with error.")
        raise IkatsException("Failed execution: _recognition()")

    finally:
        if spark_ctx is not None:
            ScManager.stop()