def check_labels(els_file_name, labels_file_name):

    # Check if the labels file exists.
    if not os.path.exists(labels_file_name):
        return False

    # Load labels.
    with open(labels_file_name, 'r') as labels_file_object:
        labels = yaml.safe_load(labels_file_object)
        crossings = labels['change_points']

    # Convert to float (unit days).
    crossing_floats = datestring_to_float(crossings)

    # Check that at least one label is valid.
    atleast_one_valid_label = False
    try:
        # Load the ELS times once, instead of re-reading the file per label.
        times = get_ELS_data(els_file_name, quantity='anode5',
                             start_time=datetime.min,
                             end_time=datetime.max)[2]
        for crossing_float in crossing_floats:
            if times[0] <= crossing_float <= times[-1]:
                atleast_one_valid_label = True
                break
    except ValueError:
        pass

    return atleast_one_valid_label
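
# The helper datestring_to_float used above is defined elsewhere. A minimal
# sketch of what it might look like, assuming labels use the same
# '%d-%m-%Y/%H:%M' format parsed throughout this listing and that times are
# matplotlib float days (consistent with the mdates.num2date calls below):
def datestring_to_float(datestrings, format_string='%d-%m-%Y/%H:%M'):
    # Parse each date string and convert it to matplotlib's float-day units.
    return [mdates.date2num(datetime.strptime(s, format_string))
            for s in datestrings]
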
def compute_pca_components(BLUR_SIGMA, BIN_SELECTION, FILTER, FILTER_SIZE,
                           TRANSFORM, **kwargs):

    # We compute PCA components using all the training data.
    file_list = list_of_ELS_files(DATA_DIR, CROSSINGS_DIR + 'new_labels/all/',
                                  MODE.replace('test', 'train'))

    # Load ELS data file-by-file, collecting counts.
    all_counts_list = []
    for data_file in file_list:
        file_full = DATA_DIR + data_file
        counts, energy_range, times = get_ELS_data(file_full,
                                                   'anode5',
                                                   datetime.min,
                                                   datetime.max,
                                                   blur_sigma=BLUR_SIGMA,
                                                   bin_selection=BIN_SELECTION,
                                                   filter=FILTER,
                                                   filter_size=FILTER_SIZE)
        all_counts_list.append(counts)

    # Concatenate once, rather than growing an array inside the loop.
    all_counts = np.concatenate(all_counts_list, axis=0)

    # Apply transformation.
    all_counts = Transformation(TRANSFORM).transform(all_counts)

    # Learn PCA components from data.
    pca = PCA(n_components=10)
    pca.fit(all_counts)

    # We compute PCA components for these dimensions only.
    return {
        n_components: pca.components_[:n_components]
        for n_components in [1, 2, 5, 10]
    }
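
# A usage sketch for compute_pca_components. The parameter values below are
# hypothetical, and DATA_DIR, CROSSINGS_DIR and MODE are module-level globals
# assumed to be configured elsewhere:
#
#     pca_components = compute_pca_components(BLUR_SIGMA=2,
#                                             BIN_SELECTION='all',
#                                             FILTER=None,
#                                             FILTER_SIZE=1,
#                                             TRANSFORM='log')
#
# pca_components[5] would then hold the top 5 principal directions, one row
# per component.
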
Example 3
def main(els_data_file, quantity, start_time, end_time, output_file, **kwargs):

    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time = datetime.min

    if end_time is not None:
        try:
            end_time = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
                second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time = datetime.max

    # Get data.
    data = get_ELS_data(els_data_file, quantity, start_time, end_time,
                        **kwargs)

    # Compute scores.
    times, scores = baseline_analysis(data)

    # Plot ELS data.
    print 'Plotting...'
    fig, axs = plt.subplots(nrows=2, sharex=True)
    plot_interpolated_ELS_data(fig,
                               axs[0],
                               els_data_file,
                               quantity,
                               start_time,
                               end_time,
                               colorbar_range='subset',
                               colorbar_orientation='horizontal',
                               **kwargs)
    axs[0].set_xlabel('')

    # Plot scores.
    axs[1].plot(times, scores)
    axs[1].set_xlabel('Date/Time')
    axs[1].set_ylabel('Change-point Score')
    axs[1].xaxis.set_tick_params(labelsize=8)
    axs[1].margins(0, 0)
    plt.setp(axs[1].get_xticklabels(), rotation=30, ha='right')

    # Place title below.
    if kwargs['filter'] is None:
        title_subtext = 'Blur %d' % (kwargs['blur_sigma'])
    else:
        title_subtext = 'Blur %d, Filter %s of Size %d' % (
            kwargs['blur_sigma'], kwargs['filter'], kwargs['filter_size'])
    fig.text(s='Change-point Scores for ELS Data \n %s' % title_subtext,
             x=0.5,
             y=0.03,
             horizontalalignment='center',
             fontsize=13)

    plt.subplots_adjust(bottom=0.3, left=0.2)

    # Save plot.
    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file, bbox_inches='tight')
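
# The start/end-time parsing block above recurs in every main() in this
# listing, and its 'except ValueError: raise' wrapper adds nothing. A
# possible consolidation (a sketch, not from the original codebase):
def parse_time_range(start_time, end_time, format_string='%d-%m-%Y/%H:%M'):
    # Fall back to the widest possible range when a bound is omitted.
    if start_time is None:
        start_time = datetime.min
    else:
        start_time = datetime.strptime(start_time, format_string)
    if end_time is None:
        end_time = datetime.max
    else:
        # Extend the end time to the last microsecond of the given minute.
        end_time = datetime.strptime(end_time, format_string).replace(
            second=59, microsecond=999999)
    return start_time, end_time
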
Example 4
def main(els_data_file, output_file, quantity, start_time, end_time, run_tests,
         plot_processed_sequence, window_size, discord_dimensions,
         num_pca_components, ignored_dimensions, required_dimensions,
         std_noise):

    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time = datetime.min

    if end_time is not None:
        try:
            end_time = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
                second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time = datetime.max

    # Run doctests.
    if run_tests:
        import doctest
        import data_utils
        doctest.testmod(data_utils,
                        verbose=True,
                        optionflags=doctest.NORMALIZE_WHITESPACE)
        doctest.testmod(verbose=True, optionflags=doctest.NORMALIZE_WHITESPACE)

    # Set random seed for reproducibility.
    random_seed = 7
    np.random.seed(random_seed)

    # Load data from ELS DAT file.
    ELS_data = get_ELS_data(els_data_file, quantity, start_time, end_time)

    # Get matrix profile, padded to match length of the original sequence.
    times, profile = matrix_profile(
        ELS_data,
        window_size,
        discord_dimensions,
        std_noise,
        num_pca_components=num_pca_components,
        ignored_dimensions=ignored_dimensions,
        required_dimensions=required_dimensions,
        plot_processed_sequence=plot_processed_sequence,
        verbose=True)

    # Plot change-point scores over windows as well as the original data.
    print 'Plotting...'

    fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True)

    plot_raw_ELS_data(fig,
                      ax0,
                      els_data_file,
                      quantity,
                      start_time,
                      end_time,
                      colorbar_range='subset',
                      colorbar_orientation='horizontal')

    ax1.plot(times, profile)
    ax1.set_ylabel('Non-Self NN-Distance')
    ax1.xaxis.set_tick_params(labelsize=8)

    # Add only supplied parameters to title.
    parameter_strings = [
        'Window Size = %d', 'Dimensions = %d', 'PCA Components = %d',
        'Noise Correction = %0.2f'
    ]
    parameters = [
        window_size, discord_dimensions, num_pca_components, std_noise
    ]
    parameter_string = ', '.join([
        parameter_string % parameter
        for parameter, parameter_string in zip(parameters, parameter_strings)
        if parameter is not None
    ])
    title = 'Matrix Profile on CAPS ELS \n %s' % parameter_string

    # Place title below.
    fig.text(s=title, y=0.03, x=0.5, horizontalalignment='center', fontsize=13)

    plt.subplots_adjust(bottom=0.3, left=0.2)

    # Save plot.
    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file, bbox_inches='tight')
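
# matrix_profile is defined elsewhere; per the comment above, it returns a
# profile padded to the length of the original sequence. A sketch of one way
# such padding could be done, assuming each profile value is centered within
# its window (mirroring the 'i + k // 2' indexing in the RuLSIF example
# further below):
def pad_profile_sketch(profile, original_length, window_size):
    # Masked entries mark positions with no computed profile value.
    padded = np.ma.masked_all(original_length)
    offset = window_size // 2
    padded[offset:offset + len(profile)] = profile
    return padded
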
def main(data_file, results_file, algorithm_name, dataset, quantity, start_time, end_time, anomaly_type, blur_sigma, bin_selection, filter, filter_size, **kwargs):

    # Seed for reproducibility.
    np.random.seed(7)

    # Create the directory for the results file.
    results_directory = os.path.dirname(results_file)
    if results_directory != '' and not os.path.exists(results_directory):
        Path(results_directory).mkdir(parents=True, exist_ok=True)

    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time_dt = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time_dt = datetime.min

    if end_time is not None:
        try:
            end_time_dt = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time_dt = datetime.max

    # The time-series data loaded.
    if dataset == 'els':
        print 'ELS Bin Selection:', bin_selection
        data = get_ELS_data(data_file, quantity, start_time_dt, end_time_dt, blur_sigma, bin_selection, filter, filter_size)
    elif dataset == 'intel':
        data = get_Intel_data(data_file, 'temperature', start_time_dt, end_time_dt, downsample_rate='20min', drop_sensors=[5, 15, 18])
    else:
        raise ValueError('Invalid dataset.')

    # Update start and end times, to match the data.
    _, _, datatimes = data
    start_time = datatimes[0]
    end_time = datatimes[-1]

    print 'Data start time:', float_to_datestring(start_time)
    print 'Data end time:', float_to_datestring(end_time)

    # The function representing the entry point for the algorithm.
    func = algorithm_name_map[algorithm_name]

    # The list of arguments this function expects.
    # We exclude the first argument, which will be the input data.
    # (Wrapped in list() so the slice also works under Python 3.)
    args = list(signature(func).parameters.items())[1:]

    # Arguments (parameters) being passed from the command-line.
    # Each argument should have a default value in the original function, or be set from the command-line here.
    for arg, val in args:
        if val.default is Parameter.empty and kwargs.get(arg) is None:
            raise ValueError('Please pass a value for argument %s.' % arg)

    parameters = {arg: kwargs.get(arg) for arg, val in args if kwargs.get(arg) is not None}

    # Add the anomaly type as a parameter.
    # parameters['anomaly_type'] = anomaly_type

    # Call function with arguments!
    print 'Evaluating function with given parameters...'
    results, time_taken = timed(func)(data, **parameters)
    times, scores = results
    print 'Function evaluation complete!'

    # Save results (and metadata) to file.
    data_dict = {
        'times': times, 
        'scores': scores, 
        'time_taken': time_taken, 
        'anomaly_type': anomaly_type,
        'quantity': quantity, 
        'start_time': start_time,
        'end_time': end_time,
        'data_file': data_file,
        'dataset': dataset,
        'algorithm_name': formatted_algorithm_names[algorithm_name],
        'parameters': format_dict(parameters),
        'blur_sigma': blur_sigma,
        'bin_selection': bin_selection,
        'filter': filter,
        'filter_size': filter_size,
    }

    with h5py.File(results_file, 'w') as f:
        for key, val in data_dict.items():
            f.create_dataset(key, data=val)

    print('Results saved to %s. Use evaluate_methods.py/evaluate_methods_time_tolerance.py to analyze results.' % results_file)
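
# 'timed' (used above) is an undocumented helper. A minimal sketch, assuming
# it wraps a function so the call also returns its wall-clock runtime in
# seconds, matching 'results, time_taken = timed(func)(...)':
def timed(func):
    def wrapper(*args, **kwargs):
        start = datetime.now()
        result = func(*args, **kwargs)
        elapsed = (datetime.now() - start).total_seconds()
        return result, elapsed
    return wrapper
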
def main(els_data_file, output_file, quantity, start_time, end_time, hmm_type,
         num_states, num_pca_components, show_information_curves,
         visualize_states, stickiness, alpha, gamma, mixture_model, **kwargs):

    # Random seed for reproducibility.
    random_seed = 7
    np.random.seed(random_seed)

    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time = datetime.min

    if end_time is not None:
        try:
            end_time = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
                second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time = datetime.max

    # Get data, and unpack.
    counts, energy_ranges, times = get_ELS_data(els_data_file, quantity,
                                                start_time, end_time, **kwargs)

    # Segment with an HMM.
    model, states, log_likelihood, states_dist, scores = \
        hmm_analysis((counts, energy_ranges, times),
                     hmm_type=hmm_type,
                     num_states=num_states,
                     num_pca_components=num_pca_components,
                     stickiness=stickiness,
                     alpha=alpha,
                     gamma=gamma,
                     mixture_model=mixture_model,
                     verbose=True, evaluation=False)

    # Intervals of states to plot.
    states_dict = array_to_intervals(states)
    print '%d HMM states used to segment the time-series.' % len(states_dict)

    # Print the total duration of each state.
    for state, intervals in states_dict.iteritems():
        print 'State %d total duration: %s timesteps.' % (
            state, np.sum(
                [interval[1] - interval[0] for interval in intervals]))

    # Plot ELS data on top.
    fig, (ax0, ax1, ax2) = plt.subplots(nrows=3, sharex=True)
    plot_interpolated_ELS_data(fig,
                               ax0,
                               els_data_file,
                               quantity,
                               start_time,
                               end_time,
                               colorbar_range='subset',
                               colorbar_orientation='horizontal',
                               **kwargs)
    ax0.set_xlabel('')

    # Plot segments in different colours. We don't want to repeat colors across states.
    colors = cm.get_cmap('Set3')
    labelled_states = set()
    for index, (state, intervals) in enumerate(states_dict.iteritems()):
        for interval in intervals:
            if state in labelled_states:
                ax1.axvspan(times[interval[0]],
                            times[interval[1]],
                            color=colors(index))
            else:
                ax1.axvspan(times[interval[0]],
                            times[interval[1]],
                            color=colors(index),
                            label=state)
                labelled_states.add(state)

    ax1.margins(x=0)
    ax1.set_yticks([])
    ax1.set_ylabel('HMM States', labelpad=10)
    ax1.legend(title='HMM States',
               bbox_to_anchor=(1.28, 0.5),
               loc='center right',
               fontsize=8)

    # Plot scores.
    ax2.plot(times, scores)
    ax2.set_ylabel('Scores')
    ax2.xaxis.set_tick_params(labelsize=8)
    ax2.set_xlabel('Datetime')
    plt.setp(ax2.get_xticklabels(), rotation=30, ha='right')

    # Place title below.
    formatted_name = {
        'vanilla': 'Vanilla',
        'hdp': 'HDP',
        'stickyhdp': 'Sticky HDP',
    }

    hmm_parameters = {
        'vanilla': ['num_states', 'num_pca_components'],
        'hdp': ['num_states', 'num_pca_components', 'alpha', 'gamma'],
        'stickyhdp':
        ['num_states', 'num_pca_components', 'alpha', 'gamma', 'stickiness'],
    }

    parameter_strings = {
        'num_states': 'HMM States = %d',
        'num_pca_components': 'PCA Components = %d',
        'stickiness': 'Stickiness = %0.2f',
        'alpha': 'Dirichlet Alpha = %0.2f',
        'gamma': 'Dirichlet Gamma = %0.2f',
    }

    # Add only supplied parameters to title.
    parameters = hmm_parameters[hmm_type]
    parameter_string_all = ', '.join([
        parameter_strings[parameter] % locals()[parameter]
        for parameter in parameters
    ])
    title = 'CAPS ELS Segmentation with %s-HMM \n %s \n Log-Likelihood = %0.2f' \
            % (formatted_name[hmm_type], parameter_string_all, log_likelihood)
    fig.text(s=title, x=0.5, y=0.03, horizontalalignment='center', fontsize=13)

    plt.subplots_adjust(bottom=0.4, left=0.1, right=0.8)

    # Save plot.
    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file, bbox_inches='tight')

    # Visualization of individual states.
    if visualize_states:

        # Get state-wise emission parameters, to compute distances between states.
        states_map = {index: state for index, state in enumerate(states_dict)}
        num_actual_states = len(states_map)
        all_emission_params = {
            index: model.emission_params(state)
            for index, state in enumerate(states_dict)
        }
        dissimilarities = np.zeros((num_actual_states, num_actual_states))
        for (index1, params1), (index2, params2) in itertools.product(
                all_emission_params.iteritems(),
                all_emission_params.iteritems()):
            dissimilarities[index1][index2] = kl_divergence_normals(params1['mu'], params1['sigma'], params2['mu'], params2['sigma']) \
                                              + kl_divergence_normals(params2['mu'], params2['sigma'], params1['mu'], params1['sigma'])

        transformed_states = MDS(
            dissimilarity='precomputed').fit_transform(dissimilarities)
        spacing = 5 / np.max(transformed_states)
        colors = cm.get_cmap('Set3')
        for index, _ in enumerate(transformed_states):
            plt.scatter(transformed_states[index, 0],
                        transformed_states[index, 1],
                        label=states_map[index],
                        color=colors(index))
            plt.text(transformed_states[index, 0],
                     transformed_states[index, 1] + spacing,
                     states_map[index],
                     fontsize=8)
        plt.title('MDS Plot of HMM States')
        plt.show()

        # Recreate PCA object.
        apply_pca = num_pca_components is not None and num_pca_components > 0
        if apply_pca:
            pca = PCA(n_components=num_pca_components)
            pca.fit(counts)
            data_mean = np.mean(counts, axis=0)

        # Sample from each hidden state to visualize it.
        num_samples = 50

        samples_dict = {}
        for state in states_dict:
            # Sample from the learned observation distribution for each state.
            samples_dict[state] = model.generate_samples(state, num_samples)

            # If we applied PCA, project back into the original number of dimensions.
            if apply_pca:
                samples_dict[state] = reconstruct_from_PCA(
                    samples_dict[state], data_mean, pca)

        # Set minimum and maximum values across plots for consistency.
        samples_list = list(samples_dict.itervalues())
        vmin = np.percentile(samples_list, 5)
        vmax = np.percentile(samples_list, 95)

        # Plot the projected samples!
        for state in states_dict:
            plt.ylabel('Energies')
            plt.imshow(samples_dict[state].T,
                       extent=[0, num_samples, 0,
                               len(energy_ranges[0])],
                       origin='upper',
                       interpolation='none',
                       vmin=vmin,
                       vmax=vmax)
            plt.yticks(np.arange(0, len(energy_ranges[0]), 5),
                       energy_ranges[0][::5])
            plt.title('HMM State %d \n Transformed Samples' % state)
            cbar = plt.colorbar(orientation='vertical')
            cbar.set_label('Counts')
            plt.show()

        # Bunch up all the samples for all the states.
        all_samples = np.vstack(
            [samples_dict[state].flatten() for state in states_dict])
        # statewise_means = np.expand_dims(np.mean(all_samples, axis=1), axis=1)
        # statewise_devs = np.expand_dims(np.std(all_samples, axis=1), axis=1)
        normalized_samples = (all_samples - np.mean(all_samples))

        # Compute the SVD - Singular Value Decomposition.
        U, S, VH = np.linalg.svd(normalized_samples, full_matrices=False)

        # Compute reconstruction errors from an SVD.
        for num_svd_components in [1, 2, 5, 8, 10]:

            # We don't have this many components.
            if num_svd_components > num_actual_states:
                break

            # Compute the reconstruction using this many SVD components.
            reconstruction = np.matmul(
                U[:, :num_svd_components] * S[:num_svd_components],
                VH[:num_svd_components, :])
            reconstruction_errors = np.sqrt(
                np.mean(np.square(reconstruction - normalized_samples),
                        axis=1))

            # Plot mean reconstruction error as a function of the states.
            used_states = list(states_dict.iterkeys())
            plt.scatter(used_states, reconstruction_errors)
            plt.ylabel('Mean Reconstruction Error Over All Samples')
            plt.xlabel('HMM State Number')
            plt.title(
                'Reconstruction Error after Retaining %d SVD Components' %
                num_svd_components)
            plt.ylim(bottom=0)
            plt.xticks(used_states)
            plt.show()

    # Plot the AIC and BIC values as the number of states is varied.
    if show_information_curves:

        print 'Plotting AIC and BIC curves...'

        bics = []
        aics = []
        num_states_range = np.arange(1, 21)
        for num_states in num_states_range:
            model, _, _, _, _ = hmm_analysis(
                (counts, energy_ranges, times),
                hmm_type=hmm_type,
                num_states=num_states,
                num_pca_components=num_pca_components,
                evaluation=False)

            # Compute AIC and BIC criteria.
            bics.append(model.bic_value())
            aics.append(model.aic_value())

        fig, ax = plt.subplots(nrows=1)
        ax.plot(num_states_range, bics, label='BIC')
        ax.plot(num_states_range, aics, label='AIC')
        ax.legend()
        ax.set_xticks(num_states_range)
        ax.set_ylabel('Information Criteria Value')
        ax.set_xlabel('Number of States')

        # Add only supplied parameters to title.
        parameter_strings = ['PCA Components = %d', 'Stickiness = %0.2f']
        parameters = [num_pca_components, stickiness]
        parameter_string = ', '.join([
            parameter_string % parameter for parameter, parameter_string in
            zip(parameters, parameter_strings) if parameter is not None
        ])
        title = 'Selecting the Number of States via Information Criteria \n %s-HMM \n %s' % (
            formatted_name[hmm_type], parameter_string)

        ax.set_title(title)

        plt.show()
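
# array_to_intervals and kl_divergence_normals are imported from elsewhere.
# Minimal sketches under assumed semantics: array_to_intervals maps each HMM
# state to its runs of consecutive timesteps as (start, end) index pairs with
# inclusive ends (consistent with the times[interval[0]]..times[interval[1]]
# spans plotted above), and kl_divergence_normals is the closed-form KL
# divergence between two multivariate normals:
def array_to_intervals_sketch(states):
    # Map each state to a list of (start, end) index runs, ends inclusive.
    intervals = {}
    start = 0
    for i in range(1, len(states) + 1):
        if i == len(states) or states[i] != states[start]:
            intervals.setdefault(states[start], []).append((start, i - 1))
            start = i
    return intervals

def kl_divergence_normals_sketch(mu0, sigma0, mu1, sigma1):
    # KL(N(mu0, sigma0) || N(mu1, sigma1)) for full covariance matrices.
    d = len(mu0)
    sigma1_inv = np.linalg.inv(sigma1)
    diff = mu1 - mu0
    return 0.5 * (np.trace(np.dot(sigma1_inv, sigma0))
                  + np.dot(diff, np.dot(sigma1_inv, diff))
                  - d
                  + np.log(np.linalg.det(sigma1) / np.linalg.det(sigma0)))
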
Example 7
def main(els_data_file, output_file, perform_hyperparameter_estimation,
         load_from_file, save_to_file, quantity, start_time, end_time,
         run_tests, plot_processed_sequence, k, n):

    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time = datetime.min

    if end_time is not None:
        try:
            end_time = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
                second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time = datetime.max

    # Run doctests.
    if run_tests:
        import doctest
        import data_utils
        doctest.testmod(data_utils,
                        verbose=True,
                        optionflags=doctest.NORMALIZE_WHITESPACE)
        doctest.testmod(verbose=True, optionflags=doctest.NORMALIZE_WHITESPACE)

    # RuLSIF parameter.
    alpha = 0.1

    # Set random seed for reproducibility.
    random_seed = 7
    np.random.seed(random_seed)

    # Load processed sequence, if file found.
    els_sequence_file = os.path.splitext(els_data_file)[0] + '_RuLSIF_sequence'
    if load_from_file and os.path.exists(els_sequence_file + '.npz'):
        print 'Loading processed sequence from sequence file...'
        filedata = np.load(els_sequence_file + '.npz')
        counts_packed = filedata['counts_packed']
        energy_range = filedata['energy_range']
        times = filedata['times']
        dmed = filedata['dmed']
    else:
        print 'Sequence file not found. Extracting data from original ELS file and processing...'
        counts, energy_range, times = get_ELS_data(els_data_file, quantity,
                                                   start_time, end_time)

        # import pdb; pdb.set_trace() # For debugging.

        # Process counts.
        # counts = gaussian_blur(counts, sigma=0.5)
        # counts = np.ma.log(counts)

        # See the sequence plotted (lineplot for 1D data, colourplot for 2D data).
        if plot_processed_sequence:
            print 'Plotting processed sequence...'
            fig, ax = plt.subplots(1, 1)
            ax.set_title('Processed Sequence')

            if len(counts.shape) == 1:
                ax.xaxis_date()
                ax.xaxis.set_major_formatter(
                    mdates.DateFormatter('%d-%m-%Y/%H:%M'))
                fig.autofmt_xdate()
                ax.plot(times, counts)

            elif len(counts.shape) == 2:
                plt.imshow(counts.T, origin='lower', interpolation='none')
                ax.set_aspect('auto')
                plt.colorbar(ax=ax, orientation='vertical')

            plt.show()

        # Pack sequence into blocks.
        print 'Packing sequence into blocks...'
        counts_packed = pack(counts, k)
        print 'Sequence packed into shape %s.' % (counts_packed.shape, )

        # Median distance between subsequences.
        print 'Computing median distance between packed samples...'
        dmed = get_median_pairwise_distance(counts_packed)
        print 'Median distance between packed samples, dmed =', dmed

        # Save values to file.
        if save_to_file:
            arrays_with_names = {
                'counts_packed': counts_packed,
                'energy_range': energy_range,
                'times': times,
                'dmed': np.array(dmed)
            }
            np.savez(els_sequence_file, **arrays_with_names)

    # Ranges of values for the hyperparameters, following the reference.
    sigma_range = np.array([dmed])
    sigma_forward_range = sigma_backward_range = sigma_range
    lambda_range = np.array([1e-3, 1e-2, 1e-1, 1e0, 1e1])
    lambda_forward_range = lambda_backward_range = lambda_range

    # Restrict range further by taking the most common hyperparameters selected for fitting random samples.
    if perform_hyperparameter_estimation:
        els_hyperparameters_file = os.path.splitext(
            els_data_file)[0] + '_RuLSIF_hyperparameters'
        if load_from_file and os.path.exists(els_hyperparameters_file +
                                             '.npz'):
            print 'Hyperparameters file found. Loading from file...'
            filedata = np.load(els_hyperparameters_file + '.npz')
            sigma_forward_range = filedata['sigma_forward_range']
            sigma_backward_range = filedata['sigma_backward_range']
            lambda_forward_range = filedata['lambda_forward_range']
            lambda_backward_range = filedata['lambda_backward_range']
        else:
            print 'Hyperparameters file not found. Performing estimation...'
            sigma_forward_range, sigma_backward_range, \
            lambda_forward_range, lambda_backward_range = \
                estimate_hyperparameters(counts_packed, window_size=n,
                                         sigma_range=sigma_range,
                                         lambda_range=lambda_range,
                                         alpha=alpha, num_rank=2)

            if save_to_file:
                arrays_with_names = {
                    'sigma_forward_range': sigma_forward_range,
                    'sigma_backward_range': sigma_backward_range,
                    'lambda_forward_range': lambda_forward_range,
                    'lambda_backward_range': lambda_backward_range
                }
                np.savez(els_hyperparameters_file, **arrays_with_names)

    print 'Hyperparameters will be selected from the ranges:'
    print 'sigma_forward_range =', sigma_forward_range
    print 'sigma_backward_range =', sigma_backward_range
    print 'lambda_forward_range =', lambda_forward_range
    print 'lambda_backward_range =', lambda_backward_range

    # Change-point scores.
    packed_sequence_size = counts_packed.shape[0]
    # 'counts' is undefined when the sequence was loaded from file above,
    # so recover the original sequence length from 'times' instead.
    original_sequence_size = times.shape[0]
    scores = np.ma.masked_all(original_sequence_size)

    # Start timing here.
    timing_start = datetime.now()

    # Sliding-window over packed sequence.
    for i in range(n, packed_sequence_size - n + 1):
        forward_window = counts_packed[i:i + n]
        backward_window = counts_packed[i - n:i]
        forward_density_obj = densratio(backward_window,
                                        forward_window,
                                        alpha=alpha,
                                        sigma_range=sigma_forward_range,
                                        lambda_range=lambda_forward_range,
                                        verbose=False)
        forward_divergence = forward_density_obj.alpha_PE
        backward_density_obj = densratio(forward_window,
                                         backward_window,
                                         alpha=alpha,
                                         sigma_range=sigma_backward_range,
                                         lambda_range=lambda_backward_range,
                                         verbose=False)
        backward_divergence = backward_density_obj.alpha_PE
        change_point_score = forward_divergence + backward_divergence

        # Use larger range of hyperparameters if we can't get a good fit with the smaller one.
        if change_point_score < 0:
            print 'Bad fit with forward sigma = %0.2f, backward sigma = %0.2f.' % (
                forward_density_obj.kernel_info.sigma,
                backward_density_obj.kernel_info.sigma)
            sigma_range = np.array([
                0.7 * dmed, 0.8 * dmed, 0.9 * dmed, dmed, 1.1 * dmed,
                1.2 * dmed, 1.3 * dmed
            ])

            forward_density_obj = densratio(backward_window,
                                            forward_window,
                                            alpha=alpha,
                                            sigma_range=sigma_range,
                                            verbose=False)
            forward_divergence = forward_density_obj.alpha_PE
            backward_density_obj = densratio(forward_window,
                                             backward_window,
                                             alpha=alpha,
                                             sigma_range=sigma_range,
                                             verbose=False)
            backward_divergence = backward_density_obj.alpha_PE

            change_point_score = forward_divergence + backward_divergence

            print 'Tried again with forward sigma = %0.2f, backward sigma = %0.2f.' % (
                forward_density_obj.kernel_info.sigma,
                backward_density_obj.kernel_info.sigma)

        scores[i + k // 2] = change_point_score
        print 'Change-point score at time %s computed as %0.4f.' % (
            datetime.strftime(mdates.num2date(times[i + k // 2]),
                              '%d-%m-%Y/%H:%M'), change_point_score)

    # End time.
    timing_end = datetime.now()

    # Compute average time taken.
    total_time = (timing_end - timing_start).total_seconds()
    num_evals = packed_sequence_size - 2 * n + 1
    print '%0.2f seconds taken for %d change-point score evaluations. Average is %0.2f evals/sec, with k = %d, and n = %d.' % \
          (total_time, num_evals, num_evals/total_time, k, n)

    # Mask negative change-point scores.
    scores = np.ma.masked_less(scores, 0)

    # Plot change-point scores over windows as well as the original data.
    print 'Plotting...'

    fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True)

    plot_raw_ELS_data(fig,
                      ax0,
                      els_data_file,
                      quantity,
                      start_time,
                      end_time,
                      colorbar_range='subset',
                      colorbar_orientation='horizontal')

    ax1.plot(times, scores)
    ax1.set_ylabel('Change-point Score')
    ax1.xaxis.set_tick_params(labelsize=8)

    # Place title below.
    fig.text(s='Change-point Scores for ELS Data \n k = %d, n = %d' % (k, n),
             x=0.5,
             y=0.03,
             horizontalalignment='center',
             fontsize=13)

    plt.subplots_adjust(bottom=0.3, left=0.2)

    # Save plot.
    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file, bbox_inches='tight')

    # Save scores.
    if save_to_file:
        rulsif_output_file = os.path.splitext(
            els_data_file)[0] + '_RuLSIF_output'
        arrays_with_names = {'scores': scores, 'times': times}
        np.savez(rulsif_output_file, **arrays_with_names)
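
# 'pack' and 'get_median_pairwise_distance' are imported from elsewhere.
# Minimal sketches under assumed semantics: pack slides a length-k window
# over the sequence and flattens each window into one row (consistent with
# the 'i + k // 2' centering above), and the median distance is taken over
# all pairwise Euclidean distances between packed rows:
def pack_sketch(sequence, k):
    sequence = np.asarray(sequence)
    num_windows = sequence.shape[0] - k + 1
    return np.stack([sequence[i:i + k].ravel()
                     for i in range(num_windows)])

def get_median_pairwise_distance_sketch(samples):
    from scipy.spatial.distance import pdist
    return np.median(pdist(samples))
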
Example 8
import os
import sys
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt

sys.path.append(
    '/halo_nobackup/image-content/ameyasd/europa-onboard-science/src/caps_els/'
)  # Hack to import correctly.
from data_utils import get_ELS_data

DATA_DIR = '/halo_nobackup/image-content/ameyasd/crossings_updated/data/'
OUTPUT_DIR = '/halo_nobackup/image-content/ameyasd/optimization/training_set_statistics/'
file_list = set([
    os.path.splitext(file_name)[0] for file_name in os.listdir(DATA_DIR)
    if '2004' in file_name
])

energy_ranges = {}

for file_no_ext in file_list:
    file_full = DATA_DIR + file_no_ext + '.DAT'
    counts, file_energy_range, times = get_ELS_data(file_full, 'anode5',
                                                    datetime.min, datetime.max)

    min_energy_range = np.min(file_energy_range[0])
    max_energy_range = np.max(file_energy_range[0])

    energy_ranges[file_no_ext] = [min_energy_range, max_energy_range]

with open(OUTPUT_DIR + 'energy_ranges.txt', 'w') as f:
    f.write('File MinER MaxER \n')
    for file_no_ext, file_stats in energy_ranges.items():
        min_energy_range, max_energy_range = file_stats
        f.write('%s %0.10f %0.10f \n' %
                (file_no_ext, min_energy_range, max_energy_range))

for index, file_no_ext in enumerate(energy_ranges):
    plt.plot([index, index], energy_ranges[file_no_ext], lw=10)
Example 9
def plot_interpolated_ELS_data(figure, axes, els_data_file, quantity, start_time=datetime.min, end_time=datetime.max, colorbar_range='subset', colorbar_orientation='vertical', verbose=True, **kwargs):
    """
    Plots interpolated ELS data in a suitable time range on the given figure and axes.

    :param figure: figure matplotlib object
    :param axes: axes matplotlib object
    :param els_data_file: path of ELS .DAT file
    :param quantity: string indicating the quantity to be extracted from the ELS object
    :param start_time: datetime object indicating the start time of data to plot
    :param end_time: datetime object indicating the end time of data to plot
    :param colorbar_range: string indicating whether to set the colorbar range from the entire raw data ('full'), the raw subset being plotted ('subset'), the entire interpolated data ('interpolated_full'), or the interpolated subset being plotted ('interpolated_subset').
    :param colorbar_orientation: string indicating the orientation of the colorbar
    :param verbose: boolean indicating whether to print logging lines.
    :param blur_sigma: Parameter sigma (in timesteps) for the Gaussian kernel.
    :param bin_selection: Selection of ELS bins.
    :param filter: Filter to be applied bin-wise after the Gaussian blur.
    :param filter_size: Size of the filter to be applied after the Gaussian blur.
    """

    # We have to import here, because of an ImportError due to cyclic dependencies.
    from data_utils import get_ELS_data

    # Extract data.
    counts, energy_ranges, times = get_ELS_data(els_data_file, quantity, start_time, end_time, **kwargs)

    # Colorbar range.
    if colorbar_range == 'full':

        # Set colorbar max and min based on the entire *raw* ELS data in this file.
        els_object = ELS(els_data_file)
        raw_counts = parse_quantity(els_object, quantity)[0]
        raw_counts = raw_counts[~np.isnan(raw_counts)]
        vmin = np.min(raw_counts[raw_counts > 0])
        vmax = np.max(raw_counts)

    elif colorbar_range == 'subset':

        # Set colorbar max and min based on the *raw* subset being plotted.
        els_object = ELS(els_data_file)
        raw_counts = parse_quantity(els_object, quantity)[0]
    
        # If a datetime object, convert to a matplotlib float date.
        try:
            xmin = mdates.date2num(start_time)
            xmax = mdates.date2num(end_time)
        except AttributeError:
            xmin = start_time
            xmax = end_time
    
        mds = mdates.date2num(els_object.start_date)
        keep = np.where((mds >= xmin) & (mds <= xmax))[0]
        raw_counts = raw_counts[keep, :]
        raw_counts = raw_counts[~np.isnan(raw_counts)]
        vmin = np.min(raw_counts[raw_counts > 0])
        vmax = np.max(raw_counts)

    elif colorbar_range == 'interpolated_full':

        # Set colorbar max and min based on the entire *interpolated* ELS data in this file.
        all_counts = get_ELS_data(els_data_file, quantity, datetime.min, datetime.max)[0]
        vmin = np.min(all_counts[all_counts > 0])
        vmax = np.max(all_counts)

    elif colorbar_range == 'interpolated_subset':

        # Set colorbar max and min based on the *interpolated* subset being plotted.
        vmin = np.min(counts[counts > 0])
        vmax = np.max(counts)

    else:
        raise ValueError('Invalid value for \'colorbar_range\'.')

    if verbose:
        print('Colorbar Range:')
        print('- vmin = %0.2f' % vmin)
        print('- vmax = %0.2f' % vmax)

    # Plot.
    mesh = axes.pcolormesh(times, energy_ranges[0], counts.T, norm=LogNorm(vmin=vmin, vmax=vmax))

    # Add labels and ticks.
    axes.set_aspect('auto')
    axes.set_yscale('log')
    axes.set_xlabel('Date/Time')
    axes.set_ylabel('Energy (eV/q)')
    axes.xaxis_date()
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y/%H:%M'))
    axes.xaxis.set_tick_params(labelsize=8)

    # Tilts dates to the left for easier reading.
    plt.setp(axes.get_xticklabels(), rotation=30, ha='right')

    # Add colorbar with label.
    cbar = add_colorbar(mesh, figure, axes, colorbar_orientation)
    cbar.set_label('Interpolated Counts / s')
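
# add_colorbar is defined elsewhere in this module. A minimal sketch,
# assuming it wraps figure.colorbar for the given mesh and returns the
# colorbar object so the caller can set its label, as done above:
def add_colorbar_sketch(mesh, figure, axes, orientation='vertical'):
    return figure.colorbar(mesh, ax=axes, orientation=orientation)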