def check_labels(els_file_name, labels_file_name):
    # Check if the labels file exists.
    if not os.path.exists(labels_file_name):
        return False

    # Load labels.
    with open(labels_file_name, 'r') as labels_file_object:
        labels = yaml.safe_load(labels_file_object)
    crossings = labels['change_points']

    # Convert to float (unit days).
    crossing_floats = datestring_to_float(crossings)

    # Check if at least one label is valid.
    atleast_one_valid_label = False
    for crossing_float in crossing_floats:
        try:
            times = get_ELS_data(els_file_name, quantity='anode5',
                                 start_time=datetime.min,
                                 end_time=datetime.max)[2]
            if times[0] <= crossing_float <= times[-1]:
                atleast_one_valid_label = True
                break
        except ValueError:
            pass

    return atleast_one_valid_label
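
# Hedged usage sketch for check_labels(): both paths below are hypothetical
# placeholders for an ELS .DAT file and its YAML change-point labels, not
# files shipped with this repository.
def _example_check_labels():
    if check_labels('example_ELS_file.DAT', 'example_labels.yaml'):
        print 'At least one labelled crossing lies within the data time range.'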
def compute_pca_components(BLUR_SIGMA, BIN_SELECTION, FILTER, FILTER_SIZE,
                           TRANSFORM, **kwargs):
    # We compute PCA components using all the training data.
    file_list = list_of_ELS_files(DATA_DIR, CROSSINGS_DIR + 'new_labels/all/',
                                  MODE.replace('test', 'train'))

    # Append all ELS data one-by-one.
    all_counts = None
    for data_file in file_list:
        file_full = DATA_DIR + data_file
        counts, energy_range, times = get_ELS_data(file_full, 'anode5',
                                                   datetime.min, datetime.max,
                                                   blur_sigma=BLUR_SIGMA,
                                                   bin_selection=BIN_SELECTION,
                                                   filter=FILTER,
                                                   filter_size=FILTER_SIZE)
        if all_counts is None:
            all_counts = counts
        else:
            all_counts = np.append(all_counts, counts, axis=0)

    # Apply transformation.
    all_counts = Transformation(TRANSFORM).transform(all_counts)

    # Learn PCA components from data.
    pca = PCA(n_components=10)
    pca.fit(all_counts)

    # We compute PCA components for these dimensions only.
    return {
        n_components: pca.components_[:n_components]
        for n_components in [1, 2, 5, 10]
    }
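
# Hedged usage sketch: compute_pca_components() returns a dict mapping the
# number of retained components to the corresponding PCA basis (each entry of
# shape (n_components, n_energy_bins)). The keyword values below are
# illustrative assumptions, not defaults taken from this repository.
def _example_pca_components():
    pca_components = compute_pca_components(BLUR_SIGMA=1, BIN_SELECTION='all',
                                            FILTER=None, FILTER_SIZE=0,
                                            TRANSFORM='log')
    return pca_components[2]  # The top two principal directions.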
def main(els_data_file, quantity, start_time, end_time, output_file, **kwargs):
    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time = datetime.min

    if end_time is not None:
        try:
            end_time = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
                second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time = datetime.max

    # Get data.
    data = get_ELS_data(els_data_file, quantity, start_time, end_time, **kwargs)

    # Compute scores.
    times, scores = baseline_analysis(data)

    # Plot ELS data.
    print 'Plotting...'
    fig, axs = plt.subplots(nrows=2, sharex=True)
    plot_interpolated_ELS_data(fig, axs[0], els_data_file, quantity,
                               start_time, end_time,
                               colorbar_range='subset',
                               colorbar_orientation='horizontal', **kwargs)
    axs[0].set_xlabel('')

    # Plot scores.
    axs[1].plot(times, scores)
    axs[1].set_xlabel('Date/Time')
    axs[1].set_ylabel('Change-point Score')
    axs[1].xaxis.set_tick_params(labelsize=8)
    axs[1].margins(0, 0)
    plt.setp(axs[1].get_xticklabels(), rotation=30, ha='right')

    # Place title below.
    if kwargs['filter'] is None:
        title_subtext = 'Blur %d' % (kwargs['blur_sigma'])
    else:
        title_subtext = 'Blur %d, Filter %s of Size %d' % (
            kwargs['blur_sigma'], kwargs['filter'], kwargs['filter_size'])
    fig.text(s='Change-point Scores for ELS Data \n %s' % title_subtext,
             x=0.5, y=0.03, horizontalalignment='center', fontsize=13)
    plt.subplots_adjust(bottom=0.3, left=0.2)

    # Save plot.
    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file, bbox_inches='tight')
def main(els_data_file, output_file, quantity, start_time, end_time, run_tests,
         plot_processed_sequence, window_size, discord_dimensions,
         num_pca_components, ignored_dimensions, required_dimensions,
         std_noise):
    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time = datetime.min

    if end_time is not None:
        try:
            end_time = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
                second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time = datetime.max

    # Run doctests.
    if run_tests:
        import doctest
        import data_utils
        doctest.testmod(data_utils, verbose=True,
                        optionflags=doctest.NORMALIZE_WHITESPACE)
        doctest.testmod(verbose=True,
                        optionflags=doctest.NORMALIZE_WHITESPACE)

    # Set random seed for reproducibility.
    random_seed = 7
    np.random.seed(random_seed)

    # Load data from ELS DAT file.
    ELS_data = get_ELS_data(els_data_file, quantity, start_time, end_time)

    # Get matrix profile, padded to match length of the original sequence.
    times, profile = matrix_profile(
        ELS_data, window_size, discord_dimensions, std_noise,
        num_pca_components=num_pca_components,
        ignored_dimensions=ignored_dimensions,
        required_dimensions=required_dimensions,
        plot_processed_sequence=plot_processed_sequence,
        verbose=True)

    # Plot change-point scores over windows as well as the original data.
    print 'Plotting...'
    fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True)
    plot_raw_ELS_data(fig, ax0, els_data_file, quantity, start_time, end_time,
                      colorbar_range='subset',
                      colorbar_orientation='horizontal')
    ax1.plot(times, profile)
    ax1.set_ylabel('Non-Self NN-Distance')
    ax1.xaxis.set_tick_params(labelsize=8)

    # Add only supplied parameters to title.
    parameter_strings = [
        'Window Size = %d', 'Dimensions = %d', 'PCA Components = %d',
        'Noise Correction = %0.2f'
    ]
    parameters = [window_size, discord_dimensions, num_pca_components,
                  std_noise]
    parameter_string = ', '.join([
        parameter_string % parameter
        for parameter, parameter_string in zip(parameters, parameter_strings)
        if parameter is not None
    ])
    title = 'Matrix Profile on CAPS ELS \n %s' % parameter_string

    # Place title below.
    fig.text(s=title, y=0.03, x=0.5, horizontalalignment='center', fontsize=13)
    plt.subplots_adjust(bottom=0.3, left=0.2)

    # Save plot.
    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file, bbox_inches='tight')
def main(data_file, results_file, algorithm_name, dataset, quantity,
         start_time, end_time, anomaly_type, blur_sigma, bin_selection,
         filter, filter_size, **kwargs):
    # Seed for reproducibility.
    np.random.seed(7)

    # Create the directory for the results file.
    results_directory = os.path.dirname(results_file)
    if results_directory != '' and not os.path.exists(results_directory):
        Path(results_directory).mkdir(parents=True, exist_ok=True)

    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time_dt = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time_dt = datetime.min

    if end_time is not None:
        try:
            end_time_dt = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
                second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time_dt = datetime.max

    # The time-series data loaded.
    if dataset == 'els':
        print 'ELS Bin Selection:', bin_selection
        data = get_ELS_data(data_file, quantity, start_time_dt, end_time_dt,
                            blur_sigma, bin_selection, filter, filter_size)
    elif dataset == 'intel':
        data = get_Intel_data(data_file, 'temperature', start_time_dt,
                              end_time_dt, downsample_rate='20min',
                              drop_sensors=[5, 15, 18])
    else:
        raise ValueError('Invalid dataset.')

    # Update start and end times, to match the data.
    _, _, datatimes = data
    start_time = datatimes[0]
    end_time = datatimes[-1]
    print 'Data start time:', float_to_datestring(start_time)
    print 'Data end time:', float_to_datestring(end_time)

    # The function representing the entry point for the algorithm.
    func = algorithm_name_map[algorithm_name]

    # The list of arguments this function expects.
    # We exclude the first argument, which will be the input data.
    args = signature(func).parameters.items()[1:]

    # Arguments (parameters) being passed from the command-line.
    # Each argument should have a default value in the original function,
    # or be set from the command-line here.
    for arg, val in args:
        if val.default is Parameter.empty and kwargs.get(arg) is None:
            raise ValueError('Please pass a value for argument %s.' % arg)
    parameters = {arg: kwargs.get(arg)
                  for arg, val in args if kwargs.get(arg) is not None}

    # Add the anomaly type as a parameter.
    # parameters['anomaly_type'] = anomaly_type

    # Call function with arguments!
    print 'Evaluating function with given parameters...'
    results, time_taken = timed(func)(data, **parameters)
    times, scores = results
    print 'Function evaluation complete!'

    # Save results (and metadata) to file.
    data_dict = {
        'times': times,
        'scores': scores,
        'time_taken': time_taken,
        'anomaly_type': anomaly_type,
        'quantity': quantity,
        'start_time': start_time,
        'end_time': end_time,
        'data_file': data_file,
        'dataset': dataset,
        'algorithm_name': formatted_algorithm_names[algorithm_name],
        'parameters': format_dict(parameters),
        'blur_sigma': blur_sigma,
        'bin_selection': bin_selection,
        'filter': filter,
        'filter_size': filter_size,
    }
    with h5py.File(results_file, 'w') as f:
        for key, val in data_dict.items():
            f.create_dataset(key, data=val)

    print('Results saved to %s. Use evaluate_methods.py/evaluate_methods_time_tolerance.py to analyze results.'
          % results_file)
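
# Hedged sketch of reading back a results file written by main() above. The
# keys mirror data_dict; results_file is whatever path was passed on the
# command line.
def _example_read_results(results_file):
    with h5py.File(results_file, 'r') as f:
        return f['times'][()], f['scores'][()], f['time_taken'][()]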
def main(els_data_file, output_file, quantity, start_time, end_time, hmm_type,
         num_states, num_pca_components, show_information_curves,
         visualize_states, stickiness, alpha, gamma, mixture_model, **kwargs):
    # Random seed for reproducibility.
    random_seed = 7
    np.random.seed(random_seed)

    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time = datetime.min

    if end_time is not None:
        try:
            end_time = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
                second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time = datetime.max

    # Get data, and unpack.
    counts, energy_ranges, times = get_ELS_data(els_data_file, quantity,
                                                start_time, end_time, **kwargs)

    # Segment with an HMM.
    model, states, log_likelihood, states_dist, scores = \
        hmm_analysis((counts, energy_ranges, times), hmm_type=hmm_type,
                     num_states=num_states,
                     num_pca_components=num_pca_components,
                     stickiness=stickiness, alpha=alpha, gamma=gamma,
                     mixture_model=mixture_model, verbose=True,
                     evaluation=False)

    # Intervals of states to plot.
    states_dict = array_to_intervals(states)
    print '%d HMM states used to segment the time-series.' % len(states_dict)

    # Print individual state durations.
    for state, intervals in states_dict.iteritems():
        print 'State %d durations: %s timesteps.' % (
            state,
            np.sum([interval[1] - interval[0] for interval in intervals]))

    # Plot ELS data on top.
    fig, (ax0, ax1, ax2) = plt.subplots(nrows=3, sharex=True)
    plot_interpolated_ELS_data(fig, ax0, els_data_file, quantity, start_time,
                               end_time, colorbar_range='subset',
                               colorbar_orientation='horizontal', **kwargs)
    ax0.set_xlabel('')

    # Plot segments in different colours. We don't want to repeat colors across states.
    colors = cm.get_cmap('Set3')
    labelled_states = set()
    for index, (state, intervals) in enumerate(states_dict.iteritems()):
        for interval in intervals:
            if state in labelled_states:
                ax1.axvspan(times[interval[0]], times[interval[1]],
                            color=colors(index))
            else:
                ax1.axvspan(times[interval[0]], times[interval[1]],
                            color=colors(index), label=state)
                labelled_states.add(state)
    ax1.margins(x=0)
    ax1.set_yticks([])
    ax1.set_ylabel('HMM States', labelpad=10)
    ax1.legend(title='HMM States', bbox_to_anchor=(1.28, 0.5),
               loc='center right', fontsize=8)

    # Plot scores.
    ax2.plot(times, scores)
    ax2.set_ylabel('Scores')
    ax2.xaxis.set_tick_params(labelsize=8)
    ax2.set_xlabel('Datetime')
    plt.setp(ax2.get_xticklabels(), rotation=30, ha='right')

    # Place title below.
    formatted_name = {
        'vanilla': 'Vanilla',
        'hdp': 'HDP',
        'stickyhdp': 'Sticky HDP',
    }
    hmm_parameters = {
        'vanilla': ['num_states', 'num_pca_components'],
        'hdp': ['num_states', 'num_pca_components', 'alpha', 'gamma'],
        'stickyhdp': ['num_states', 'num_pca_components', 'alpha', 'gamma',
                      'stickiness'],
    }
    parameter_strings = {
        'num_states': 'HMM States = %d',
        'num_pca_components': 'PCA Components = %d',
        'stickiness': 'Stickiness = %0.2f',
        'alpha': 'Dirichlet Alpha = %0.2f',
        'gamma': 'Dirichlet Gamma = %0.2f',
    }

    # Add only supplied parameters to title.
    parameters = hmm_parameters[hmm_type]
    parameter_string_all = ', '.join([
        parameter_strings[parameter] % locals()[parameter]
        for parameter in parameters
    ])
    title = 'CAPS ELS Segmentation with %s-HMM \n %s \n Log-Likelihood = %0.2f' \
        % (formatted_name[hmm_type], parameter_string_all, log_likelihood)
    fig.text(s=title, x=0.5, y=0.03, horizontalalignment='center', fontsize=13)
    plt.subplots_adjust(bottom=0.4, left=0.1, right=0.8)

    # Save plot.
    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file, bbox_inches='tight')

    # Visualization of individual states.
    if visualize_states:
        # Get state-wise emission parameters, to compute distances between states.
        states_map = {index: state for index, state in enumerate(states_dict)}
        num_actual_states = len(states_map)
        all_emission_params = {
            index: model.emission_params(state)
            for index, state in enumerate(states_dict)
        }
        dissimilarities = np.zeros((num_actual_states, num_actual_states))
        for (index1, params1), (index2, params2) in itertools.product(
                all_emission_params.iteritems(),
                all_emission_params.iteritems()):
            dissimilarities[index1][index2] = \
                kl_divergence_normals(params1['mu'], params1['sigma'],
                                      params2['mu'], params2['sigma']) \
                + kl_divergence_normals(params2['mu'], params2['sigma'],
                                        params1['mu'], params1['sigma'])

        transformed_states = MDS(
            dissimilarity='precomputed').fit_transform(dissimilarities)
        spacing = 5 / np.max(transformed_states)
        colors = cm.get_cmap('Set3')
        for index, _ in enumerate(transformed_states):
            plt.scatter(transformed_states[index, 0],
                        transformed_states[index, 1],
                        label=states_map[index], color=colors(index))
            plt.text(transformed_states[index, 0],
                     transformed_states[index, 1] + spacing,
                     states_map[index], fontsize=8)
        plt.title('MDS Plot of HMM States')
        plt.show()

        # Recreate PCA object.
        apply_pca = num_pca_components is not None and num_pca_components > 0
        if apply_pca:
            pca = PCA(n_components=num_pca_components)
            pca.fit(counts)
            data_mean = np.mean(counts, axis=0)

        # Samples hidden states to visualize them.
        num_samples = 50
        samples_dict = {}
        for state in states_dict:
            # Sample from the learned observation distribution for each state.
            samples_dict[state] = model.generate_samples(state, num_samples)

            # If we applied PCA, project back into the original number of dimensions.
            if apply_pca:
                samples_dict[state] = reconstruct_from_PCA(
                    samples_dict[state], data_mean, pca)

        # Set minimum and maximum values across plots for consistency.
        samples_list = list(samples_dict.itervalues())
        vmin = np.percentile(samples_list, 5)
        vmax = np.percentile(samples_list, 95)

        # Plot the projected samples!
        for state in states_dict:
            plt.ylabel('Energies')
            plt.imshow(samples_dict[state].T,
                       extent=[0, num_samples, 0, len(energy_ranges[0])],
                       origin='upper', interpolation='none',
                       vmin=vmin, vmax=vmax)
            plt.yticks(np.arange(0, len(energy_ranges[0]), 5),
                       energy_ranges[0][::5])
            plt.title('HMM State %d \n Transformed Samples' % state)
            cbar = plt.colorbar(orientation='vertical')
            cbar.set_label('Counts')
            plt.show()

        # Bunch up all the samples for all the states.
        all_samples = np.vstack(
            [samples_dict[state].flatten() for state in states_dict])
        # statewise_means = np.expand_dims(np.mean(all_samples, axis=1), axis=1)
        # statewise_devs = np.expand_dims(np.std(all_samples, axis=1), axis=1)
        normalized_samples = (all_samples - np.mean(all_samples))

        # Compute the SVD - Singular Value Decomposition.
        U, S, VH = np.linalg.svd(normalized_samples, full_matrices=False)

        # Compute reconstruction errors from an SVD.
        for num_svd_components in [1, 2, 5, 8, 10]:
            # Don't have these many components!
            if num_svd_components > num_actual_states:
                break

            # Compute reconstruction using these many SVD components.
            reconstruction = np.matmul(
                U[:, :num_svd_components] * S[:num_svd_components],
                VH[:num_svd_components, :])
            reconstruction_errors = np.sqrt(
                np.mean(np.square(reconstruction - normalized_samples), axis=1))

            # Plot mean reconstruction error as a function of the states.
            used_states = list(states_dict.iterkeys())
            plt.scatter(used_states, reconstruction_errors)
            plt.ylabel('Mean Reconstruction Error Over All Samples')
            plt.xlabel('HMM State Number')
            plt.title('Reconstruction Error after Retaining %d SVD Components'
                      % num_svd_components)
            plt.ylim(bottom=0)
            plt.xticks(used_states)
            plt.show()

    # Plots the AIC and BIC values, as the number of states is varied.
    if show_information_curves:
        print 'Plotting AIC and BIC curves...'
        bics = []
        aics = []
        num_states_range = np.arange(1, 21)
        for num_states in num_states_range:
            model, _, _, _, _ = hmm_analysis(
                (counts, energy_ranges, times), hmm_type=hmm_type,
                num_states=num_states,
                num_pca_components=num_pca_components,
                evaluation=False)

            # Compute AIC and BIC criteria.
            bics.append(model.bic_value())
            aics.append(model.aic_value())

        fig, ax = plt.subplots(nrows=1)
        ax.plot(num_states_range, bics, label='BIC')
        ax.plot(num_states_range, aics, label='AIC')
        ax.legend()
        ax.set_xticks(num_states_range)
        ax.set_ylabel('Information Criteria Value')
        ax.set_xlabel('Number of States')

        # Add only supplied parameters to title.
        # Only PCA components and stickiness are reported here, since the
        # number of states is what is being varied on the x-axis.
        parameter_strings = ['PCA Components = %d', 'Stickiness = %0.2f']
        parameters = [num_pca_components, stickiness]
        parameter_string = ', '.join([
            parameter_string % parameter
            for parameter, parameter_string in zip(parameters, parameter_strings)
            if parameter is not None
        ])
        title = 'Selecting the Number of States via Information Criteria \n %s-HMM \n %s' % (
            formatted_name[hmm_type], parameter_string)
        ax.set_title(title)
        plt.show()
def main(els_data_file, output_file, perform_hyperparameter_estimation,
         load_from_file, save_to_file, quantity, start_time, end_time,
         run_tests, plot_processed_sequence, k, n):
    # Check input arguments - start and end times should be valid.
    if start_time is not None:
        try:
            start_time = datetime.strptime(start_time, '%d-%m-%Y/%H:%M')
        except ValueError:
            raise
    else:
        start_time = datetime.min

    if end_time is not None:
        try:
            end_time = datetime.strptime(end_time, '%d-%m-%Y/%H:%M').replace(
                second=59, microsecond=999999)
        except ValueError:
            raise
    else:
        end_time = datetime.max

    # Run doctests.
    if run_tests:
        import doctest
        import data_utils
        doctest.testmod(data_utils, verbose=True,
                        optionflags=doctest.NORMALIZE_WHITESPACE)
        doctest.testmod(verbose=True,
                        optionflags=doctest.NORMALIZE_WHITESPACE)

    # RuLSIF parameter.
    alpha = 0.1

    # Set random seed for reproducibility.
    random_seed = 7
    np.random.seed(random_seed)

    # Load processed sequence, if file found.
    els_sequence_file = os.path.splitext(els_data_file)[0] + '_RuLSIF_sequence'
    if load_from_file and os.path.exists(els_sequence_file + '.npz'):
        print 'Loading processed sequence from sequence file...'
        filedata = np.load(els_sequence_file + '.npz')
        counts_packed = filedata['counts_packed']
        energy_range = filedata['energy_range']
        times = filedata['times']
        dmed = filedata['dmed']
    else:
        print 'Sequence file not found. Extracting data from original ELS file and processing...'
        counts, energy_range, times = get_ELS_data(els_data_file, quantity,
                                                   start_time, end_time)
        # import pdb; pdb.set_trace()  # For debugging.

        # Process counts.
        # counts = gaussian_blur(counts, sigma=0.5)
        # counts = np.ma.log(counts)

        # See the sequence plotted (lineplot for 1D data, colourplot for 2D data).
        if plot_processed_sequence:
            print 'Plotting processed sequence...'
            fig, ax = plt.subplots(1, 1)
            ax.set_title('Processed Sequence')
            if len(counts.shape) == 1:
                ax.xaxis_date()
                ax.xaxis.set_major_formatter(
                    mdates.DateFormatter('%d-%m-%Y/%H:%M'))
                fig.autofmt_xdate()
                ax.plot(times, counts)
            elif len(counts.shape) == 2:
                plt.imshow(counts.T, origin='lower', interpolation='none')
                ax.set_aspect('auto')
                plt.colorbar(ax=ax, orientation='vertical')
            plt.show()

        # Pack sequence into blocks.
        print 'Packing sequence into blocks...'
        counts_packed = pack(counts, k)
        print 'Sequence packed into shape %s.' % (counts_packed.shape, )

        # Median distance between subsequences.
        print 'Computing median distance between packed samples...'
        dmed = get_median_pairwise_distance(counts_packed)
        print 'Median distance between packed samples, dmed =', dmed

        # Save values to file.
        if save_to_file:
            arrays_with_names = {
                'counts_packed': counts_packed,
                'energy_range': energy_range,
                'times': times,
                'dmed': np.array(dmed)
            }
            np.savez(els_sequence_file, **arrays_with_names)

    # Range of values the hyperparameters were supposed to take, according to the reference.
    sigma_range = np.array([dmed])
    sigma_forward_range = sigma_backward_range = sigma_range
    lambda_range = np.array([1e-3, 1e-2, 1e-1, 1e0, 1e1])
    lambda_forward_range = lambda_backward_range = lambda_range

    # Restrict range further by taking the most common hyperparameters selected for fitting random samples.
    if perform_hyperparameter_estimation:
        els_hyperparameters_file = os.path.splitext(
            els_data_file)[0] + '_RuLSIF_hyperparameters'
        if load_from_file and os.path.exists(els_hyperparameters_file + '.npz'):
            print 'Hyperparameters file found. Loading from file...'
            filedata = np.load(els_hyperparameters_file + '.npz')
            sigma_forward_range = filedata['sigma_forward_range']
            sigma_backward_range = filedata['sigma_backward_range']
            lambda_forward_range = filedata['lambda_forward_range']
            lambda_backward_range = filedata['lambda_backward_range']
        else:
            print 'Hyperparameters file not found. Performing estimation...'
            sigma_forward_range, sigma_backward_range, \
                lambda_forward_range, lambda_backward_range = \
                estimate_hyperparameters(counts_packed, window_size=n,
                                         sigma_range=sigma_range,
                                         lambda_range=lambda_range,
                                         alpha=alpha, num_rank=2)
            if save_to_file:
                arrays_with_names = {
                    'sigma_forward_range': sigma_forward_range,
                    'sigma_backward_range': sigma_backward_range,
                    'lambda_forward_range': lambda_forward_range,
                    'lambda_backward_range': lambda_backward_range
                }
                np.savez(els_hyperparameters_file, **arrays_with_names)

    print 'Hyperparameters will be selected from the ranges:'
    print 'sigma_forward_range =', sigma_forward_range
    print 'sigma_backward_range =', sigma_backward_range
    print 'lambda_forward_range =', lambda_forward_range
    print 'lambda_backward_range =', lambda_backward_range

    # Change-point scores.
    packed_sequence_size = counts_packed.shape[0]
    original_sequence_size = counts.shape[0]
    scores = np.ma.masked_all(original_sequence_size)

    # Start timing here.
    timing_start = datetime.now()

    # Sliding-window over packed sequence.
    for i in range(n, packed_sequence_size - n + 1):
        forward_window = counts_packed[i:i + n]
        backward_window = counts_packed[i - n:i]

        forward_density_obj = densratio(backward_window, forward_window,
                                        alpha=alpha,
                                        sigma_range=sigma_forward_range,
                                        lambda_range=lambda_forward_range,
                                        verbose=False)
        forward_divergence = forward_density_obj.alpha_PE

        backward_density_obj = densratio(forward_window, backward_window,
                                         alpha=alpha,
                                         sigma_range=sigma_backward_range,
                                         lambda_range=lambda_backward_range,
                                         verbose=False)
        backward_divergence = backward_density_obj.alpha_PE

        change_point_score = forward_divergence + backward_divergence

        # Use larger range of hyperparameters if we can't get a good fit with the smaller one.
        if change_point_score < 0:
            print 'Bad fit with forward sigma = %0.2f, backward sigma = %0.2f.' % (
                forward_density_obj.kernel_info.sigma,
                backward_density_obj.kernel_info.sigma)
            sigma_range = np.array([
                0.7 * dmed, 0.8 * dmed, 0.9 * dmed, dmed, 1.1 * dmed,
                1.2 * dmed, 1.3 * dmed
            ])
            forward_density_obj = densratio(backward_window, forward_window,
                                            alpha=alpha,
                                            sigma_range=sigma_range,
                                            verbose=False)
            forward_divergence = forward_density_obj.alpha_PE
            backward_density_obj = densratio(forward_window, backward_window,
                                             alpha=alpha,
                                             sigma_range=sigma_range,
                                             verbose=False)
            backward_divergence = backward_density_obj.alpha_PE
            change_point_score = forward_divergence + backward_divergence
            print 'Tried again with forward sigma = %0.2f, backward sigma = %0.2f.' % (
                forward_density_obj.kernel_info.sigma,
                backward_density_obj.kernel_info.sigma)

        scores[i + k // 2] = change_point_score
        print 'Change-point score at time %s computed as %0.4f.' % (
            datetime.strftime(mdates.num2date(times[i]), '%d-%m-%Y/%H:%M'),
            change_point_score)

    # End time.
    timing_end = datetime.now()

    # Compute average time taken.
    total_time = (timing_end - timing_start).total_seconds()
    num_evals = packed_sequence_size - 2 * n + 1
    print '%0.2f seconds taken for %d change-point score evaluations. Average is %0.2f evals/sec, with k = %d, and n = %d.' % \
        (total_time, num_evals, num_evals / total_time, k, n)

    # Mask negative change-point scores.
    scores = np.ma.masked_less(scores, 0)

    # Plot change-point scores over windows as well as the original data.
    print 'Plotting...'
    fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True)
    plot_raw_ELS_data(fig, ax0, els_data_file, quantity, start_time, end_time,
                      colorbar_range='subset',
                      colorbar_orientation='horizontal')
    ax1.plot(times, scores)
    ax1.set_ylabel('Change-point Score')
    ax1.xaxis.set_tick_params(labelsize=8)

    # Place title below.
    fig.text(s='Change-point Scores for ELS Data \n k = %d, n = %d' % (k, n),
             x=0.5, y=0.03, horizontalalignment='center', fontsize=13)
    plt.subplots_adjust(bottom=0.3, left=0.2)

    # Save plot.
    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file, bbox_inches='tight')

    # Save scores.
    if save_to_file:
        rulsif_output_file = os.path.splitext(
            els_data_file)[0] + '_RuLSIF_output'
        arrays_with_names = {'scores': scores, 'times': times}
        np.savez(rulsif_output_file, **arrays_with_names)
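
# Hedged sketch: reloading the saved RuLSIF scores for later analysis. The
# path follows the '<ELS file>_RuLSIF_output.npz' convention used above; the
# helper name itself is illustrative.
def _example_load_rulsif_scores(els_data_file):
    saved = np.load(os.path.splitext(els_data_file)[0] + '_RuLSIF_output.npz')
    return saved['times'], saved['scores']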
    '/halo_nobackup/image-content/ameyasd/europa-onboard-science/src/caps_els/'
)  # Hack to import correctly.
from data_utils import get_ELS_data

DATA_DIR = '/halo_nobackup/image-content/ameyasd/crossings_updated/data/'
OUTPUT_DIR = '/halo_nobackup/image-content/ameyasd/optimization/training_set_statistics/'

file_list = set([
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(DATA_DIR)
    if '2004' in file_name
])

energy_ranges = {}
for file_no_ext in file_list:
    file_full = DATA_DIR + file_no_ext + '.DAT'
    counts, file_energy_range, times = get_ELS_data(file_full, 'anode5',
                                                    datetime.min, datetime.max)
    min_energy_range = np.min(file_energy_range[0])
    max_energy_range = np.max(file_energy_range[0])
    energy_ranges[file_no_ext] = [min_energy_range, max_energy_range]

with open(OUTPUT_DIR + 'energy_ranges.txt', 'w') as f:
    f.write('File MinER MaxER \n')
    for file_no_ext, file_stats in energy_ranges.items():
        min_energy_range, max_energy_range = file_stats
        f.write('%s %0.10f %0.10f \n' %
                (file_no_ext, min_energy_range, max_energy_range))

for index, file_no_ext in enumerate(energy_ranges):
    plt.plot([index, index], energy_ranges[file_no_ext], lw=10)
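
# Hedged sketch: parsing the energy_ranges.txt table written above back into a
# dict. It assumes the whitespace-separated 'File MinER MaxER' layout used
# here; the helper name is illustrative.
def _example_read_energy_ranges(path=OUTPUT_DIR + 'energy_ranges.txt'):
    ranges = {}
    with open(path) as f:
        next(f)  # Skip the header line.
        for line in f:
            name, min_er, max_er = line.split()
            ranges[name] = [float(min_er), float(max_er)]
    return ranges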
def plot_interpolated_ELS_data(figure, axes, els_data_file, quantity,
                               start_time=datetime.min, end_time=datetime.max,
                               colorbar_range='subset',
                               colorbar_orientation='vertical',
                               verbose=True, **kwargs):
    """
    Plots interpolated ELS data in a suitable time range on the given figure and axes.

    :param figure: figure matplotlib object
    :param axes: axes matplotlib object
    :param els_data_file: path of ELS .DAT file
    :param quantity: string indicating the quantity to be extracted from the ELS object
    :param start_time: datetime object indicating the start time of data to plot
    :param end_time: datetime object indicating the end time of data to plot
    :param colorbar_range: string indicating whether to use the entire data ('full')
        or only the subset being plotted ('subset') for setting colorbar range.
    :param colorbar_orientation: string indicating the orientation of the colorbar
    :param verbose: boolean indicating whether to print logging lines.
    :param blur_sigma: Parameter sigma (in timesteps) for the Gaussian kernel.
    :param bin_selection: Selection of ELS bins.
    :param filter: Filter to be applied bin-wise after the Gaussian blur.
    :param filter_size: Size of the filter to be applied after the Gaussian blur.
    """
    # We have to import here, because of an ImportError due to cyclic dependencies.
    from data_utils import get_ELS_data

    # Extract data.
    counts, energy_ranges, times = get_ELS_data(els_data_file, quantity,
                                                start_time, end_time, **kwargs)

    # Colorbar range.
    if colorbar_range == 'full':
        # Set colorbar max and min based on the entire *raw* ELS data in this file.
        els_object = ELS(els_data_file)
        raw_counts = parse_quantity(els_object, quantity)[0]
        raw_counts = raw_counts[~np.isnan(raw_counts)]
        vmin = np.min(raw_counts[raw_counts > 0])
        vmax = np.max(raw_counts)
    elif colorbar_range == 'subset':
        # Set colorbar max and min based on the *raw* subset being plotted.
        els_object = ELS(els_data_file)
        raw_counts = parse_quantity(els_object, quantity)[0]

        # If a datetime object, convert to a matplotlib float date.
        try:
            xmin = mdates.date2num(start_time)
            xmax = mdates.date2num(end_time)
        except AttributeError:
            xmin = start_time
            xmax = end_time

        mds = mdates.date2num(els_object.start_date)
        keep = np.where((mds >= xmin) & (mds <= xmax))[0]
        raw_counts = raw_counts[keep, :]
        raw_counts = raw_counts[~np.isnan(raw_counts)]
        vmin = np.min(raw_counts[raw_counts > 0])
        vmax = np.max(raw_counts)
    elif colorbar_range == 'interpolated_full':
        # Set colorbar max and min based on the entire *interpolated* ELS data in this file.
        all_counts = get_ELS_data(els_data_file, quantity,
                                  datetime.min, datetime.max)[0]
        vmin = np.min(all_counts[all_counts > 0])
        vmax = np.max(all_counts)
    elif colorbar_range == 'interpolated_subset':
        # Set colorbar max and min based on the *interpolated* subset being plotted.
        vmin = np.min(counts[counts > 0])
        vmax = np.max(counts)
    else:
        raise ValueError('Invalid value for \'colorbar_range\'.')

    if verbose:
        print('Colorbar Range:')
        print('- vmin = %0.2f' % vmin)
        print('- vmax = %0.2f' % vmax)

    # Plot.
    mesh = axes.pcolormesh(times, energy_ranges[0], counts.T,
                           norm=LogNorm(vmin=vmin, vmax=vmax))

    # Add labels and ticks.
    axes.set_aspect('auto')
    axes.set_yscale('log')
    axes.set_xlabel('Date/Time')
    axes.set_ylabel('Energy (eV/q)')
    axes.xaxis_date()
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y/%H:%M'))
    axes.xaxis.set_tick_params(labelsize=8)

    # Tilts dates to the left for easier reading.
    plt.setp(axes.get_xticklabels(), rotation=30, ha='right')

    # Add colorbar with label.
    cbar = add_colorbar(mesh, figure, axes, colorbar_orientation)
    cbar.set_label('Interpolated Counts / s')
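
# Hedged usage sketch for plot_interpolated_ELS_data(): the .DAT path is a
# hypothetical placeholder, and the figure is shown rather than saved.
def _example_plot_interpolated(els_data_file='example_ELS_file.DAT'):
    fig, ax = plt.subplots()
    plot_interpolated_ELS_data(fig, ax, els_data_file, 'anode5',
                               colorbar_range='interpolated_subset',
                               colorbar_orientation='horizontal')
    plt.show()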