Example 1
def run_workload_characterization(metric_data):
    # Performs workload characterization on the metric_data and returns
    # a set of pruned metrics.
    #
    # Parameters:
    #   metric_data is a dictionary of the form:
    #     - 'data': 2D numpy matrix of metric data (results x metrics)
    #     - 'rowlabels': a list of identifiers for the rows in the matrix
    #     - 'columnlabels': a list of the metric names corresponding to
    #                       the columns in the data matrix

    matrix = metric_data['data']
    columnlabels = metric_data['columnlabels']

    # Remove any constant columns
    nonconst_matrix = []
    nonconst_columnlabels = []
    for col, cl in zip(matrix.T, columnlabels):
        if np.any(col != col[0]):
            nonconst_matrix.append(col.reshape(-1, 1))
            nonconst_columnlabels.append(cl)
    assert len(nonconst_matrix) > 0, "Need more data to train the model"
    nonconst_matrix = np.hstack(nonconst_matrix)
    n_rows, n_cols = nonconst_matrix.shape

    # Bin each column (metric) in the matrix by its decile
    binner = Bin(bin_start=1, axis=0)
    binned_matrix = binner.fit_transform(nonconst_matrix)

    # Shuffle the matrix rows
    shuffle_indices = get_shuffle_indices(n_rows)
    shuffled_matrix = binned_matrix[shuffle_indices, :]

    # Fit factor analysis model
    fa_model = FactorAnalysis()
    # The number of latent variables is set by N_COMPONENTS (currently 5)
    fa_model.fit(shuffled_matrix, nonconst_columnlabels,
                 n_components=N_COMPONENTS)

    # Components: metrics * factors
    components = fa_model.components_.T.copy()

    # Run KMeans for each number of clusters k in range(1, num_nonduplicate_metrics - 1).
    # k should be much smaller than n_cols for the detK heuristic; for now, max_cluster <= 20.
    kmeans_models = KMeansClusters()
    kmeans_models.fit(components, min_cluster=1,
                      max_cluster=min(n_cols - 1, 20),
                      sample_labels=nonconst_columnlabels,
                      estimator_params={'n_init': 50})

    # Compute optimal # clusters, k, using gap statistics
    gapk = create_kselection_model("gap-statistic")
    gapk.fit(components, kmeans_models.cluster_map_)

    # Get the pruned metrics: the closest sample to each cluster center
    pruned_metrics = kmeans_models.cluster_map_[
        gapk.optimal_num_clusters_].get_closest_samples()

    # Return pruned metrics
    return pruned_metrics
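
A minimal usage sketch for the function above. The input dict follows the structure described in its header comment; the data is synthetic, and the helper classes it relies on (Bin, FactorAnalysis, KMeansClusters, create_kselection_model, get_shuffle_indices, N_COMPONENTS) are assumed to be importable from the surrounding project.

import numpy as np

# Synthetic metric data: 100 benchmark results x 6 metrics (illustrative only).
metric_data = {
    'data': np.random.rand(100, 6),
    'rowlabels': ['result_{}'.format(i) for i in range(100)],
    'columnlabels': ['metric_{}'.format(j) for j in range(6)],
}

pruned = run_workload_characterization(metric_data)
print(pruned)  # a subset of 'columnlabels', one representative metric per cluster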
Example 2
def run_PCA(X):
    """Execute Principal Component Analysis. 
    Arg
      X : X data with column, row label. (Matrix)
    
    Return
      components : Result of pca in variance descending order. 
                   (Numpy array, [# of features, # of components]) 
      components_columnlabels : Labels for each componenets. (Numpy array, [# of features,]) 
    """
    
    
    #--------------
    # Execute PCA.
    #--------------
    
    pca = PCA()
    pca.fit(X.data)
    
    #------------------------------
    # Determine number of factors.
    #------------------------------
    
    # Only nonzero components should be considered.
    # (Note: this mask is computed but not used further in this function.)
    pca_mask = np.sum(pca.components_ != 0.0, axis=1) > 0.0

    # Select the number of components that explain at least
    # REQUIRED_VARIANCE_EXPLAINED percent of the variance.
    variances = pca.explained_variance_ratio_
    variances_explained_percent = np.cumsum(variances) * 100
    component_cutoff = np.count_nonzero(variances_explained_percent < REQUIRED_VARIANCE_EXPLAINED) + 1
    component_cutoff = min(component_cutoff, 10)
    
    # Print the per-component and cumulative explained variance ratios.
    print("component cutoff: {}".format(component_cutoff))
    for i, var in enumerate(variances):
        print(i, var, np.sum(variances[:i + 1]))


    #----------------
    # Postprocessing
    #----------------
    
    # Standardization
    components = np.transpose(pca.components_[:component_cutoff]).copy()
    print "components shape: {}".format(components.shape)
    standardizer = StandardScaler()
    components = standardizer.fit_transform(components)
    
    # Shuffle the component matrix rows (metrics x factors).
    metric_shuffle_indices = get_shuffle_indices(components.shape[0])
    components = components[metric_shuffle_indices]
    
    # Shuffle the column labels consistently with the component rows.
    components_columnlabels = X.columnlabels[metric_shuffle_indices]
    
    return components, components_columnlabels
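
For reference, the cumulative-variance cutoff used above can be reproduced with scikit-learn's PCA alone. This standalone sketch uses made-up data and an assumed 90-percent threshold, not the project's Matrix wrapper or constants.

import numpy as np
from sklearn.decomposition import PCA

REQUIRED_VARIANCE_EXPLAINED = 90      # assumed threshold, in percent

data = np.random.rand(50, 8)          # 50 samples x 8 features (synthetic)
pca = PCA()
pca.fit(data)

# Keep the smallest number of components whose cumulative ratio reaches the threshold.
cum_percent = np.cumsum(pca.explained_variance_ratio_) * 100
cutoff = np.count_nonzero(cum_percent < REQUIRED_VARIANCE_EXPLAINED) + 1
cutoff = min(cutoff, 10)
print("components kept:", cutoff)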
Example 3
def run_factor_analysis(X):
    """Execute factor analysis.
    Arg
      X : X data with column, row label. (Matrix)
    
    Return
      components : Result of factor analysis in variance descending order. 
                   (Numpy array, [# of features, # of components]) 
      components_columnlabels : Labels for each componenets. (Numpy array, [# of features,]) 
    """   
    
    
    #-------------------------      
    # Execute factor analysis
    #-------------------------
    
    fa = FactorAnalysis()
    # Pass X.data.T to reduce across the feature axis, or X.data to reduce across the sample axis.
    fa.fit(X.data)


    #-----------------------------
    # Determine number of factors
    #-----------------------------
    
    # Only nonzero components should be considered.
    fa_mask = np.sum(fa.components_ != 0.0, axis=1) > 0.0

    # Calculate each component's variance (actually the sum of absolute loadings)
    # and the total variance.
    variances = np.sum(np.abs(fa.components_[fa_mask]), axis=1)
    total_variance = np.sum(variances).squeeze()
    print("total variance: {}".format(total_variance))
    
    # Select the number of factors that explain at least
    # REQUIRED_VARIANCE_EXPLAINED percent of the variance.
    var_exp = np.cumsum(variances) / total_variance * 100
    factor_cutoff = np.count_nonzero(var_exp < REQUIRED_VARIANCE_EXPLAINED) + 1
    factor_cutoff = min(factor_cutoff, 10)
    print("factor cutoff: {}".format(factor_cutoff))
    for i, var in enumerate(variances):
        print(i, var, np.sum(variances[:i + 1]), np.sum(variances[:i + 1]) / total_variance)


    #----------------
    # Postprocessing
    #----------------
    
    # Standardization
    components = np.transpose(fa.components_[:factor_cutoff]).copy()
    print "components shape: {}".format(components.shape)
    standardizer = StandardScaler()
    components = standardizer.fit_transform(components)
    
    # Shuffle the component matrix rows (metrics x factors).
    metric_shuffle_indices = get_shuffle_indices(components.shape[0])
    components = components[metric_shuffle_indices]
    
    # Shuffle the column labels consistently with the component rows.
    components_columnlabels = X.columnlabels[metric_shuffle_indices] 
    
    return (components, components_columnlabels)
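
A rough call sketch for run_factor_analysis, assuming FactorAnalysis, StandardScaler, get_shuffle_indices, and REQUIRED_VARIANCE_EXPLAINED are available at module level as in the snippets above. The Matrix namedtuple below is a hypothetical stand-in for the project's Matrix type, and the data is synthetic.

from collections import namedtuple
import numpy as np

Matrix = namedtuple('Matrix', ['data', 'rowlabels', 'columnlabels'])  # hypothetical stand-in

X = Matrix(data=np.random.rand(40, 5),
           rowlabels=np.arange(40),
           columnlabels=np.array(['metric_{}'.format(j) for j in range(5)]))

components, labels = run_factor_analysis(X)
print(components.shape)  # (# of features, # of selected factors)
print(labels.shape)      # (# of features,)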