from settings import *
from sklearn.cluster import KMeans
import pickletools
import csvtools


def do_cluster():
    # Load the label names and the normalized feature matrix
    label_names = pickletools.load(labels_filename)
    X_scaled = pickletools.load(normalized_data_filename)

    cluster_count = 10

    print('Clustering...')
    # Note: n_jobs was a valid KMeans argument in the scikit-learn version
    # current at the time; it has since been removed (scikit-learn 1.0+).
    kmeans = KMeans(n_clusters=cluster_count, max_iter=1000, n_jobs=4,
                    tol=0.0001).fit(X_scaled)

    create_lists(cluster_count, kmeans, label_names)
    features = save_centroids(kmeans)
    find_interesting_features(cluster_count, features, kmeans)
def save_centroids(kmeans):
    print('Saving the centroids')
    # The reduced feature names become the CSV header row
    features = pickletools.load(reduced_feature_filename)
    centroids = kmeans.cluster_centers_.tolist()
    centroids.insert(0, features)
    csvtools.save_csv_lines(centroids_filename, centroids)
    return features
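# Hedged sketch (not from the original project): read back the centroids CSV
# written by save_centroids() and print the top-weighted features per cluster.
# Assumes csvtools.save_csv_lines writes a plain comma-separated file and that
# centroids_filename comes from settings, as above.
import csv


def print_top_centroid_features(top_n=5):
    with open(centroids_filename, newline='') as f:
        rows = list(csv.reader(f))
    feature_names = rows[0]
    for cluster_idx, row in enumerate(rows[1:]):
        weights = [float(value) for value in row]
        top = sorted(zip(weights, feature_names), reverse=True)[:top_n]
        print('Cluster {}: {}'.format(
            cluster_idx, ', '.join(name for _, name in top)))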
# Load the header info
urlIdx = find_column_index_from_csv(headerCsvFilename, urlColumnName)
http_method_index = find_column_index_from_csv(headerCsvFilename, http_method_column_name)
orgId_index = find_column_index_from_csv(headerCsvFilename, orgIdColumnName)
appId_index = find_column_index_from_csv(headerCsvFilename, appIdColumnName)

# Get the list of all CSV files
fileList = get_csv_filelist_from_folder(folder_with_csvs)

# Load the list of url parts
url_parts = load_simple_csv_list(url_part_filename)
url_parts.append('')  # It's helpful to keep empty parts

# Load the feature list
features = pickletools.load(feature_list_filename)

# Create a template for feature counts
blank_feature_count_row = [0] * len(features)

# Accumulate feature counts here
feature_counts = {}

# Track if these features are used
feature_used = [False] * len(features)


def create_empty_feature_count():
    # Creates an array of zeroes, sized to the feature counts
    return blank_feature_count_row.copy()
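# Hedged illustration (assumption: the original processing loop is not shown
# here): for each connection key found in the CSV files, the per-feature counts
# would be accumulated roughly like this. connection_key and feature_name are
# hypothetical arguments used only for this example.
def example_count_feature(connection_key, feature_name):
    if connection_key not in feature_counts:
        feature_counts[connection_key] = create_empty_feature_count()
    feature_idx = features.index(feature_name)
    feature_counts[connection_key][feature_idx] += 1
    feature_used[feature_idx] = True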
from settings import *
import pickletools
import numpy as np

# Settings
feature_min_connection_count = 1
feature_min_app_count = 1

# Load data
features = pickletools.load(feature_list_filename)
X_dict = pickletools.load(data_filename)

# Remove the labels
print('Getting labels and values')
labels = list(X_dict.keys())
X = np.array(list(X_dict.values()))

useful_features = []
for f in range(0, len(features)):
    counts_for_this_feature = X[:, f]
    # Indices of the connections that used this feature at least once
    connection_idxs = [i for i, count in enumerate(counts_for_this_feature) if count > 0]
    connection_count = len(connection_idxs)
    # The first 36 characters of a label are the appId (a UUID)
    appIds = list(set(labels[connection_idx][0:36]
                      for connection_idx in connection_idxs))
    app_count = len(appIds)
    call_count = sum(counts_for_this_feature)
    #print('{}/{}/{} - {}'.format(connection_count, app_count, call_count, features[f]))
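    # Hedged sketch (assumption): the thresholds defined above would be applied
    # here to decide which feature columns survive the reduction step.
    if (connection_count >= feature_min_connection_count
            and app_count >= feature_min_app_count):
        useful_features.append(f)

# Hedged sketch (assumption): keep only the useful columns and save the reduced
# artifacts that the later scaling and clustering scripts load
# (reduced_data_filename, reduced_feature_filename, labels_filename).
X_reduced = X[:, useful_features]
reduced_features = [features[f] for f in useful_features]
pickletools.save(reduced_data_filename, X_reduced)
pickletools.save(reduced_feature_filename, reduced_features)
pickletools.save(labels_filename, labels)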
from settings import *
import pickletools
import numpy as np

# Load the data file
#X_dict = pickletools.load(data_filename)
X = pickletools.load(reduced_data_filename)

# Remove the labels
#print('Getting labels and values')
#labels = list(X_dict.keys())
#X = np.array(list(X_dict.values()))

# Scale it
print('Scaling...')
#X_scaled = preprocessing.scale(X)
X_scaled = np.log(X + 1)

#pickletools.save(labels_filename, labels)
pickletools.save(normalized_data_filename, X_scaled)
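# Sanity-check sketch (not in the original): np.log1p computes the same
# log(X + 1) transform with better numerical precision for small values.
assert np.allclose(X_scaled, np.log1p(X))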
""" Created on Mon Sep 18 16:01:33 2017 @author: nick.green """ from settings import * from sklearn import preprocessing import time import pickletools import numpy as np import matplotlib.mlab as mlab import matplotlib.pyplot as plt # Load the data file X_dict = pickletools.load(data_filename) X = np.array(list(X_dict.values())) for feature_idx in range(X.shape[1]): n, bins, patches = plt.hist(X[:,feature_idx], 50, normed=1, facecolor='green', alpha=0.75) plt.draw() plt.savefig(r'plots\feature_hist_{}.png'.format(feature_idx)) plt.show() for idx in range(X.shape[1]): print('{} = {}'.format(idx, sum(X[:,idx])))