def display_vector_index_details(vector_index, vectors, vector_names, vectorizer):
    # Summarize a single vector: the IP it maps to and its reduced positive features.
    # Borrow the feature -> string method from get_common_features_from_cluster by
    # treating the single row as a one-element cluster.
    output = ""
    upf, unf, shared_features = get_common_features_from_cluster(
        vectors[vector_index, :].reshape(1, -1), np.array([0]), vectorizer)
    ip = vector_names[vector_index]
    features = set()
    for feature in reduce_shared_features(shared_features[0]['positive']):
        features.add(" - ".join([i.encode("unicode-escape") for i in feature]))
    features = sorted(list(features))
    output += "IP: {0}\n".format(ip)
    for f in features:
        output += f + "\n"
    return output + "\n"
def display_shared_vector_indeces_details(vector_indeces, vectors, vector_names, vectorizer):
    # Summarize the positive features shared by every vector in vector_indeces.
    # Borrow the feature -> string method from get_common_features_from_cluster by
    # treating the selected rows as a single cluster.
    output = ""
    upf, unf, shared_features = get_common_features_from_cluster(
        vectors[vector_indeces, :], np.array([0] * len(vector_indeces)), vectorizer)
    features = set()
    for feature in reduce_shared_features(shared_features[0]['positive']):
        features.add(" - ".join([i.encode("unicode-escape") for i in feature]))
    features = sorted(list(features))
    if len(features) == 0:
        return "No shared features\n"
    for f in features:
        output += f + "\n"
    return output + "\n"
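# Usage sketch (hypothetical indices; `vectors`, `vector_names`, and `vectorizer` come
# from the vectorization step further down, and the helper functions referenced above
# are defined elsewhere in this script):
#
#     print display_vector_index_details(0, vectors, vector_names, vectorizer)
#     print display_shared_vector_indeces_details([0, 3, 7], vectors, vector_names, vectorizer)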
logging.info("Vectorizing complete") logging.info("Reducing vector dimensions with PCA") normalized_vectors = normalize(vectors) reduced_vectors = pca(vectors) # Cluster the vectors logging.info("Clustering") labels = cluster(vector_names, vectors, reduced_vectors, normalized_vectors, vectorizer, args.strategy, args.method, args.n_clusters, args.epsilon, args.min_samples, args.metric) logging.info("Clustering Complete") # Test cluster validity overall_score, per_cluster_score = validate_clusters(vectors, labels) # Analysis relevant to the person reading results universal_positive_features, universal_negative_features, shared_features = get_common_features_from_cluster(vectors, labels, vectorizer) # Reduce results and relevant information to per cluster data cluster_details = {} for cluster_id in per_cluster_score.keys(): cluster_details[cluster_id] = { "silhouette": per_cluster_score[cluster_id], "shared_positive_features": shared_features[cluster_id]['positive'], #"shared_negative_features": shared_features[cluster_id]['negative'], "ips": [vector_names[x] for x in xrange(len(vector_names)) if labels[x] == cluster_id] } print_cluster_details(cluster_details, shared_features) if args.plot: create_plot(reduced_vectors, labels, vector_names)