# Script: parse an HDFS log, turn the log sequences into features, cluster
# them, and print every cluster that reports possible abnormal events.
from log_parser import LogParser
from feature_extractor import FeatureExtractor
from clustering import Clustering
from idf import IDF

# Parse the raw log into tagged events and log sequences.
parser = LogParser('../../data/HDFS_2K.log')
tagged_events, log_sequences = parser.parse()

# Feature extraction receives the sequences plus the tagged-event keys.
event_keys = list(tagged_events.keys())
extractor = FeatureExtractor(log_sequences, event_keys)
log_sequences = extractor.extract()

# Cluster the extracted sequences.
clustering = Clustering(log_sequences, tagged_events)
cluster_ids, cluster_values, silhouette = clustering.cluster()

# Report only the clusters whose abnormal-event count is non-zero.
# NOTE(review): the original indentation was lost; the blank-line print() is
# assumed to follow each reported cluster -- confirm.
for cluster_value in cluster_values:
    if cluster_value['num_possible_abnormal_events'] == 0:
        continue
    print(cluster_value)
    print()

# The silhouette coefficient is computed above but its report is disabled:
# print("The silhouette coefficient is =", silhouette)
# Cluster stored articles by keyword distance, write each article's cluster id
# back to the database, and extract topic keywords per cluster.

# Load at most 10,000 articles, projecting only the two fields used below.
articles = list(
    db.articles.find({}, {'_id': 1, 'keywords': 1}).limit(10000))

# NOTE(review): if the per-article keyword lists differ in length, recent
# NumPy releases refuse to build this array without dtype=object -- confirm
# the shape of the stored data.
article_keywords = np.array([article['keywords'] for article in articles])
article_object_ids = np.array([article['_id'] for article in articles])

# ArticleDistance receives a deep copy, so article_keywords itself is still
# intact when it is indexed per cluster further down.
a_dist = ArticleDistance(copy.deepcopy(article_keywords))
dists = a_dist.article_pdist()

# Zero the main diagonal before handing the matrix to squareform().
# NOTE(review): assumes `distance` is scipy.spatial.distance, whose
# squareform() turns a square distance matrix into condensed form -- confirm.
n = dists.shape[0]
dists[range(n), range(n)] = 0
d = distance.squareform(dists)

cluster = Clustering(d)
c = cluster.cluster()

extractor = TopicExtractor()

# Iterate over the labels that actually occur in `c` instead of assuming they
# are exactly 0..k-1: with 1-based labels a range(k) loop would match nothing
# for 0 and never reach the highest label. tolist() yields native Python
# values, which is what gets written as cluster_id.
for i in np.unique(c).tolist():
    indices = np.where(c == i)[0]

    # Ids and keyword lists of the articles assigned to this cluster.
    articles = list(article_object_ids[indices])
    whole_keywords = list(article_keywords[indices])

    db.articles.update_many({'_id': {'$in': articles}},
                            {'$set': {'cluster_id': i}})

    topic_words, cluster_word_counts = extractor.extract_topic_keywords(
        whole_keywords)
from regressor import Regressor regressor = Regressor(algorithm_name) y_predicted = regressor.predict(X_train, y_train, X_test) regressor_score = regressor.get_score(y_test, y_predicted) # Visualizing the results visualizer = Visualizer() visualizer.plot_classifier_regressor(y_test, y_predicted, method_identifier) print('The coefficient of determination is: ' + str(regressor_score)) print(algorithm_name) # ---------------------Clustering the data------------------------------------ elif method_identifier == 3: from clustering import Clustering clustering = Clustering(algorithm_name) n_clusters, inertia = clustering.tune_parameters(X_train) clusters = clustering.cluster(X_train, X_test, n_clusters) # Visualizing the results visualizer = Visualizer() visualizer.plot_clustering(X_test, clusters) print("The clustering model's inertia: " + str(inertia)) print(str(algorithm_name))