Example #1
0
from log_parser import LogParser
from feature_extractor import FeatureExtractor
from clustering import Clustering
from idf import IDF

# Pipeline: parse the log file, extract features from the resulting log
# sequences, cluster them, then report the clusters of interest.
parser = LogParser('../../data/HDFS_2K.log')
tagged_events, log_sequences = parser.parse()

# The extractor is given the sequences plus the keys of `tagged_events`;
# its output replaces `log_sequences`.
event_keys = list(tagged_events.keys())
extractor = FeatureExtractor(log_sequences, event_keys)
log_sequences = extractor.extract()

clustering = Clustering(log_sequences, tagged_events)
cluster_ids, cluster_values, silhouette = clustering.cluster()

# Print only the clusters whose 'num_possible_abnormal_events' is non-zero,
# each followed by a blank line.
for cluster_value in cluster_values:
    if cluster_value['num_possible_abnormal_events'] == 0:
        continue
    print(cluster_value, end='\n\n')

# Disabled debug output:
# print("The silhouette coefficient is =", silhouette)

    # Fetch at most 10,000 article documents, projecting only the `_id` and
    # `keywords` fields.
    articles = list(
        db.articles.find({}, {
            '_id': 1,
            'keywords': 1
        }).limit(10000))
    # Two parallel arrays built from the same list: position k in both refers
    # to the same article.
    # NOTE(review): if the per-article keyword lists differ in length this is
    # a ragged array; recent NumPy versions raise unless dtype=object is
    # given — confirm the data shape / NumPy version.
    article_keywords = np.array([article['keywords'] for article in articles])
    article_object_ids = np.array([article['_id'] for article in articles])

    # ArticleDistance receives a deep copy, so whatever it does with its
    # input cannot affect `article_keywords`, which is reused below.
    a_dist = ArticleDistance(copy.deepcopy(article_keywords))
    dists = a_dist.article_pdist()
    n = dists.shape[0]
    # Zero the (k, k) entries before calling squareform.
    # NOTE(review): presumably `dists` is a square pairwise-distance matrix
    # and `distance` is scipy.spatial.distance, whose squareform expects a
    # zero diagonal when given a square matrix — confirm.
    dists[range(n), range(n)] = 0
    d = distance.squareform(dists)

    # `c` is compared element-wise against a label below, so it is expected
    # to be an array of per-article cluster labels — TODO confirm against
    # Clustering.cluster().
    cluster = Clustering(d)
    c = cluster.cluster()
    extractor = TopicExtractor()

    # NOTE(review): iterating i over 0..(number of distinct labels - 1)
    # assumes the labels in `c` are exactly those integers; labels that start
    # at 1 or have gaps would be missed — confirm.
    for i in range(len(np.unique(c))):
        # Rebinds `articles` (previously the query result) to the ids of the
        # articles in cluster i.
        articles = []
        indices = np.where(c == i)[0]
        articles.extend(article_object_ids[indices])
        # Keyword entries of the same articles, selected with the same
        # indices.
        whole_keywords = []
        whole_keywords.extend(article_keywords[indices])
        # Store the cluster label on every article of this cluster.
        db.articles.update_many({'_id': {
            '$in': articles
        }}, {'$set': {
            'cluster_id': i
        }})
        # NOTE(review): the visible code never uses these two results; the
        # loop body appears to be cut off after this statement.
        topic_words, cluster_word_counts = extractor.extract_topic_keywords(
            whole_keywords)
Example #3
0
    # NOTE(review): this is the body of a branch whose `if`/`elif` header is
    # not visible here; presumably it handles the regression case — confirm.
    # The import is local to the branch, so it only executes when the branch
    # runs.
    from regressor import Regressor

    # `predict` is given the training data as well as the test features.
    # NOTE(review): presumably it fits and predicts in a single call —
    # confirm in regressor.py.
    regressor = Regressor(algorithm_name)
    y_predicted = regressor.predict(X_train, y_train, X_test)
    regressor_score = regressor.get_score(y_test, y_predicted)

    # Visualizing the results
    # `method_identifier` is forwarded to the plotting call along with the
    # true and predicted targets.
    visualizer = Visualizer()
    visualizer.plot_classifier_regressor(y_test, y_predicted,
                                         method_identifier)

    print('The coefficient of determination is: ' + str(regressor_score))
    print(algorithm_name)

# ---------------------Clustering the data------------------------------------
elif method_identifier == 3:

    # Imported inside the branch, so it only executes when this branch runs.
    from clustering import Clustering

    # tune_parameters sees only the training data and returns the cluster
    # count (passed on to `cluster`) together with the inertia printed below.
    clustering = Clustering(algorithm_name)
    n_clusters, inertia = clustering.tune_parameters(X_train)
    clusters = clustering.cluster(X_train, X_test, n_clusters)

    # Visualizing the results
    # NOTE(review): presumably `clusters` holds the assignments for X_test,
    # since the two are plotted together — confirm.
    visualizer = Visualizer()
    visualizer.plot_clustering(X_test, clusters)

    print("The clustering model's inertia: " + str(inertia))
    print(str(algorithm_name))