Example #1
import numpy as np
# pca() is provided by the surrounding project (cf. the
# mlscripts.ml.feature.pca import in Example #3)

def one_time_learning(data, n_clusters, visual_dimensions=2):
    (lambda_matrix, loc, W) = pca(data, dimensions=n_clusters)
    no_means_data = data - np.mean(data, 0)
    # cluster: assign each item to the principal direction
    # with the largest projection
    location = np.dot(no_means_data, W)
    value = np.max(location, 1)  # unused
    cluster_mapping = np.argmax(location, 1)
    W_subgroups = [None for i in range(n_clusters)]

    # run a separate PCA on every cluster for visualization
    for i in range(n_clusters):
        indices = (cluster_mapping == i)
        (_, _, W_sub) = pca(data[indices, :], visual_dimensions)
        W_subgroups[i] = W_sub.T
    return (W.T, W_subgroups)
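
The pca() helper used by these examples is not shown on this page; it comes from the surrounding project (Example #3 imports it from mlscripts.ml.feature.pca). As a rough sketch of the interface the code above relies on, a minimal SVD-based stand-in could look like the following. The return convention (singular values, projected data, projection matrix W with one column per component) is an assumption inferred from how the examples unpack the result, not the library's actual code.

import numpy as np

def pca(data, dimensions=2):
    # Hypothetical stand-in for the project's pca(); NOT the library code.
    data = np.asarray(data, dtype=float)
    centered = data - np.mean(data, 0)
    # economy-size SVD of the centered data: centered = U * diag(s) * Vt
    U, s, Vt = np.linalg.svd(centered, full_matrices=False)
    W = Vt[:dimensions].T        # (n_features, dimensions) projection matrix
    projected = centered.dot(W)  # coordinates in the principal subspace
    return s[:dimensions], projected, W

With such a stand-in, one_time_learning() runs on any dense numpy array, e.g. np.random.rand(100, 30).
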
Example #2
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# pca() and best_cluster() are provided by the surrounding project

def cluster_entries(entries, n_clusters=5):
    """entries is a list of texts.
    It is assumed that all entries belong to the same project and that only
    remaining (not yet clustered) entries are included.
    None is returned when no (significant) cluster could be found.
    Normally it returns:
      - a list of all the words (the vectorizer's feature names)
      - the indices of the words considered important
      - a boolean array indicating which texts belong to the cluster"""

    vectorizer = TfidfVectorizer(min_df=2, use_idf=True, smooth_idf=True)
    vecs = vectorizer.fit_transform(entries)

    tf_idf_matrix = np.asarray(vecs.todense())
    no_means_data = tf_idf_matrix - np.mean(tf_idf_matrix, 0)

    # use the library routine
    #(W, W_subgroups) = one_time_learning(no_means_data, n_clusters, 2)
    #(x, y, cluster_mapping_2) = project_items(no_means_data, W, W_subgroups)

    # do it by hand to see if the results are the same
    (singular_values, cluster_location,
     eigenvec) = pca(no_means_data, n_clusters)
    #(W, W_subgroups) = one_time_learning(no_means_data, n_clusters, n_visual_dimensions)
    value = np.max(cluster_location, 1)  # unused
    cluster_location = np.dot(eigenvec.T, no_means_data.T).T
    # map items to their corresponding cluster
    cluster_mapping = np.argmax(abs(cluster_location), 1)

    # All clusters are probably highly significant; check it nevertheless.
    # The Mann-Whitney-U test sometimes fails (returns NaN) for unknown
    # reasons (a scaling factor becomes negative), so the t-test is used instead.
    means, mannwhitney, ttest, global_mean = best_cluster(
        cluster_mapping, vecs)
    best_cluster_id = np.argsort(means)[0]
    if ttest[best_cluster_id] > 1E-3:
        # Not a significant result. Other clusters could be more significant,
        # but that is highly unlikely, so we ignore them and return None.
        return None
    """print "global mean: %f" % global_mean
    print "means %s" % str(means)
    print "mean delta %s" % str(global_mean - means)
    print "significance mann-withney-u-test: %s" % str(mannwhitney)
    print "significance t-test: %s" % str(ttest) """

    features = vectorizer.get_feature_names()
    n_words = 20  # number of important words to find

    # find the most important words for the best cluster
    important_words = np.argsort(np.abs(eigenvec.T), 1)[best_cluster_id,
                                                        -n_words:]
    return features, important_words, (best_cluster_id == cluster_mapping)
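
best_cluster() is also defined elsewhere in the project and not shown here. Going by the comments above (per-cluster means plus Mann-Whitney-U and t-test significance against the rest of the documents), a hypothetical sketch might look like the following; the statistic compared (per-document tf-idf mass) and the return order are assumptions, not the original implementation.

import numpy as np
from scipy.stats import mannwhitneyu, ttest_ind

def best_cluster(cluster_mapping, vecs):
    # Hypothetical sketch, NOT the project's implementation.
    # Compare each cluster's per-document tf-idf mass against the rest.
    totals = np.asarray(vecs.sum(axis=1)).ravel()
    global_mean = totals.mean()
    means, mw_p, tt_p = [], [], []
    for cluster_id in range(cluster_mapping.max() + 1):
        inside = totals[cluster_mapping == cluster_id]
        outside = totals[cluster_mapping != cluster_id]
        means.append(inside.mean() if inside.size else global_mean)
        if inside.size >= 2 and outside.size >= 2:
            mw_p.append(mannwhitneyu(inside, outside).pvalue)
            tt_p.append(ttest_ind(inside, outside).pvalue)
        else:
            # too few samples for a meaningful test
            mw_p.append(float('nan'))
            tt_p.append(float('nan'))
    return np.array(means), np.array(mw_p), np.array(tt_p), global_mean

The arrays are indexed by cluster id, matching how cluster_entries() looks up ttest[best_cluster_id].
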
Example #3

# released under the BSD license
# see LICENSE file or http://www.opensource.org/licenses/bsd-license.php for details
# Institute of Applied Simulation (ZHAW)
# Author: Timo Jeranko

from mlscripts.ml.som.file_operations import *
from mlscripts.ml.som.functions import *
import pickle
import numpy as np
import scipy
from mlscripts.ml.som.visualize import *
from mlscripts.ml.feature.pca import *
import scipy.sparse as sparse

# read file
matrix = read_file2("input.dat")
matrix = sparse.csc_matrix(matrix)

# project the data onto its first two principal components
transform = pca(matrix, 2)
transform = np.array(transform)

def printmatrix(matrix):
    # print one row per line, values separated by " ; "
    for row in matrix:
        last = len(row) - 1
        for i, x in enumerate(row):
            print(x, end=' ')
            if i != last:
                print(";", end=' ')
        print("")

printmatrix(transform)
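
read_file2() comes from mlscripts.ml.som.file_operations and is not shown on this page. Assuming input.dat holds a plain whitespace-separated numeric matrix, the input step could be reproduced without the library as in the sketch below; the file format is an assumption.

import numpy as np
import scipy.sparse as sparse

# hypothetical stand-in for read_file2("input.dat"); assumes a
# whitespace-separated numeric matrix, one row per line
matrix = np.loadtxt("input.dat")
matrix = sparse.csc_matrix(matrix)
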