f_vectors = []
train_f_vectors = []
test_f_vectors = []
train_f_hist = []
test_f_hist = []
images = []

# Elbow method showed an elbow at 400-500 clusters
N_cluster = 500
# These weights were chosen by validation (details are in the report); a sketch
# of how they can be combined is shown after this example
SIFT_WEIGHT = 0.90
HIST_WEIGHT = 0.10

# MiniBatchKMeans is used as a trade-off between accuracy and performance
ms = MiniBatchKMeans(n_clusters=N_cluster,
                     max_no_improvement=3,
                     batch_size=20000)
ms.fit(SIFT_data)
sumimages = 0
del SIFT_data

#*************************************Read Every training Image*********************************
list1 = os.listdir(train_folder)
for filename in list1:
    try:
        img = cv2.imread(train_folder + filename, 0)
        sumimages += 1
        if img is not None:
            sft = cv2.SIFT_create(300)  # up to 300 keypoints per image
            kp, ds = sft.detectAndCompute(img, None)
            if ds is not None and len(ds) > 0:
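
The snippet breaks off here. A rough sketch of how the per-image bag-of-visual-words vector and an intensity histogram could be combined with the SIFT_WEIGHT / HIST_WEIGHT values defined above (the helper name, the histogram argument, and the normalization are assumptions, not the original code):

import numpy as np

def combined_feature(ds, intensity_hist):
    # Bag of visual words: assign each SIFT descriptor to its nearest of the
    # N_cluster centres learned above and count occurrences per visual word.
    words = ms.predict(ds)
    bow = np.bincount(words, minlength=N_cluster).astype(float)
    bow /= max(bow.sum(), 1.0)
    # Normalized grey-level histogram of the same image.
    intensity_hist = np.asarray(intensity_hist, dtype=float)
    intensity_hist /= max(intensity_hist.sum(), 1.0)
    # Weighted concatenation using the validated weights.
    return np.concatenate([SIFT_WEIGHT * bow, HIST_WEIGHT * intensity_hist])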
Example #2
def test_minibatch_tol():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                 batch_size=10,
                                 random_state=42,
                                 tol=.01).fit(X)
    _check_fitted_model(mb_k_means)
Example #3

# ## **K-Means Clustering**
#
# The K-means clustering objective is to minimize the average squared Euclidean distance of each document / description from its cluster centroid; a quick numeric check of this follows the clustering cell below.

# In[ ]:

from sklearn.cluster import MiniBatchKMeans

num_clusters = 30  # needs to be chosen carefully
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters,
                               init='k-means++',
                               n_init=1,
                               init_size=1000,
                               batch_size=1000,
                               verbose=0,
                               max_iter=1000)

# In[ ]:

kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)
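
Since the objective above is just the (average) squared distance of every sample to its nearest centroid, the fitted model can be sanity-checked against `inertia_` (a quick check, assuming `vz` is the matrix the model was fitted on):

import numpy as np

# inertia_ is the sum of squared distances of each sample to its closest
# centroid, i.e. the quantity the objective above minimizes (up to 1/n).
closest_sq_dist = np.min(kmeans_distances, axis=1) ** 2
print(closest_sq_dist.sum(), kmeans.inertia_)  # these should roughly agree
print(closest_sq_dist.mean())                  # average squared distance per document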

# In[ ]:

sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

# #############################################################################
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k,
                         init='k-means++',
                         n_init=1,
                         init_size=1000,
                         batch_size=1000,
                         verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k,
                init='k-means++',
                max_iter=100,
                n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()
Example #5
def test_mb_k_means_plus_plus_init_sparse_matrix():
    mb_k_means = MiniBatchKMeans(init="k-means++",
                                 n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X_csr)
    _check_fitted_model(mb_k_means)
Example #6
def test_minibatch_k_means_perfect_init_dense_array():
    mb_k_means = MiniBatchKMeans(init=centers.copy(),
                                 n_clusters=n_clusters,
                                 random_state=42).fit(X)
    _check_fitted_model(mb_k_means)
Example #7
def test_minibatch_default_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(),
                                 n_clusters=n_clusters,
                                 random_state=42).fit(X)
    assert_equal(mb_k_means.init_size, 3 * mb_k_means.batch_size)
    _check_fitted_model(mb_k_means)
Example #8
#centers = [[50, 100], [100, 200], [50, 50],[150, 150]]
centers = np.random.randint(size=(k, 2), low=20, high=200)

n_clusters = len(centers)

k_means = KMeans(init='k-means++', n_clusters=n_clusters, verbose=True)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

mbk = MiniBatchKMeans(init='k-means++',
                      n_clusters=n_clusters,
                      batch_size=batch_size,
                      verbose=True)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

# Plot result

fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
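
The snippet is cut off here; in the scikit-learn comparison this pairing is usually done with `pairwise_distances_argmin`, along these lines (a sketch of the typical continuation, not necessarily this author's exact code):

from sklearn.metrics.pairwise import pairwise_distances_argmin

# Match each KMeans centre to the closest MiniBatchKMeans centre so the two
# plots use the same color for corresponding clusters.
order = pairwise_distances_argmin(k_means_cluster_centers,
                                  mbk_means_cluster_centers)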
Example #9
def run(argv):
    dir_path = argv[1]
    outfile = argv[2]

    logging.basicConfig(format='%(levelname)s : %(message)s',
                        level=logging.INFO)

    print('Loading title_StackOverflow.txt ... ')
    sentences = []
    with open(dir_path + 'title_StackOverflow.txt', 'r') as f:
        for line in f:
            line = line.translate(replace_punctuation)
            line = line.decode('utf-8').encode('ascii',
                                               'ignore').lower().split()
            words = [word for word in line if word not in stoplist]
            sentences.append(' '.join(words))

    print(
        "Extracting features from the training dataset using a sparse vectorizer"
    )
    vectorizer = TfidfVectorizer(
        max_df=0.5,
        min_df=2,
        #stop_words=stoplist,
        use_idf=True,
        sublinear_tf=True)
    X = vectorizer.fit_transform(sentences)

    print("n_samples: %d, n_features: %d" % X.shape)

    print("Performing dimensionality reduction using LSA")
    svd = TruncatedSVD(20)  #TODO
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print("KMeans Clustering ... ")
    MINI = True
    true_k = 35  #TODO
    if MINI:
        km = MiniBatchKMeans(n_clusters=true_k,
                             init='k-means++',
                             n_init=1,
                             init_size=1000,
                             batch_size=1000,
                             verbose=1)
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=100,
                    n_init=1,
                    verbose=1)
    km.fit(X)

    print("Top terms per cluster:")
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

    labels = km.labels_

    predictions = np.zeros(5000000)
    with open(dir_path + 'check_index.csv') as f:
        incsv = csv.reader(f)
        next(incsv)
        for row in incsv:
            if labels[int(row[1])] == labels[int(row[2])]:
                predictions[int(row[0])] = 1

    with open(outfile, 'wb') as f:
        f.write('ID,Ans\n')
        for i in xrange(predictions.shape[0]):
            f.write('%d,%d\n' % (i, predictions[i]))
Example #10
               decomposition.MiniBatchSparsePCA(n_components=n_components,
                                                alpha=0.8,
                                                n_iter=100,
                                                chunk_size=3,
                                                random_state=rng), True),
              ('MiniBatchDictionaryLearning',
               decomposition.MiniBatchDictionaryLearning(n_atoms=15,
                                                         alpha=0.1,
                                                         n_iter=50,
                                                         chunk_size=3,
                                                         random_state=rng),
               True),
              ('Cluster centers - MiniBatchKMeans',
               MiniBatchKMeans(n_clusters=n_components,
                               tol=1e-3,
                               batch_size=20,
                               max_iter=50,
                               random_state=rng), True)]

###############################################################################
# Plot a sample of the input data

plot_gallery("First centered Olivetti faces", faces_centered[:n_components])

###############################################################################
# Do the estimation and plot it

for name, estimator, center in estimators:
    print "Extracting the top %d %s..." % (n_components, name)
    t0 = time()
    data = faces
Example #11
    # Measure runtime
    start_time = time.time()
    n_colors = int(sys.argv[1])

    # Reads the image in BGR format
    img = cv2.imread(pic_path)
    # BGR->RGB
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Normalize the colors so that they are between 0&1 and reshape the data for a typical
    # scikit-learn input
    X = (img / 255.0).reshape(-1, 3)

    # KMeans model
    model = MiniBatchKMeans(n_clusters=n_colors)
    classf = model.fit_predict(X)

    # The new colors are the centers of the clusters
    # Basically we create a new image where the true input color is replaced by the color of
    # the closest cluster
    new_colors = model.cluster_centers_
    new_image = new_colors[classf].reshape(img.shape)
    new_image = (new_image * 255).astype(np.uint8)

    # Save the new image
    plot = plt.imshow(new_image)

    # Remove axes and whitespace surrounding the image
    plt.axis('off')
    plt.savefig("compressed_" + pic_path.split("/")[-1], bbox_inches=0)
Example #12
        if birch_model.n_clusters is None:
            ax.scatter(this_centroid[0],
                       this_centroid[1],
                       marker='+',
                       c='k',
                       s=25)
    ax.set_ylim([-25, 25])
    ax.set_xlim([-25, 25])
    ax.set_autoscaley_on(False)
    ax.set_title('Birch %s' % info)

# Compute clustering with MiniBatchKMeans.
mbk = MiniBatchKMeans(init='k-means++',
                      n_clusters=100,
                      batch_size=100,
                      n_init=10,
                      max_no_improvement=10,
                      verbose=0,
                      random_state=0)
t0 = time()
mbk.fit(X)
t_mini_batch = time() - t0
print("Time taken to run MiniBatchKMeans %0.2f seconds" % t_mini_batch)
mbk_means_labels_unique = np.unique(mbk.labels_)

ax = fig.add_subplot(1, 3, 3)
for this_centroid, k, col in zip(mbk.cluster_centers_, range(n_clusters),
                                 colors_):
    mask = mbk.labels_ == k
    ax.scatter(X[mask, 0],
               X[mask, 1],
Example #13
print 'done in', time.time() - t, 'seconds'

dm.save_pipeline_result(features_rotated, 'featuresRotated', 'npy')


try:
    centroids = dm.load_pipeline_result('centroids', 'npy', section=100)
    n_texton = len(centroids)
except:
    t = time.time()
    print 'quantize feature vectors ...',

    n_texton = 100

    from sklearn.cluster import MiniBatchKMeans
    kmeans = MiniBatchKMeans(n_clusters=n_texton, batch_size=1000)
    kmeans.fit(features_rotated[::10])
    centroids = kmeans.cluster_centers_

    cluster_assignments = fclusterdata(centroids, 1.15, method="complete", criterion="inconsistent")

    centroids = np.array([centroids[cluster_assignments == i].mean(axis=0) for i in set(cluster_assignments)])

    n_texton = len(centroids)
    print n_texton, 'reduced textons'

    print 'done in', time.time() - t, 'seconds'

    del kmeans

Example #14
from gensim.models import doc2vec, word2vec
from collections import namedtuple
from sklearn.cluster import MiniBatchKMeans
import pandas as pd
import numpy as np

train = pd.read_csv("data/train.csv")
train_data = np.append(train.Context.values, train.Utterance.values)

taggedMessage = namedtuple('TaggedMessage', 'words tags')
documents = []

# Preprocess messages
for i, message in enumerate(train_data):
    # Split into lists of words
    words = message.split()
    tags = [i]
    documents.append(taggedMessage(words, tags))

d2v2 = doc2vec.Doc2Vec(documents, size=200, workers=4, iter=20)

X = []
for v in d2v2.docvecs:
    X.append(v)

X = np.array(X)

# Cluster messages using k-means
kmeans = MiniBatchKMeans().fit(X)
print('successfully clustered')
Example #15
def cluster_2_vec(
    dataset_df,
    id_df,
    downsample_ratio,
    model_name,
    k_means_method='k-means++',
    num_cluster=20,
):
    """
    :param dataset_df:
    :param id_df:
    :param k_means_method: k-means++ or random
    :param num_cluster:
    :return:
    """
    from sklearn.cluster import KMeans, MiniBatchKMeans

    dataset_x = np.asarray(dataset_df.ix[:, 1:len(dataset_df.columns) -
                                         1].as_matrix(),
                           dtype='float32')
    dataset_x = normalize(dataset_x, axis=1)

    print '...... kmeans ......'
    mbk = MiniBatchKMeans(init=k_means_method,
                          n_clusters=num_cluster,
                          batch_size=1000,
                          n_init=10,
                          max_no_improvement=10,
                          verbose=0)

    mbk.fit(dataset_x)

    dataset_df['cluster'] = mbk.labels_

    print '...... to feature vectors ......'

    train_person_ids = set(id_df['个人编码'])

    feature_vecs = []
    feature_labels = []

    for person_id in tqdm(train_person_ids):
        person_pd = dataset_df[dataset_df['个人编码'] == person_id]
        person_vec = np.zeros(num_cluster)
        person_label = np.asarray(person_pd.iloc[0][len(person_pd.columns) -
                                                    2],
                                  dtype='float32')

        # down sampling
        if int(person_label) == 0:
            if np.random.rand() <= downsample_ratio:
                for c in range(num_cluster):
                    person_vec[c] = np.sum(person_pd['cluster'] == c)

                feature_vecs.append(person_vec)
                feature_labels.append(_one_hot_encoder(int(person_label)))
        else:
            for c in range(num_cluster):
                person_vec[c] = np.sum(person_pd['cluster'] == c)

            feature_vecs.append(person_vec)
            feature_labels.append(_one_hot_encoder(int(person_label)))

    _shuffle_data(feature_vecs, feature_labels)

    feature_vecs = np.asarray(feature_vecs, dtype='float32')
    feature_labels = np.asarray(feature_labels, dtype='float32')

    print 'positive sample:%d, negative sample:%d' % (np.sum(
        feature_labels[:, 1] == 1), np.sum(feature_labels[:, 0] == 1))
    return feature_vecs, feature_labels, mbk
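
A hypothetical call of the function above (the dataframes and the downsample ratio are placeholders; the real column layout, including the '个人编码' person-ID column, comes from the author's dataset):

feature_vecs, feature_labels, mbk_model = cluster_2_vec(
    dataset_df=claims_df,        # placeholder dataframe in the expected layout
    id_df=person_id_df,          # placeholder id dataframe
    downsample_ratio=0.3,        # assumed value
    model_name='mbk_demo',       # not used inside the function as shown
    num_cluster=20,
)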
Example #16
def qmrf_regions(data,
                 edges,
                 nbow=20,
                 lamda=1,
                 sampling='random',
                 nsamples=10000,
                 label_potential='l1',
                 unary_sq=True,
                 online=True,
                 gamma=None,
                 max_iter=5,
                 truncated=False,
                 rng=42,
                 verbose=True,
                 return_centers=False,
                 return_edge_costs=True):
    with Timer('Colors'):
        if nbow == 'birch':
            clf = Birch(threshold=0.8, branching_factor=100)
        elif online:
            clf = MiniBatchKMeans(n_clusters=nbow,
                                  verbose=verbose,
                                  random_state=rng,
                                  batch_size=100,
                                  max_iter=100,
                                  max_no_improvement=10)
        else:
            clf = KMeans(n_clusters=nbow, verbose=verbose, random_state=rng)

        if nsamples is None:
            dist = clf.fit_transform(data)
        else:
            if sampling == 'random':
                idx = np.random.choice(data.shape[0], nsamples, replace=False)
            else:
                n = np.sqrt(nsamples)
                ratio = image.shape[0] / float(image.shape[1])
                ny = int(n * ratio)
                nx = int(n / ratio)
                y = np.linspace(0, image.shape[0], ny,
                                endpoint=False) + (image.shape[0] // ny // 2)
                x = np.linspace(0, image.shape[1], nx,
                                endpoint=False) + (image.shape[1] // nx // 2)
                xx, yy = np.meshgrid(x, y)
                idx = np.round(yy * image.shape[1] + xx).astype(int).flatten()
            clf.fit(data[idx])
            dist = clf.transform(data)

        if nbow == 'birch':
            centers = clf.subcluster_centers_
        else:
            centers = clf.cluster_centers_

    with Timer('Unary'):
        K = centers.shape[0]

        if label_potential == 'color':
            unary_cost = np.zeros((data.shape[0], centers.shape[0]),
                                  np.float32)
            for i in range(centers.shape[0]):
                unary_cost[:, i] = colordiff(data, centers[i:i + 1])
        else:
            unary_cost = dist.astype(np.float32)

        if unary_sq:
            unary_cost **= 2

    with Timer('Pairwise'):
        if label_potential == 'l1':
            label_cost = np.abs(centers[:, None, :] -
                                centers[None, ...]).sum(-1)
        elif label_potential == 'l2':
            label_cost = np.sqrt(
                ((centers[:, None, :] - centers[None, ...])**2).sum(-1))
        elif label_potential == 'potts':
            label_cost = np.ones((K, K), int) - np.eye(K, dtype=int)
        elif label_potential == 'color':
            label_cost = np.zeros((centers.shape[0], centers.shape[0]),
                                  np.float32)
            for i in range(centers.shape[0]):
                label_cost[:, i] = colordiff(centers, centers[i:i + 1])
        if truncated:
            label_cost = np.maximum(1, label_cost)
        label_cost = (label_cost * lamda).astype(np.float32)

    if verbose:
        print("=================")
        print("Minimizing graph:")
        print("Nodes: %d, edges: %d, labels: %d" % \
              (unary_cost.shape[0], edges.shape[0], label_cost.shape[0]))
        print("UnarySq: %s, LabelPotential: %s, EdgeCost: %s" % \
              (unary_sq, label_potential, (gamma is not None)))
        print("#################")

    with Timer('Edge Cost'):
        diff = ((data[edges[:, 0]] - data[edges[:, 1]])**2).sum(axis=1)
        if gamma is not None and type(gamma) in [int, float]:
            edge_costs = np.exp(-gamma * diff).astype(np.float32)
        elif gamma == 'auto':
            edge_costs = np.exp(-diff.mean() * diff).astype(np.float32)
        elif gamma == 'color':
            edge_costs = 1. / (1. +
                               colordiff(data[edges[:, 0]], data[edges[:, 1]]))
            edge_costs = edge_costs.astype(np.float32)
        else:
            edge_costs = np.ones(edges.shape[0], dtype=np.float32)

    with Timer('Minimize'):
        if label_cost.shape[0] == 2:
            labels = solve_binary(edges, unary_cost, edge_costs, label_cost)
        else:
            labels = solve_aexpansion(edges, unary_cost, edge_costs,
                                      label_cost)

    if return_centers:
        return labels, label_cost, centers

    return labels, label_cost
Example #17
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html
from sklearn.cluster import MiniBatchKMeans
#kmeans = MiniBatchKMeans(n_clusters=2,
#                         random_state=0,
#                         batch_size=100,
#                         max_iter=10).fit(X)
#print(kmeans.cluster_centers_)
#print(kmeans.labels_)
#print(kmeans.inertia_)

filename = "mm.data"
x_mm = np.memmap(filename, dtype='float32', mode='write', shape=x_train.shape)
x_mm[:] = x_train

minibatch_kmeans = MiniBatchKMeans(n_clusters=10, batch_size=10, random_state=42, verbose=2)
minibatch_kmeans.fit(x_mm)

#n_components = 2
#ipca = IncrementalPCA(n_components=n_components, batch_size=10)
#x_train_ipca = ipca.fit_transform(x_train)
#
#pca = PCA(n_components=n_components)
#x_train_pca = pca.fit_transform(x_train)
#
#colors = ['navy', 'turquoise', 'darkorange']
#
#for x_train_transformed, title in [(x_train_ipca, "Incremental PCA"), (x_train_pca, "PCA")]:
#    plt.figure(figsize=(8, 8))
#    for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
#        plt.scatter(x_train_transformed[y == i, 0], x_train_transformed[y == i, 1],
Example #18
    )
    
    
    #plot_PCA(anomaly[numerical_features], anomaly.KMcluster, chunk_labels)
    
    #print (total_FW)
    
    return (outputDF)
    
    


# In[46]:


kmodel = MiniBatchKMeans(n_clusters=4, random_state=10, init='k-means++')

result = run_train(
    total,
    eps_=1,
    min_samples_=4,
    E=3.0, 
    kmodel=kmodel,
    chunk_size=500,
    tau1=0.5,
    tau2=1.0,
    rd_threshold=0.5,
    alpha=0.5,
    fw=True                         
)
Example #19
def test_sparse_mb_k_means_callable_init():
    def test_init(X, k, random_state):
        return centers

    mb_k_means = MiniBatchKMeans(init=test_init, random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
Example #20
        # Create a Zeek log reader
        print('Opening Data File: {:s}'.format(args.zeek_log))
        reader = zeek_log_reader.ZeekLogReader(args.zeek_log, tail=True)

        # Create a Zeek IDS log live simulator
        print('Opening Data File: {:s}'.format(args.zeek_log))
        reader = live_simulator.LiveSimulator(args.zeek_log,
                                              eps=10)  # 10 events per second

        # Create a Dataframe Cache
        df_cache = dataframe_cache.DataFrameCache(
            max_cache_time=600)  # 10 minute cache

        # Streaming Clustering Class
        batch_kmeans = MiniBatchKMeans(n_clusters=5, verbose=True)

        # Use the ZeekThon DataframeToMatrix class
        to_matrix = dataframe_to_matrix.DataFrameToMatrix()

        # Add each new row into the cache
        time_delta = 10
        timer = time.time() + time_delta
        FIRST_TIME = True
        for row in reader.readrows():
            df_cache.add_row(row)

            # Every time_delta seconds, grab the dataframe from the cache
            if time.time() > timer:
                timer = time.time() + time_delta
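
The loop is truncated here; the streaming model above would normally be updated batch-by-batch with `partial_fit`. A minimal, self-contained sketch of that incremental update (the random matrix is a stand-in for the vectorized Zeek rows, not part of the original pipeline):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

batch_kmeans = MiniBatchKMeans(n_clusters=5)
for _ in range(10):
    batch_matrix = np.random.rand(200, 8)   # placeholder for the vectorized rows
    batch_kmeans.partial_fit(batch_matrix)  # update centroids incrementally
print(batch_kmeans.cluster_centers_.shape)  # (5, 8)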
Example #21
def main():
    im = Image.open("a.jpg")
    pix = im.load()
    print im.size

    w = im.size[0]
    h = im.size[1]

    # Flatten the image into a list of pixel values
    lst = []
    k = 0
    for i in range(0, w):
        for j in range(0, h):
            lst.append(pix[i, j])
            k = k + 1
            #print pix[i,j]," "
        #print "\n"

    print lst

    n = len(lst)
    #print len(lst)
    #d = {}

    # One unit weight per pixel
    arr = np.ones(n)
    arr_list = np.array(arr).tolist()
    print arr_list

    #pixels = np.array(pix).tolist()
    #print pixels

    # Group the per-pixel weights by pixel value
    temp = collections.defaultdict(list)
    for p, ele in zip(lst, arr_list):
        temp[p].append(ele)

    print temp

    # Sum the weights for each distinct pixel value
    d = {}
    s = 0
    for key, value in temp.iteritems():
        s = add_value(value)
        d[key] = s

    print d

    d_list = []
    for key, value in d.iteritems():
        tmp = [key, value]
        d_list.append(tmp)

    d_pixel_list = []
    for key in d.iteritems():
        temp2 = [key]
        d_pixel_list.append(temp2)

    b = np.asarray(d_pixel_list)
    c = np.asarray(lst)

    #print 'd1_list'
    #print d1_list

    a = np.asarray(d_list)
    #print 'array'

    print 'a'
    print a

    print 'b'
    print b

    print 'c'
    print c
    #np.savetxt("C:\Users\Ashish Yadav\Desktop\ALL\ML\ML_Project\File.txt",a,delimiter=",",fmt='%s');

    # Cluster the raw pixel values into 10 groups
    cluster = MiniBatchKMeans(n_clusters=10)
    y = cluster.fit_predict(c)

    print 'c_new'
    print c

    print 'y'
    print y

    #print 'cluster'
    #print cluster

    np.savetxt("C:\Users\Ashish Yadav\Desktop\ALL\ML\ML_Project\Cluster.txt", y, delimiter=",", fmt='%s')
Example #22
 def one_kmeans(self):
     self.kmeans = MiniBatchKMeans(n_clusters=self.n_clusters,
                                   batch_size=self.batch_size,
                                   verbose=True).fit(self.data)
Example #23
def test_mb_k_means_plus_plus_init_dense_array():
    mb_k_means = MiniBatchKMeans(init="k-means++",
                                 n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X)
    _check_fitted_model(mb_k_means)
Example #24
    print path

    for video in os.listdir(path):
        if video == '.DS_Store':
            continue

        print label + ": " + video

        videoPath = path + "/" + video
        videoFeatures = util.readVideoData(videoPath, 100)
        #stackOfAllFeatures.append(videoFeatures)
        print "VideoFeatures array dims:" + str(
            videoFeatures.shape)  #Steve add: dimensions of array
        print "StackOfAll array dims: " + str(
            stackOfAllFeatures.shape)  #Steve add: dimensions of array
        stackOfAllFeatures = np.vstack((stackOfAllFeatures, videoFeatures))

temp = stackOfAllFeatures[1:]
# Perform K-Means: 2500 centroids
# kmeans = MiniBatchKMeans(init="k-means++", n_clusters=2500, n_init=10, verbose=0)
kmeans = MiniBatchKMeans(init="k-means++",
                         n_clusters=2500,
                         n_init=10,
                         verbose=1)  # Steve: set it verbose
kmeans.fit(temp)
vocabulary = kmeans.cluster_centers_

# Save vocabulary (binary mode, and close the file once the dump is done)
with open("Data/voc.pkl", "wb") as f:
    pickle.dump(vocabulary, f)
Example #25
def test_minibatch_k_means_perfect_init_sparse_csr():
    mb_k_means = MiniBatchKMeans(init=centers.copy(),
                                 n_clusters=n_clusters,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
Example #26
        exit(1)

    mfcc_csv_file = sys.argv[1]
    output_folder = sys.argv[2]

    # Set a fixed random seed so we can reproduce the results
    np.random.seed(11775)
    print "Loading MFCC CSV file"
    data = np.loadtxt(mfcc_csv_file, delimiter=";")
    print "data.shape: " + str(data.shape)

    for cluster_num in range(50, 501, 50):
        print "cluster_num: " + str(cluster_num)
        # kmeans = KMeans(n_clusters=cluster_num, init="k-means++", n_init=10)
        kmeans = MiniBatchKMeans(n_clusters=cluster_num,
                                 batch_size=10000,
                                 init="k-means++",
                                 n_init=3)
        kmeans.fit(data)
        # print kmeans
        print "K-means inertia: " + str(kmeans.inertia_) + "\n"
        # print "k_means.cluster_centers_.shape: " + str(kmeans.cluster_centers_.shape)

        # Output the K-means model
        output_filename = output_folder + "/kmeans." + str(
            cluster_num) + ".model"
        output_file = open(output_filename, "wb")
        cPickle.dump(kmeans, output_file)
        output_file.close()

    print "K-means model trained and output successfully!"
Example #27
def test_mini_batch_k_means_invalid_init():
    km = MiniBatchKMeans(init="invalid", n_init=1, n_clusters=n_clusters)
    assert_raises(ValueError, km.fit, X)
Example #28
# This simple script trains a KMeans model on a collection of csv files that should contain, row by row,
# the vectors corresponding to the SIFT features extracted from an image. The model is then used to find
# the "k" most representative features, which are used to categorize every feature extracted from a given
# image. This in turn allows the creation of a histogram where the relative occurrence of each of these
# features is represented by the value of each bin; a sketch of that histogram step follows the script.

import dask.dataframe as dd
import joblib
from sklearn.cluster import MiniBatchKMeans

data_path = 'data'

# Number of clusters, thus representing the number of visual words that are of interest.
k = 500

model_name = 'new_KMeans_' + str(k) + '.pkl'

print(model_name)

# Use dask to load all the csv files lazily, and thus avoid overloading the memory.
df = dd.read_csv(data_path + '/*/*.csv')

# Create and train the model, using dask's lazy loading of data.
model = MiniBatchKMeans(n_clusters=k, verbose=True)
model.fit(df.values)

# Export the trained model, so it can be used in a later stage.
print("exporting")
joblib.dump(model, model_name)
print("done")
Example #29
    # print(labels)
    # hueValuePlot.plot_hue_and_value(list_of_hue, list_of_value, list_of_rgb, list_of_saturation, filename, labels)
    #
    f = []
    for (dirpath, dirnames, filenames) in walk('images'):
        f.extend(filenames)
        break

    counter = 0
    for each in f:
        # name = each
        print("processing image", counter)
        counter += 1

        filename = each
        img = Image.open('images/'+filename).convert('RGB')
        c_rgb = img.getcolors()
        c_rgb = hueValuePlot.create_dict_of_color_list(c_rgb)
        img = img.convert('HSV')
        c = img.getcolors()
        list_of_hue, list_of_value, list_of_rgb, list_of_saturation = hueValuePlot.get_hue_value_from_list(c, c_rgb)
        h_list = np.array(list_of_hue)
        s_list = np.array(list_of_saturation)
        v_list = np.array(list_of_value)
        training_array = np.dstack((h_list, s_list, v_list))
        training_array = training_array.reshape(-1, 3)
        # print(training_array.shape, training_array)
        clt = MiniBatchKMeans(n_clusters=5)
        labels = clt.fit_predict(training_array)
        # print(labels)
        hueValuePlot.plot_hue_and_value(list_of_hue, list_of_value, list_of_rgb, list_of_saturation, filename, labels)
Example #30
def run(train_pyramid_descriptors, D, test_pyramid_descriptors,
        feat_des_options):

    train_images_filenames = cPickle.load(
        open('train_images_filenames.dat', 'rb'))
    test_images_filenames = cPickle.load(
        open('test_images_filenames.dat', 'rb'))
    train_labels = cPickle.load(open('train_labels.dat', 'rb'))
    test_labels = cPickle.load(open('test_labels.dat', 'rb'))

    k = feat_des_options['k']
    codebook = MiniBatchKMeans(n_clusters=k,
                               verbose=False,
                               batch_size=k * 20,
                               compute_labels=False,
                               reassignment_ratio=10**-4,
                               random_state=42)
    codebook.fit(D)

    visual_words_pyramid = np.zeros((len(train_pyramid_descriptors),
                                     k * len(train_pyramid_descriptors[0])),
                                    dtype=np.float32)
    for i in range(len(train_pyramid_descriptors)):
        visual_words_pyramid[i, :] = spatial_pyramid_histograms(
            train_pyramid_descriptors[i], codebook, k)

    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knn.fit(visual_words_pyramid, train_labels)

    # logreg = LogisticRegression(random_state=0,max_iter=300).fit(visual_words_pyramid, train_labels)
    # scores = cross_validate(logreg, visual_words_pyramid, train_labels,scoring = ['precision_macro', 'recall_macro','f1_macro'], cv=5,return_estimator=True)

    scores = cross_validate(
        knn,
        visual_words_pyramid,
        train_labels,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        cv=8,
        return_estimator=True)
    cross_val_accuracy = scores['test_accuracy'].mean()
    cross_val_precision = scores['test_precision_macro'].mean()
    cross_val_recall = scores['test_recall_macro'].mean()
    cross_val_f1 = scores['test_f1_macro'].mean()
    # print("%0.2f precision with a std dev of %0.2f" % (cross_val_precision, scores['test_precision_macro'].std()))
    # print("%0.2f recall with a std dev of %0.2f" % (cross_val_recall, scores['test_recall_macro'].std()))
    # print("%0.2f F1-score with a std dev of %0.2f" % (cross_val_f1, scores['test_f1_macro'].std()))

    visual_words_test = np.zeros(
        (len(test_images_filenames), visual_words_pyramid.shape[1]),
        dtype=np.float32)
    for i in range(len(test_images_filenames)):
        visual_words_test[i, :] = spatial_pyramid_histograms(
            test_pyramid_descriptors[i], codebook, k)

    test_accuracy = 100 * knn.score(visual_words_test, test_labels)
    # print("Test accuracy: %0.2f" % (test_accuracy))

    test_prediction = knn.predict(visual_words_test)
    # test_prediction = logreg.predict(visual_words_test)
    test_precision, test_recall, test_fscore, _ = precision_recall_fscore_support(
        test_labels, test_prediction, average='macro')
    # print("%0.2f precision" % (test_precision))
    # print("%0.2f recall" % (test_recall))
    # print("%0.2f F1-score" % (test_fscore))

    # pca = PCA(n_components=64)
    pca = PCA(n_components=feat_des_options['pca_perc'], svd_solver='full')
    VWpca = pca.fit_transform(visual_words_pyramid)
    knnpca = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knnpca.fit(VWpca, train_labels)
    vwtestpca = pca.transform(visual_words_test)
    pca_test_accuracy = 100 * knnpca.score(vwtestpca, test_labels)
    # print("PCA Test accuracy: %0.2f" % (pca_test_accuracy))
    scores_pca = cross_validate(
        knnpca,
        visual_words_pyramid,
        train_labels,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        cv=8,
        return_estimator=True)
    cross_val_accuracy_pca = scores_pca['test_accuracy'].mean()
    cross_val_precision_pca = scores_pca['test_precision_macro'].mean()
    cross_val_recall_pca = scores_pca['test_recall_macro'].mean()
    cross_val_f1_pca = scores_pca['test_f1_macro'].mean()

    lda = LinearDiscriminantAnalysis(n_components=7)
    VWlda = lda.fit_transform(visual_words_pyramid, train_labels)
    knnlda = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knnlda.fit(VWlda, train_labels)
    vwtestlda = lda.transform(visual_words_test)
    lda_test_accuracy = 100 * knnlda.score(vwtestlda, test_labels)
    # print("LDA Test accuracy: %0.2f" % (lda_test_accuracy))

    return [
        cross_val_accuracy, cross_val_precision, cross_val_recall,
        cross_val_f1, test_precision, test_recall, test_fscore, test_accuracy,
        pca_test_accuracy, cross_val_accuracy_pca, cross_val_precision_pca,
        cross_val_recall_pca, cross_val_f1_pca, lda_test_accuracy
    ]