def cluster(model):
    """Cluster the binned website histograms with ``model`` and persist the results.

    Loads the histograms through ``data.getBinnedHistograms`` (using the
    module-level ``amount`` together with ``config.cut`` and ``config.big``),
    runs ``model.fit_predict`` on them, prints and plots the assignment, dumps
    the model with joblib to ``../persist/<amount>-histogram-clusters.pkl`` and
    writes one ``label, rank, website name`` row per sample to
    ``<amount>-histogram-clusters.csv`` in the current working directory.

    Parameters:
        model: object exposing ``fit_predict(X)``; it must return exactly one
            label per row of ``X`` (enforced by an assertion).
    """
    ranks, website_names, X = data.getBinnedHistograms(amount=amount, cut=config.cut, big=config.big)

    # Single-argument parenthesised print behaves the same as the print statement.
    sample_count, feature_dim = X.shape
    print("Each feature vector has dimension %d" % feature_dim)
    print("Training on %d samples" % sample_count)

    clusters = model.fit_predict(X)
    assert len(clusters) == sample_count

    # Pair every label with its website name, ordered by (label, name).
    websites = sorted((label, website_names[idx]) for idx, label in enumerate(clusters))
    print(websites)
    cluster_count = len(set(clusters))
    print('Number of clusters is %d' % cluster_count)

    data.plotClusters(websites, amount, clusters=cluster_count, xFactor=75, yFactor=25)

    # Persist the model object as a pickle.
    basename = amount + '-histogram-clusters'
    joblib.dump(model, '../persist/%s.pkl' % basename)

    # Write the per-sample assignment (label, rank, name) to CSV.
    with open(basename + '.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        for idx, label in enumerate(clusters):
            writer.writerow([label, ranks[idx], website_names[idx]])
 def __init__(self, model, amount=config.amount, cluster_type=config.cluster_type, highFactor=.1):
     """Store the constructor arguments and load the binned histograms for ``amount``.

     Parameters:
         model: stored unchanged on ``self.model``.
         amount: stored on ``self.amount`` and passed to
             ``data.getBinnedHistograms``; defaults to ``config.amount``.
         cluster_type: stored on ``self.cluster_type``; defaults to
             ``config.cluster_type``.
         highFactor: stored on ``self.highFactor``; defaults to 0.1.
     """
     self.model, self.amount = model, amount
     self.cluster_type, self.highFactor = cluster_type, highFactor
     # Always loaded with cut=True and big=False, regardless of config.
     loaded = data.getBinnedHistograms(amount, cut=True, big=False)
     self.ranks, self.names, self.histograms = loaded
Exemple #3
0
# from copy import deepcopy
# from ml_util import ml
from data import data
from data import image
import config
import tester
from ml_util import ml
from cluster_recommender import ClusterRecommender
from random_recommender import RandomRecommender
from duckling_recommender import DucklingRecommender
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches

# Value handed to data.getBinnedHistograms as its first argument; taken from config.
amount = config.amount
# Loaded once at import time; the call returns a (ranks, names, histograms) triple.
ranks, names, histograms = data.getBinnedHistograms(amount, cut=True, big=False)
# NOTE(review): presumably the number of items sampled per run -- its use is
# not visible in this part of the file; confirm against the callers.
SAMPLE_SIZE = 10


def show(data, recommender, fractionTrain=0.8, highFactor=0.1, verbose=False, plot=False):
    """
    Parameters:
        cluster: an array of arrays
        fractionTrain: a float specifying percent of data to use as train
        highFactor: ratio of any given element to the largest element in the array
    Returns:
        the RMSE produced by removing a specific color, then adding it back

    Algorithm:
        Works by first finding the max value in the histogram and then trying to find the
        index of the histogram that contains the value that is closest (in terms of a ratio
Exemple #4
0
import math
import csv
import matplotlib.image as mpimg
from sklearn.externals import joblib

# Hack to import ml_util from the parent directory
import os, sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from ml_util import ml
from ml_util import poly_features
from ml_util import simple_plot
from data import data


# KMeans is instantiated below, but none of the imports above bind that name.
from sklearn.cluster import KMeans

# Load the binned histograms for the chosen website set.
amount = 'top-100'
ranks, website_names, X = data.getBinnedHistograms(amount=amount, cut=True, big=True)

N, D = X.shape
print("Each feature vector has dimension %d" % D)
print("Training on %d samples" % N)

numClusters = 20
kmeans = KMeans(n_clusters=numClusters)
# clusters[i] is the label assigned to X[i], i.e. to website_names[i].
# The labels must stay in that order: sorting the array in place (as this
# script used to do) pairs sorted labels with unsorted names and assigns
# every website an arbitrary cluster.
clusters = kmeans.fit_predict(X)
assert(len(clusters) == N)

# (label, website name) pairs, ordered by label and then by name.
websites = sorted(zip(clusters, website_names))
print(websites)