def cluster(model): ranks, website_names, X = data.getBinnedHistograms(amount=amount, cut=config.cut, big=config.big) N, D = X.shape print "Each feature vector has dimension %d" % D print "Training on %d samples" % N clusters = model.fit_predict(X) assert(len(clusters) == N) websites = [] for i in range(len(clusters)): websites.append((clusters[i], website_names[i])) websites.sort() print websites numClusters = len(set(clusters)) print 'Number of clusters is %d' % numClusters data.plotClusters(websites, amount, clusters=numClusters, xFactor=75, yFactor=25) # Writes kmeans object to pickle to = amount + '-histogram-clusters' pickle_to = '../persist/%s.pkl' % to joblib.dump(model, pickle_to) # Writes clusters to csv with open(to + '.csv', 'wb') as csvfile: writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) for i in xrange(len(clusters)): writer.writerow([clusters[i], ranks[i], website_names[i]])
def __init__(self, model, amount=config.amount, cluster_type=config.cluster_type, highFactor=.1):
    """Store the recommender configuration and load the binned-histogram dataset.

    model        -- clustering/recommendation model instance to use
    amount       -- dataset size selector (defaults to config.amount)
    cluster_type -- clustering flavor (defaults to config.cluster_type)
    highFactor   -- ratio of an element to the array maximum, used downstream
    """
    self.model = model
    self.amount = amount
    self.cluster_type = cluster_type
    self.highFactor = highFactor
    # Load ranks, site names, and histogram feature matrix once up front.
    self.ranks, self.names, self.histograms = data.getBinnedHistograms(amount, cut=True, big=False)
# from copy import deepcopy # from ml_util import ml from data import data from data import image import config import tester from ml_util import ml from cluster_recommender import ClusterRecommender from random_recommender import RandomRecommender from duckling_recommender import DucklingRecommender import matplotlib.pyplot as plt import matplotlib.image as mpimg import matplotlib.patches as patches amount = config.amount ranks, names, histograms = data.getBinnedHistograms(amount, cut=True, big=False) SAMPLE_SIZE = 10 def show(data, recommender, fractionTrain=0.8, highFactor=0.1, verbose=False, plot=False): """ Parameters: cluster: an array of arrays fractionTrain: a float specifying percent of data to use as train highFactor: ratio of any given element to the largest element in the array Returns: the RMSE produced by removing a specific color, then adding it back Algorithm: Works by first finding the max value in the histogram and then trying to find the index of the histogram that contains the value that is closest (in terms of a ration
import math import csv import matplotlib.image as mpimg from sklearn.externals import joblib # Hack to import ml_util from the parent directory import os, sys sys.path.insert(1, os.path.join(sys.path[0], '..')) from ml_util import ml from ml_util import poly_features from ml_util import simple_plot from data import data amount='top-100' ranks, website_names, X = data.getBinnedHistograms(amount=amount, cut=True, big=True) N, D = X.shape print "Each feature vector has dimension %d" % D print "Training on %d samples" % N numClusters = 20 kmeans = KMeans(n_clusters = numClusters) clusters = kmeans.fit_predict(X) clusters.sort() assert(len(clusters) == N) websites = [] for i in range(len(clusters)): websites.append((clusters[i], website_names[i])) websites.sort() print websites