Ejemplo n.º 1
0
def cluster(model):
    """Cluster the binned-histogram features with ``model`` and persist the results.

    Steps, in order:
      1. Load ``ranks``, ``website_names`` and the feature matrix ``X`` via
         ``data.getBinnedHistograms``.
      2. Call ``model.fit_predict(X)`` to obtain one cluster label per row.
      3. Print the sorted ``(label, website_name)`` pairs and the number of
         distinct labels, then hand them to ``data.plotClusters``.
      4. Dump ``model`` with joblib to ``../persist/<amount>-histogram-clusters.pkl``.
      5. Write ``<amount>-histogram-clusters.csv`` (relative to the current
         working directory) with one ``label, rank, website_name`` row per sample.

    Args:
        model: any object exposing ``fit_predict(X)`` that returns a sequence
            with one label per row of ``X``.

    Returns:
        None.

    NOTE(review): ``amount`` and ``config`` are read from module scope and are
    not defined in this function; ``amount`` is concatenated with a string, so
    it is presumably a string -- confirm against the caller.
    """
    ranks, website_names, X = data.getBinnedHistograms(
        amount=amount, cut=config.cut, big=config.big)

    sample_count, feature_dim = X.shape
    print("Each feature vector has dimension %d" % feature_dim)
    print("Training on %d samples" % sample_count)

    clusters = model.fit_predict(X)
    # Sanity check: the model must hand back exactly one label per sample.
    assert len(clusters) == sample_count

    # Pair every label with its website name; sorting groups names by label.
    websites = sorted(
        (label, website_names[idx]) for idx, label in enumerate(clusters))
    print(websites)

    numClusters = len(set(clusters))
    print('Number of clusters is %d' % numClusters)

    data.plotClusters(websites, amount, clusters=numClusters, xFactor=75, yFactor=25)

    # Base name shared by the pickle and the CSV output.
    to = amount + '-histogram-clusters'

    # Writes the model object to a pickle.
    joblib.dump(model, '../persist/%s.pkl' % to)

    # Writes clusters to csv. Binary mode is what the Python 2 csv module
    # expects for the file it writes to.
    with open(to + '.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        for idx, label in enumerate(clusters):
            writer.writerow([label, ranks[idx], website_names[idx]])