def km(tx, ty, rx, ry, add="", times=10):
    print "km"
    #this does the exact same thing as the above
    clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 50, 88] # eight for num speakers, eleven for num vowels
    for num_c in clusters:
        add += "nc" + str(num_c)
        errs = []

        # cluster the test labels themselves to get a reference assignment to compare against
        checker = KM(n_clusters=2)
        checker.fit(ry)
        truth = checker.predict(ry)

        # so we do this a bunch of times
        for i in range(2,times):
            clusters = {x:[] for x in range(i)}
            clf = KM(n_clusters=i)
            clf.fit(tx)  #fit it to our data
            test = clf.predict(tx)
            result = clf.predict(rx)  # and test it on the testing set
            for index, val in enumerate(result):
                clusters[val].append(index)
            mapper = {x: round(sum(truth[v] for v in clusters[x])/float(len(clusters[x]))) if clusters[x] else 0 for x in range(i)}
            processed = [mapper[val] for val in result]
            sqrd_err = [(processed[n]-ty[n])**2 for n in range(len(processed))]
            errs.append(sum(sqrd_err) / float(len(ry)))
        plot([0, times, min(errs)-.1, max(errs)+.1],[range(2, times), errs, "ro"], "Number of Clusters", "Error Rate", "KMeans clustering error", "KM"+add)

        td = np.reshape(test, (test.size, 1))
        rd = np.reshape(result, (result.size, 1))
        newtx = np.append(tx, td, 1)
        newrx = np.append(rx, rd, 1)
        nn(newtx, ty, newrx, ry, add="onKM"+add)
    print "km done" + add
def k_means(testX, goodSample,
            data=None, train=False, plot=False):
    if train==True:
        n_clusters = 3
        est = KMeans(n_clusters)
        est.fit(data)
        centers = est.cluster_centers_
        utils.pickle(est, 'SrcTeam/capsuleData/capsule_k_means')
    else:
        est = utils.unpickle('SrcTeam/capsuleData/capsule_k_means')

    numMatch = 0.0
    numGood = goodSample.shape[0]

    #sampleLabel = clusterLabel(centers, sample)
    testLabel = est.predict(testX)
    for i in range(numGood):
        # assumes testX holds a single sample, so testLabel has exactly one entry
        if est.predict(goodSample[i, :].reshape(1, -1))[0] == testLabel[0]:
            numMatch += 1

    if plot==True:
        fig = pl.figure()
        pl.clf()
        ax = Axes3D(fig)
        labels = est.labels_
        ax.scatter(data[:,0],data[:,1],data[:,2],c=labels.astype(float))
        pl.show()

    return float(numMatch) / numGood
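
# k_means() above relies on a utils module with pickle/unpickle helpers that are not
# shown in this excerpt. A minimal sketch of such a module (assumption, inferred from
# the call sites utils.pickle(est, path) and utils.unpickle(path)):
# --- utils.py (hypothetical) ---
import pickle as _pickle

def pickle(obj, path):
    # serialize a fitted estimator to disk
    with open(path, 'wb') as f:
        _pickle.dump(obj, f)

def unpickle(path):
    # load a previously pickled estimator
    with open(path, 'rb') as f:
        return _pickle.load(f)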
Example #3
def cluster_and_learn_nn(train_data, train_target, test_data, test_target,):
    # get cluster assignments for training and test data
    # 2 was the best k per earlier experiments
    km = KMeans(n_clusters=2, random_state=1).fit(train_data)
    train_clusters = km.predict(train_data)
    test_clusters = km.predict(test_data)
    # add the cluster assignment as a feature
    train_with_cluster = np.concatenate((train_data, train_clusters.reshape(len(train_clusters), 1)), axis=1)
    test_with_cluster = np.concatenate((test_data, test_clusters.reshape(len(test_clusters), 1)), axis=1)

    print('KMeans cluster NN')
    learn_nn(train_with_cluster, train_target, test_with_cluster, test_target)

    # repeat with EM
    # 4 = best c per earlier experiments
    em = GMM(n_components=4, random_state=1)
    em.fit(train_data)
    train_clusters = em.predict(train_data)
    test_clusters = em.predict(test_data)
    # add the cluster assignment as a feature
    train_with_cluster = np.concatenate((train_data, train_clusters.reshape(len(train_clusters), 1)), axis=1)
    test_with_cluster = np.concatenate((test_data, test_clusters.reshape(len(test_clusters), 1)), axis=1)

    print('EM cluster NN')
    learn_nn(train_with_cluster, train_target, test_with_cluster, test_target)
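
# learn_nn() is not defined in this snippet. A minimal sketch of what it might look like
# (assumption): fit a small scikit-learn MLP on the augmented features and report accuracy.
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

def learn_nn(train_data, train_target, test_data, test_target):
    clf = MLPClassifier(hidden_layer_sizes=(32,), max_iter=500, random_state=1)
    clf.fit(train_data, train_target)
    print('train accuracy:', accuracy_score(train_target, clf.predict(train_data)))
    print('test accuracy:', accuracy_score(test_target, clf.predict(test_data)))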
class DataCreator(object):
    def __init__(self):
        self.name = 'DataCreator Class'
        self.model = None
        self.events_per_centroid = None
        
    def fit(self, data, n_clusters=2):
        self.model = KMeans(n_clusters=n_clusters)
        self.model.fit(data)
        event_per_centroid = []
        output = self.model.predict(data)
        for icenter in range(n_clusters):
            event_per_centroid = (np.append(event_per_centroid,
                                            float(sum(output==icenter))/
                                            float(len(data))))
        self.events_per_centroid = event_per_centroid
            
    def create_events(self, data, n_events=100):
        output = self.model.predict(data)
        sampled = []
        for icenter in range(len(self.events_per_centroid)):
            # number of events to draw from this centroid, proportional to its share of the training data
            qtd_events = int(np.ceil(n_events * self.events_per_centroid[icenter]))
            if qtd_events == 0:
                continue
            select_data = data[output == icenter, :]
            idx = np.random.randint(0, select_data.shape[0], size=qtd_events)
            sampled.append(select_data[idx, :])
        return np.concatenate(sampled, axis=0)
Example #5
class WordCluster(object):

    def __init__(self):
        self.train_list = self.build_training_set()
        # Initialize Kmeans
        self.kmeans = KMeans(n_clusters=2)
        self.kmeans.fit(self.train_list)
        self.centroids = self.kmeans.cluster_centers_
        self.labels = self.kmeans.labels_
        self.word_scope = ['global', 'local']

    @staticmethod
    def build_training_set():
        parameters = []
        with open('../data/local_params.dat', 'r') as fp:
            for line in fp:
                word_params = [float(x.replace(" ", "").replace(")", "").replace("(", ""))
                               for x in line[:-1].split(';')[0].split(',')]
                parameters.append(word_params)
        parameters = np.array(parameters)
        dim2array = np.column_stack((parameters[:, 1], parameters[:, 2]))
        return dim2array

    def predict(self, params):
        # probe a reference point to find out which centroid corresponds to 'local'
        centroid_test = self.kmeans.predict([[0.1, 0.1]])[0]
        if centroid_test == 1:
            self.word_scope = ['local', 'global']
        return self.word_scope[self.kmeans.predict(np.atleast_2d(params))[0]]
class TextClusters:
	"""Tokenizes text, and fits to a KMeans model"""

	def __init__(self):
		self.stemmer = PorterStemmer()
		self.vectorizer = TfidfVectorizer()
		self.clf = KMeans(10)

	def tokenize(self,title):

		title = title.decode('latin1')
		title = [word for word in title.lower().split() if word not in punctuation]
		title = [self.stemmer.stem(word) for word in title]
		return " ".join(title)

	def fit(self,text):
		features = np.array([self.tokenize(title) for title in text])
	    
		X = self.vectorizer.fit_transform(features).toarray()
		self.clf.fit(X)

		return self.clf.predict(X)

	def predict_one(self,line):
		query = self.tokenize(line)
		query_vector = self.vectorizer.transform([query]).toarray()
		return self.clf.predict(query_vector)[0]
class joshkmeans(BaseEstimator, ClusterMixin):
	def __init__(self, num_clusters=2):
		self.num_clusters = num_clusters
		self.k_means_back = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10)

	def fit(self, X, y):
		self.k_means_back.fit(X)
		self.result = self.k_means_back.predict(X)
		self.y_train = y

	def find_majority(self, k):
	    myMap = {}
	    maximum = ( '', 0 ) # (occurring element, occurrences)
	    for n in k:
	        if n in myMap: myMap[n] += 1
	        else: myMap[n] = 1

	        # Keep track of maximum on the go
	        if myMap[n] > maximum[1]: maximum = (n,myMap[n])

	    return maximum

	def predict(self, X):
		test = self.k_means_back.predict(X)

		#Maps the cluster labels back to the provided labels by comparing the predict results
		#to the results from training set
		for i in range(len(test)):
			cluster_label = test[i]
			lst = []
			for j in range(len(self.result)):
				if (self.result[j] == cluster_label):
					lst.append(self.y_train[j,0])
			val = self.find_majority(lst)[0]
			test[i] = val
		return test
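
# Usage sketch for joshkmeans (assumption: y must be a 2-D column array, because
# predict() reads self.y_train[j, 0]):
#   clf = joshkmeans(num_clusters=2)
#   clf.fit(X_train, y_train.reshape(-1, 1))
#   y_pred = clf.predict(X_test)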
Example #8
def km(tx, ty, rx, ry, add="", times=5):
    #this does the exact same thing as the above
    errs = []

    checker = KM(n_clusters=2)
    checker.fit(ry)
    truth = checker.predict(ry)

    # so we do this a bunch of times
    for i in range(2,times):
        clusters = {x:[] for x in range(i)}
        clf = KM(n_clusters=i)
        clf.fit(tx)  #fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set
        for index, val in enumerate(result):
            clusters[val].append(index)
        mapper = {x: round(sum(truth[v] for v in clusters[x])/float(len(clusters[x]))) if clusters[x] else 0 for x in range(i)}
        processed = [mapper[val] for val in result]
        errs.append(sum((processed-truth)**2) / float(len(ry)))
    plot([0, times, min(errs)-.1, max(errs)+.1],[range(2, times), errs, "ro"], "Number of Clusters", "Error Rate", "KMeans clustering error", "KM"+add)

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    nn(newtx, ty, newrx, ry, add="onKM"+add)  
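
# The km() variants above depend on helpers that are not part of this excerpt:
# KM (presumably sklearn.cluster.KMeans imported as KM), nn(), and plot(). A minimal
# sketch of a plot() helper matching the call signature used above (assumption):
import matplotlib.pyplot as plt

def plot(axis, values, x_label, y_label, title, name):
    # axis: [xmin, xmax, ymin, ymax]; values: plt.plot-style args, e.g. [x, y, "ro"]
    plt.figure()
    plt.plot(*values)
    plt.axis(axis)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.savefig(name + ".png")
    plt.close()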
Example #9
def compare_sklearn(x_train, x_test, y_train, y_test, k):
    """
    Apply the KMeans algorithm of sklearn to the input data set, and return its "accuracy"
    of assigning labels to clusters. Use k as the number of clusters learned by sklearn's KMeans
    :param x_train:
    :param x_test:
    :param y_train:
    :param y_test:
    :return: Accuracy of the clustering assignments, using the training set accuracy if test set is empty
    """
    # ## TODO: Your code here (Q6)

    # this code will call eval_clustering; see main for how to use
    clu = KMeans(n_clusters=k)
    clu.fit(x_train)

    if len(x_test) > 0:
        guess_clusters = clu.predict(x_test)
        truth = y_test

        print guess_clusters
        print truth

    else:
        guess_clusters = clu.predict(x_train)
        truth = y_train

    return eval_clustering(truth, guess_clusters)
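
# eval_clustering() is referenced but not defined in this snippet. A plausible sketch
# (assumption): map each cluster to its most frequent true label, then score accuracy.
import numpy as np

def eval_clustering(truth, guess_clusters):
    truth = np.asarray(truth)
    guess_clusters = np.asarray(guess_clusters)
    mapped = np.empty_like(truth)
    for c in np.unique(guess_clusters):
        members = truth[guess_clusters == c]
        # assign the cluster the majority label of its members
        labels, counts = np.unique(members, return_counts=True)
        mapped[guess_clusters == c] = labels[np.argmax(counts)]
    return np.mean(mapped == truth)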
Example #10
def KMeans_(clusters, model_data, prediction_data = None):
    t0 = time()
    kmeans = KMeans(n_clusters=clusters).fit(model_data)
    if prediction_data is None:
        labels = kmeans.predict(model_data)
    else:
        labels = kmeans.predict(prediction_data)
    print "K Means Time: %0.3f" % (time() - t0)
    return labels
Example #11
def runKmens(K):
    training,lable= genTrainingAndLableData()
    data=np.array(training)
    testData=readTestFile()
    test=np.array(testData)
    #y_pred = KMeans(n_clusters=K).fit_predict(data)
    y_pred = KMeans(n_clusters=K).fit(data)
    print y_pred.predict(data)
    return y_pred.cluster_centers_
def test_full_vs_elkan():

    km1 = KMeans(algorithm='full', random_state=13)
    km2 = KMeans(algorithm='elkan', random_state=13)

    km1.fit(X)
    km2.fit(X)

    assert homogeneity_score(km1.predict(X), km2.predict(X)) == 1.0
def test_predict_equal_labels():
    km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1,
                algorithm='full')
    km.fit(X)
    assert_array_equal(km.predict(X), km.labels_)

    km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1,
                algorithm='elkan')
    km.fit(X)
    assert_array_equal(km.predict(X), km.labels_)
Example #14
def test_predict():
    k_means = KMeans(n_clusters=n_clusters, random_state=42).fit(X)

    # sanity check: predict centroid labels
    pred = k_means.predict(k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = k_means.predict(X)
    assert_array_equal(k_means.predict(X), k_means.labels_)
Example #15
def k_nearest_cluster(xs,ds,n):
    model = KMeans(n_clusters=n, precompute_distances = True, n_jobs=1)#multiparallel doesn't work :(
    model.fit(xs)
    frame_x = ps.DataFrame(model.predict(xs)[None].T,columns=["Mnew_knearest" + str(n)])
    frame_x.name = "Mnew_knearest" + str(n)

    frame_d = ps.DataFrame(model.predict(ds)[None].T,columns=["Mnew_knearest" + str(n)])
    frame_d.name = "Mnew_knearest" + str(n)

    return (frame_x, frame_d)
def kMeansClustering(train,evaluate,test):
    km = KMeans(n_clusters=40)
    f_train = km.fit_predict(train[['X','Y']])
    f_eval = km.predict(evaluate[['X','Y']])
    f_test = km.predict(test[['X','Y']])
    print km.cluster_centers_
    print f_train
    print f_eval
    print f_test
    return (f_train,f_eval,f_test)
Example #17
 def KMeansClusering(self):
     for n in range(2,6):
         clusterer = KMeans(n_clusters=n, random_state=42)
         clusterer.fit(self.reduced_data)
         preds = clusterer.predict(self.reduced_data)
         centers =  clusterer.cluster_centers_
         sample_preds = clusterer.predict(self.pca_samples)
         score = metrics.silhouette_score(self.reduced_data, preds, metric='sqeuclidean')
         print("K Means with cluster number %d, score %0.3f"% (n, score))
     return
Example #18
class EKGAnomalyDetection(TimeSeriesAnomalyDetection):
    def __init__(self, anomaly_fraction=.1, window=32, step=2, samples=200000):
        self.anomaly_fraction = anomaly_fraction
        self.window = window
        self.step = step
        self.samples = samples

    def build_model(self, x):
        self.window_vector = np.zeros(self.window)
        for i in xrange(self.window):
            w = np.sin(np.pi * i / (self.window - 1))
            self.window_vector[i] = np.square(w)

        if len(x.shape) == 2:
            x = x[:, 0]

        r = np.zeros((self.samples, self.window))
        for i in xrange(self.samples):
            offset = i * self.step
            row = x[offset:offset+self.window] * self.window_vector
            scale = np.linalg.norm(row)
            r[i, :] = row / scale

        self.model = KMeans(n_clusters=50, max_iter=20)
        self.model.fit(r)

    def reconstruct_signal(self, x):
        if len(x.shape) == 2:
            x = x[:, 0]

        reconstructed_signal = np.zeros(len(x))

        row = np.zeros(self.window)
        row[self.window/2:] = x[:self.window/2]
        scale = np.linalg.norm(row)
        row /= scale
        ndx = self.model.predict(row)[0]
        current = self.model.cluster_centers_[ndx, :]

        reconstructed_signal[:self.window/2] += current[self.window/2:]

        for i in xrange(self.window/2, len(x)-self.window/2, self.window/2):
            row = x[i-self.window/2:i+self.window/2] * self.window_vector
            scale = np.linalg.norm(row)
            if scale > 0:
                row /= scale
            else:
                row = np.zeros(self.window)

            ndx = self.model.predict(row)[0]
            current = self.model.cluster_centers_[ndx, :]

            reconstructed_signal[i-self.window/2:i+self.window/2] += current * scale

        return reconstructed_signal[:, np.newaxis]
	def clusterSignal(self):
		acc_XNormalized = self.normalize(self.data['Acc_X'])
		km = KMeans(n_clusters=5, init='k-means++')
		"""Binning Acc_X """
		acc_XNormalizedValues = acc_XNormalized.as_matrix()
		#print acc_XNormalizedValues
		km.fit(acc_XNormalizedValues.reshape(-1,1))
		km.predict(acc_XNormalizedValues.reshape(-1,1))
		#print km.labels_
		#for p in km.labels_: print p
		print km.cluster_centers_
Example #20
 def optimalClustering(self):
     n =2
     clusterer = KMeans(n_clusters=n, random_state=42)
     clusterer.fit(self.reduced_data)
     preds = clusterer.predict(self.reduced_data)
     self.centers =  clusterer.cluster_centers_
     sample_preds = clusterer.predict(self.pca_samples)
     score = metrics.silhouette_score(self.reduced_data, preds, metric='sqeuclidean')
     print("K Means with cluster number %d, score %0.3f"% (n, score))
     rs.cluster_results(self.reduced_data, preds, self.centers, self.pca_samples)
     
     return
Example #21
def find_clusters(ax,reduced_data, n_clusters = 2, color='blue', cmap=plt.get_cmap('bwr'),
    title='K-means clustering on the dataset\n'
          'Centroids are marked with white cross'):
    """
    http://scikit-learn.sourceforge.net/dev/auto_examples/cluster/plot_kmeans_digits.html
    """

    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    kmeans.fit(reduced_data)

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
    y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()

    dx = (x_max - x_min) / 30
    dy = (y_max - y_min) / 30
    x_min, x_max = x_min - dx, x_max + dx
    y_min, y_max = y_min - dy, y_max + dy
    
    npixels = 500
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    # h = .02    # point in the mesh [x_min, m_max]x[y_min, y_max].
    hx = (x_max - x_min) / npixels
    hy = (y_max - y_min) / npixels

    xx, yy = np.meshgrid(np.arange(x_min, x_max, hx), np.arange(y_min, y_max, hy))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    #plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1], s=50, c=color, cmap=cmap)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    ax.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    ax.set_title(title)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())

    return kmeans.predict(reduced_data)
Example #22
    def run(self, styleImage, styleMask, targetImage, targetMask, colors=4,outHdf5='masks.hdf5'):
        # Load images
        img_style = scipy.misc.imread(styleImage)
        if targetImage is not None:
            img_content = scipy.misc.imread(targetImage)

        # Load masks
        mask_style = scipy.misc.imread(styleMask)
        mask_target = scipy.misc.imread(targetMask)

        # Save shapes
        style_shape = mask_style.shape
        target_shape = mask_target.shape
        if img_style.shape != style_shape:
            raise Exception('Style image and mask have different sizes!')
        if targetImage is not None:
            if img_content.shape != target_shape:
                raise Exception('Content image and mask have different sizes!')

        # Run K-Means to get rid of possible intermediate colors
        style_flatten = mask_style.reshape(style_shape[0]*style_shape[1], -1)
        target_flatten = mask_target.reshape(target_shape[0]*target_shape[1], -1)

        kmeans = KMeans(n_clusters=colors, random_state=0).fit(style_flatten)

        # Predict masks
        labels_style = kmeans.predict(style_flatten.astype(float))
        labels_target = kmeans.predict(target_flatten.astype(float))

        style_kval = labels_style.reshape(style_shape[0], style_shape[1])
        target_kval = labels_target.reshape(target_shape[0], target_shape[1])

        # Dump
        f = h5py.File(outHdf5, 'w')

        for i in range(colors):
            f['style_mask_%d' % i] = (style_kval == i).astype(float)
            f['target_mask_%d' % i] = (target_kval == i).astype(float)

        # Torch style image save
        f['style_img'] = img_style.transpose(2, 0, 1).astype(float) / 255.
        if targetImage is not None:
            f['content_img'] = img_content.transpose(2, 0, 1).astype(float) / 255.
            f['has_content'] = np.array([1])
        else:
            f['has_content'] = np.array([0])
        f['n_colors'] = np.array([colors]) # Torch does not want to read just number

        f.close()

        print ('Done!')
Example #23
def predictCustomerEngagement(df,test_df=None):
	correct = 0
	X = np.array(df.drop(['events_plan'], 1).astype(float))
	X = preprocessing.scale(X)
	y = np.array(df['events_plan'])
	X_pca = PCA(n_components=2, whiten=True).fit_transform(X)
	clf = KMeans(n_clusters=2,max_iter=10,n_init=2,n_jobs=-1)
	clf.fit(X_pca)
	count_paid=0
	count_free=0
	centroids = clf.cluster_centers_
	lables = clf.labels_
	for i in range(len(X_pca)):
		predict_me = np.array(X_pca[i].astype(float))
		predict_me = predict_me.reshape(-1, len(predict_me))
		prediction = clf.predict(predict_me)
		plt.plot(X_pca[i][0], X_pca[i][1],colours[lables[i]],markersize=10)		    
		if prediction[0] == 0:
			count_paid += 1
		elif prediction[0] == 1:
			count_free += 1
		if prediction[0] == y[i]:
			correct += 1
	X2 = np.array(test_df.drop(['events_plan'], 1).astype(float))
	X2 = preprocessing.scale(X2)
	X2_pca = PCA(n_components=2, whiten=True).fit_transform(X2) 
	clf2 = KMeans(n_clusters=2,max_iter=10,n_init=2,n_jobs=-1)
	clf2.fit(X2_pca)
	centroids2 = clf2.cluster_centers_
	lables = clf2.labels_
	for i in range(len(X2_pca)):
		predict_me = np.array(X2_pca[i].astype(float))
		predict_me = predict_me.reshape(-1, len(predict_me))
		prediction = clf2.predict(predict_me)
		plt.plot(X2_pca[i][0], X2_pca[i][1],colours2[lables[i]],markersize=10)		    
	plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", c=('g','r'), s=150 ,zorder=10)
	imgdata = StringIO.StringIO()
	plt.savefig(imgdata, format='png')
	imgdata.seek(0)

	fig = plt.figure()
	ax = fig.gca()
	ax.pie((count_free,count_paid),colors=('r', 'g'),radius=0.25, center=(0.5, 0.5), frame=True)
	pieData = StringIO.StringIO()
	plt.savefig(pieData,format='png')
	pieData.seek(0)
	pieUri = 'data:image/png;base64,' + urllib.quote(base64.b64encode(pieData.buf))
	uri = 'data:image/png;base64,' + urllib.quote(base64.b64encode(imgdata.buf))
	return {'accuracy':(float(correct) / len(X_pca))*100,'centroids':centroids, 'points':X_pca ,'total':count_free+count_paid,'count_paid':count_paid,'count_free':count_free,'plot':uri,'pieUri':pieUri}
Example #24
def plot_pca(data):
    train = [row[:-1] for row in data.examples]
    scaled = scale(train)

    reduced_data = PCA(n_components=2).fit_transform(scaled)
    kmeans = KMeans(init='k-means++', n_clusters=10, n_init=10)
    kmeans.fit(reduced_data)

    cluster_label_dict = clusterLabelDict(kmeans, data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, m_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure(1)
    pl.clf()
    pl.imshow(Z, interpolation='nearest',
          extent=(xx.min(), xx.max(), yy.min(), yy.max()),
          cmap=pl.cm.Paired,
          aspect='auto', origin='lower')

    pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    centroids = kmeans.cluster_centers_

    for i in range(len(centroids)):
        c0 = centroids[:, 0][i]
        c1 = centroids[:, 1][i]
        predicted = kmeans.predict(centroids[i].reshape(1, -1))
        label = cluster_label_dict[predicted[0]]
        pl.scatter(c0, c1, marker='$%d$' % label,
            s=169, linewidths=3, color='w', zorder=10)

    pl.title('K-means clustering on digits, reduced to 2-D with PCA\n'
         'Each white number is the mode of its centroid.')
    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    pl.xticks(())
    pl.yticks(())
    pl.show()
class AdvancedModel():
    
    clusters = []
    
    # price class regression
    price_reg = LinearRegression()
        
    def fit(self, X_train, y_train, n_clusters=4):
        y_train_mat = np.array(y_train).reshape((-1,1))
        
        # 1. determine clusters
        self.km = KMeans(n_clusters=n_clusters)
        self.km.fit(y_train_mat)
        clusters = self.km.cluster_centers_
        cluster_indices = self.km.predict(y_train_mat)
        print(clusters)
        
        # 2. fit naive bayes
        #self.nb.fit(X_train, ...)
        #self
        
        # 3. train regression model
        #price_reg.fit
        
    def predict(self, X):
        pass
        
    def get_weights(self):
        return np.append(self.price_reg.coef_, [self.price_reg.intercept_])
        
    def set_weights(self, w):
        self.price_reg.coef_ = w[:-1]
        self.price_reg.intercept_ = w[-1]
        
def test_predict():
    km = KMeans(n_clusters=n_clusters, random_state=42)

    km.fit(X)

    # sanity check: predict centroid labels
    pred = km.predict(km.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = km.predict(X)
    assert_array_equal(pred, km.labels_)

    # re-predict labels for training set using fit_predict
    pred = km.fit_predict(X)
    assert_array_equal(pred, km.labels_)
Example #27
 def pca_k_means(self):
     if not self.pca_reduced:
         self.pc_analysis()
     kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10)
     kmeans.fit(self.pca_reduced, self.player_value)
     h = .02
     x_min, x_max = self.pca_reduced[:, 0].min() - 1, self.pca_reduced[:, 0].max() + 1
     y_min, y_max = self.pca_reduced[:, 1].min() - 1, self.pca_reduced[:, 1].max() + 1
     xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
     Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
     Z = Z.reshape(xx.shape)
     plt.figure(1)
     plt.clf()
     plt.imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                cmap=plt.cm.Paired, aspect='auto', origin='lower')
     plt.plot(self.pca_reduced[:, 0], self.pca_reduced[:, 1], 'k.', markersize=2)
     centroids = kmeans.cluster_centers_
     labels = self.pca_labels = kmeans.labels_
     intertia = kmeans.inertia_
     plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10)
     plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
               'Centroids are marked with white cross')
     plt.xlim(x_min, x_max)
     plt.ylim(y_min, y_max)
     plt.xticks(())
     plt.yticks(())
     return {'plt': plt, 'centroids': centroids, 'labels': labels, 'inertia': intertia}
Example #28
def re_classify_dict():
    dict_file = open("_dictionary.pickle", "rb")
    sc_list = cPickle.load(dict_file)
    sc_list = np.concatenate(sc_list)

    Dh_dict = sc_list[:, 144:]
    Dl_dict = sc_list[:, :144]

    k_means = KMeans(n_clusters=15)
    k_means = k_means.fit(Dl_dict)
    y_predict = k_means.predict(Dl_dict)

    num = []
    y_tmp = np.asarray(y_predict, dtype=int) * 0 + 1
    for i in range(len(np.unique(y_predict))):
        num.append(np.sum(y_tmp[y_predict == i]))
    rand = np.asarray(num).argsort()  # class indices sorted by number of patches, from fewest to most

    classified_hdict = []
    classified_patch = []
    for i in rand:
        predict_temp = y_predict == i
        classified_hdict.append(Dh_dict[predict_temp])
        print len(classified_hdict[-1])

    for i in range(9):
        x = i % 3
        y = i / 3
        # run a coefficient-coding test
        patch_show(classified_hdict[i+5][:100], [0.05+x*0.31, 0.05+y*0.31, 0.3, 0.3], i)

    plt.show()
Example #29
 def add_kmeans_col(self, iter = 1000, n_init = 10, n = 4):
     '''Add a new k_means cluster column to X data'''
     logging.info('Adding kmeans %d clusters to X' %(n))
     km = KMeans(n_clusters=n, max_iter=iter, n_init=n_init)
     km.fit(self.X[:,1:]) # XXX: This might not be kosher as it affects all of X
     self.models['km-col'] = km        
     self.X = np.hstack( (self.X, km.predict(self.X[:,1:]).reshape(-1,1)) )   
Example #30
def findColor(frame):
    t = time()
    # dim = np.array(frame.size)/2
    # frame.thumbnail(dim, Image.ANTIALIAS)
    # print "Thumbnail in %0.3f seconds." % (time() - t)
    # t = time()
    points = imresize(np.array(frame, dtype=np.float64), 0.3)
    w,h,d = points.shape
    data = np.reshape(points, (w*h, d))
    sample = shuffle(data, random_state=0)[:len(data)/3]
    print "Reshape and shuffle in %0.3f seconds." % (time() - t)
    t = time()
    kmeans = KMeans(n_clusters=k_colors, n_jobs=jobs).fit(sample)
    labels = kmeans.predict(data)
    print "Fit and predict in %0.3f seconds." % (time() - t)
    t = time()
    colors = [map(int, color) for color in kmeans.cluster_centers_]
    # hsvs = np.array([rgb_to_hsv(*values) for values in colors])
    # frequent = np.argmax(hsvs[:,1])
    # frequent = colors[frequent]
    print "Found in %0.3f seconds." % (time() - t)
    frequents = defaultdict(int)
    for l in labels:
        frequents[l] += 1
    frequents = sorted(frequents.items(), key=lambda x:x[1], reverse=True)
    frequents = [colors[i[0]] for i in frequents[:3]]
    # print "Counted in %0.3f seconds." % (time() - t)
    # print "Top 3 colors [RGB]: ", frequents[:3]
    return frequents[2] if len(frequents) == 3 else frequents[0]
Example #31
plt.imshow(orderedDataMatrix, aspect='auto', interpolation='nearest')
plt.grid(False)
plt.title("Heatmap of Iris characteristics")
plt.colorbar()
plt.xticks([x for x in range(6)], labels_t)
plt.savefig("week10_heatmap.png")  # Save the image
plt.close()

plt.figure()
hac.dendrogram(linkageMatrix_transposed, labels=labels)
plt.savefig('dendrogram10.png')
plt.close()

kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(dataMatrixnp)
labels = kmeans.predict(dataMatrixnp)
dataMatrix_df = pd.merge(pd.DataFrame(
    dataMatrixnp, columns=['CFU', 'poly', 'unk', 'int', 'mys', 'mid']),
                         pd.DataFrame(labels, columns=['cluster']),
                         left_index=True,
                         right_index=True)
k_clustered = dataMatrix_df.sort_values('cluster')[[
    'CFU', 'poly', 'unk', 'int', 'mys', 'mid'
]].values

plt.figure()
plt.imshow(k_clustered, aspect='auto', interpolation='nearest')
plt.grid(False)
plt.title("Heatmap of Iris characteristics")
plt.colorbar()
plt.xticks([x for x in range(6)], labels_t)
plt.plot(k_rng[1:], silhouette_score, 'b*-')
plt.xlim([1, 15])
plt.grid(True)
plt.ylabel('Silhouette Coefficient')
plt.xlabel("Values of K")
plt.plot(3,
         silhouette_score[1],
         'o',
         markersize=12,
         markeredgewidth=1.5,
         markerfacecolor='None',
         markeredgecolor='r')
###plot three clusters
est = KMeans(n_clusters=3, init='random')
est.fit(d1)
y_kmeans = est.predict(d1)
colors = np.array(['r', 'b', 'g'])
plt.figure()
plt.scatter(d1.DAY_OF_WEEK, d1.HOUR_ARR, c=colors[y_kmeans], s=50)
plt.xlim(1.5, 8.5)
plt.xticks([2, 3, 4, 5, 6, 7, 8],
           ['Mon', 'Tue', 'Wed', "Thu", "Fri", "Sat", "Sun"])
plt.ylim(-.5, 24)
plt.yticks([0, 6, 12, 18], ["12:00 AM", "6:00 AM", "12:00 PM", "6:00 PM"])
plt.title("Rush Hour Determination from K-Means Clustering")
t2['cluster'] = y_kmeans

commuter_hours = t2[t2.cluster == 1]["HOUR_OF_WEEK"]

winter["rush"] = [
    1 if x in set(commuter_hours) else 0 for x in winter.HOUR_OF_WEEK
Example #33
# import numpy as np
# import sklearn
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

N = 10000
centers = 4
X, Y = make_blobs(n_samples=N, n_features=2, centers=centers, random_state=28)

km = KMeans(n_clusters=centers, init='random', random_state=28)
km.fit(X)

print(Y)

print(
    '---------------------------------------------------------------------------'
)
y_hat = km.predict(X[:10])
print(y_hat)
class UnsupervisedKmeansBowModel(UnsupervisedBaseModel):
    def __init__(self, task):
        super(UnsupervisedKmeansBowModel, self).__init__(task)
        self.num_clusters = 4  # combinations of social and agency
        self.text_repr_model = self.get_text_representation_model()
        self.clf_model = KMeans(init='k-means++',
                                n_clusters=self.num_clusters,
                                n_init=10,
                                random_state=self.args.random_state)

    def augment_features(self, X_text, X_all_feats):

        if not self.args.use_allfeats:
            return X_text.toarray()

        age = X_all_feats[:, 2].reshape(-1, 1)
        gender = X_all_feats[:, 3].reshape(-1, 1)
        married = X_all_feats[:, 4].reshape(-1, 1)
        parenthood = X_all_feats[:, 5].reshape(-1, 1)
        country = X_all_feats[:, 6].reshape(-1, 1)
        reflection = X_all_feats[:, 7].reshape(-1, 1)
        duration = X_all_feats[:, 8].reshape(-1, 1)

        X_all = np.concatenate([
            X_text.toarray(), age, gender, married, parenthood, country,
            reflection, duration
        ],
                               axis=1)

        return X_all

    def get_text_representation_model(self):
        steps = []
        vectorizer = TfidfVectorizer(ngram_range=(1, self.args.ngrams),
                                     min_df=5,
                                     max_df=0.5,
                                     stop_words="english",
                                     use_idf=False)
        steps.append(('vec', vectorizer))
        repr_model = Pipeline(steps)
        return repr_model

    def train(self, X, y=None):
        X, y = self.augment_instances(X, y)
        X_text = self.text_repr_model.fit_transform(X[:, self.args.TEXT_COL])
        X_all_feats = self.augment_features(X_text, X)

        pca = PCA(n_components=self.num_clusters,
                  random_state=self.args.random_state)
        pca.fit(X_all_feats)

        model = KMeans(init=pca.components_,
                       n_clusters=self.num_clusters,
                       n_init=1,
                       random_state=self.args.random_state)
        model.fit(X_all_feats)

        self.clf_model = model

    def predict(self, X):
        X_text = self.text_repr_model.transform(X[:, self.args.TEXT_COL])
        X_all_feats = self.augment_features(X_text, X)
        y_pred = self.clf_model.predict(X_all_feats)

        y = y_pred.astype(np.uint8)
        y = np.unpackbits(y)
        y = y.reshape(y_pred.shape[0], 8)
        y = y[:, -2:]
        y = y[:, ::-1]

        return y
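
# Note on predict() above: each KMeans cluster id (0-3) is expanded into its two lowest
# bits via np.unpackbits and the bit order is reversed, so the returned array has two
# binary columns per sample -- presumably the social and agency dimensions mentioned in
# the __init__ comment.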
corpus = open('/Users/mccallmathers./Desktop/NLP/dataset.txt').read()

docs = corpus.split('\n')

X = []

for doc in docs:
    i, l = doc.split(':')
    X.append(i.strip())

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()

matrix_X = vec.fit_transform(X)

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2, max_iter=300, tol=1e-4)

kmeans.fit(matrix_X[:5])

print(kmeans.labels_)
print(kmeans.predict(matrix_X[5]))

from sklearn.neighbors import NearestNeighbors
kn = NearestNeighbors()
kn.fit(matrix_X)

print(kn.kneighbors(matrix_X[3], 2))
kn.radius_neighbors(matrix_X[3], radius=1.7)
wcss = []  ##Within Cluster Sum of Squares
##elbow method to know the number of clusters
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, max_iter=300, random_state=0)
    kmeans.fit(z)
    wcss.append(kmeans.inertia_)
plt.plot(range(1,11),wcss)
plt.title('the elbow method')
plt.xlabel('Number of Clusters')
plt.ylabel('Wcss')
plt.show()

#Silhouette score
# predict the cluster for each data point
y_cluster_kmeans = km.predict(z)
from sklearn import metrics
score = metrics.silhouette_score(z, y_cluster_kmeans)
print("The Silhouette score is",score)

from sklearn import preprocessing
scaler =preprocessing.StandardScaler()
scaler.fit(z)
X_scaled_array=scaler.transform(z)
X_scaled=pd.DataFrame(X_scaled_array, columns =z.columns)

print("Feature Scaling",X_scaled)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
Example #37
### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred

from sklearn.cluster import KMeans

clf = KMeans(n_clusters=2)
clf.fit(finance_features)
pred = clf.predict(finance_features)

### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred,
         finance_features,
         poi,
         mark_poi=False,
         name="clusters.pdf",
         f1_name=feature_1,
         f2_name=feature_2)
except NameError:
    print("no predictions object named pred found, no clusters to plot")
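
# Draw() is not defined in this excerpt. A rough sketch of a compatible helper
# (assumption): scatter-plot the two features coloured by cluster, optionally mark POIs
# in red, and save the figure under the given name.
import matplotlib.pyplot as plt

def Draw(pred, features, poi, mark_poi=False, name="image.png",
         f1_name="feature 1", f2_name="feature 2"):
    colors = ["b", "c", "k", "m", "g"]
    for ii, pp in enumerate(pred):
        plt.scatter(features[ii][0], features[ii][1], color=colors[pp % len(colors)])
    if mark_poi:
        for ii, pp in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0], features[ii][1], color="r", marker="*")
    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    plt.savefig(name)
    plt.show()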
Example #38
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)
y = np.array(test_target["Survived"])

# PCA on data
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# KMeans
clf = KMeans(n_clusters=2)
clf.fit(X_train)
clf_pred = clf.predict(X_test)
correct = 0

# Calculate Score

for i in range(len(clf_pred)):
    if clf_pred[i] == y[i]:
        correct += 1

print(max(1 - correct / len(clf_pred), correct / len(clf_pred)))

PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(clf_pred, PassengerId, columns=["Survived"])
# Write your solution to a csv file with the name my_solution.csv
my_solution.to_csv("KMeans.csv", index_label=["PassengerId"])
print("centers:", model.cluster_centers_)
print("labels", labels)
print("inertia:", model.inertia_)

texts_per_cluster = numpy.zeros(n_clusters)
for i_cluster in range(n_clusters):
    for label in labels:
        if label == i_cluster:
            texts_per_cluster[i_cluster] += 1

print("Top words per cluster:")
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster])),
    for term in ordered_words[i_cluster, :10]:
        print("\t" + words[term])

print("\n")
print("Prediction")

text_to_predict = "Why batman was defeated  by superman so easy?"
Y = vectorizer.transform([text_to_predict])
predicted_cluster = model.predict(Y)[0]
texts_per_cluster[predicted_cluster] += 1

print(text_to_predict)
print("Cluster:", predicted_cluster, "texts:",
      int(texts_per_cluster[predicted_cluster])),
for term in ordered_words[predicted_cluster, :10]:
    print("\t" + words[term])
def computeCentroids(X, idx, K):
    centroid = np.zeros((K, X.shape[1]))
    for k in range(K):
        index = np.where(idx == k)[0]  # handle one cluster at a time
        temp = X[index, :]             # all samples assigned to this cluster
        s = np.sum(temp, axis=0)
        centroid[k, :] = s / np.size(index)
    return centroid


def kmeans(X, K, max_iter=200):
    centroids = InitCentroids(X, K)
    idx = None
    for i in range(max_iter):
        idx = findClostestCentroids(X, centroids)
        centroids = computeCentroids(X, idx, K)
    return idx
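
# InitCentroids(), findClostestCentroids() and load_data() are referenced above but not
# included in this excerpt. Minimal sketches consistent with how kmeans() uses them
# (assumptions, not the original implementations):
import numpy as np

def InitCentroids(X, K):
    # pick K distinct samples at random as the initial centroids
    idx = np.random.choice(X.shape[0], K, replace=False)
    return X[idx, :]

def findClostestCentroids(X, centroids):
    # assign every sample to the index of its nearest centroid (Euclidean distance)
    dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    return np.argmin(dists, axis=1)

def load_data():
    # stand-in data loader; the original data set is not specified in the excerpt
    from sklearn.datasets import load_iris
    d = load_iris()
    return d.data, d.target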


if __name__ == '__main__':
    x, y = load_data()
    K = len(np.unique(y))
    y_pred = kmeans(x, K)
    nmi = normalized_mutual_info_score(y, y_pred)
    print("NMI by ours: ", nmi)

    model = KMeans(n_clusters=K)
    model.fit(x)
    y_pred = model.predict(x)
    nmi = normalized_mutual_info_score(y, y_pred)
    print("NMI by sklearn: ", nmi)

features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2, _ in finance_features:
    plt.scatter(f1, f2)
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
model = KMeans(n_clusters=2)
model.fit(finance_features)
pred = model.predict(finance_features)

### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred,
         finance_features,
         poi,
         mark_poi=False,
         name="clusters.pdf",
         f1_name=feature_1,
         f2_name=feature_2)
except NameError:
    print("no predictions object named pred found, no clusters to plot")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 19 14:08:13 2018

@author: suyash
"""

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

iris = pd.read_csv('iris.csv')
iris = np.array(iris)
iris = np.transpose(iris)

model = KMeans(n_clusters=3)

model.fit(iris)

labels = model.predict(iris)

plt.scatter(iris[:, 0], iris[:, 1], iris[:, 2], c=labels, marker='o')
centroids = model.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker='D')
plt.show()
Example #43
from display_network import *
from mnist import MNIST  #require pip install python-mnist
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

mndata = MNIST("MNIST/")  #path to your MNIST folder
mndata.load_testing()
X = mndata.test_images
X0 = np.asarray(X)[:1000, :] / 256.0
X = X0

K = 10
kmeans = KMeans(n_clusters=K).fit(X)
pred_label = kmeans.predict(X)

print(type(kmeans.cluster_centers_.T))
print(kmeans.cluster_centers_.T.shape)
A = display_network(kmeans.cluster_centers_.T, K, 1)
f1 = plt.imshow(A, interpolation='nearest', cmap="jet")
f1.axes.get_xaxis().set_visible(False)
f1.axes.get_yaxis().set_visible(False)
plt.show()
#plt.savefig('a1.png', bbox_inches='tight')

#a colormap and a normalization instance
cmap = plt.cm.jet
norm = plt.Normalize(vmin=A.min(), vmax=A.max())

#map the normalized data to colors
        salaries.append(stock)

print("Maximum Value: {}".format(max(salaries)))
print("Minimum Value: {}".format(min(salaries)))


### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to 
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2, _ in finance_features:
    plt.scatter( f1, f2 )
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0, max_iter=100).fit(finance_features)

pred = kmeans.predict(finance_features)




### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
except NameError:
    print("no predictions object named pred found, no clusters to plot")
Example #45
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]'''

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print()

print("\n")
print("Prediction")

Y = vectorizer.transform(["chrome browser to open."])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

digits_train = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra',header=None)
digits_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes',header=None)
X_train = digits_train[np.arange(64)]
y_train = digits_train[64]
X_test = digits_test[np.arange(64)]
y_test = digits_test[64]

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 10)
kmeans.fit(X_train)
y_pred = kmeans.predict(X_test)
from sklearn import metrics
print(metrics.adjusted_rand_score(y_test,y_pred))  # evaluate clustering performance with the Adjusted Rand Index (ARI)

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

plt.subplot(3,2,1)  # split the figure into 6 subplots and draw in the first one
x1 = np.array([1,2,3,1,5,6,5,5,6,7,8,9,7,9])
x2 = np.array([1,3,2,2,8,6,7,6,7,1,2,1,1,3])
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)

plt.xlim([0,10])
plt.ylim([0,10])
Example #47
class PartitionedXgbRegressor(TransformerMixin):
    """An xgboost regressor variant with implicit inverse class frequency weighting, and a few other tricks

    This is a passthrough to xgboost's standard XgbRegressor, with an added preprocessing stage, intended for
    use with the NystroemSpectralProjection class, and a KMeans clustering stage, which is used to compute
    sample weighting.  The premise is that the combination, which amounts to a particular graph spectral clustering,
    represents an implicit set of categorical variables that are partially driving the behavior of a continuous
    output variable.  We attempt to counteract this by applying an inverse-frequency weighting scheme based on
    an implicit set of classes defined by the clustering.

    Attributes
    ----------
    clusterer : KMeans

    n_augment_cols : int
        Number of trailing pass-through columns (for the purpose of clustering / preprocessing)
    """
    def __init__(self,
                 base_estimator=XGBRegressor(),
                 n_augment_cols=1,
                 preprocess=None,
                 n_clusters=8,
                 augments_only=False):
        """Construct a new PartitionedXgbRegressor model

        Parameters
        ----------
        n_augment_cols : int
            Number of trailing pass-through columns (for the purpose of clustering / preprocessing)

        preprocess : TransformerMixin
            Preprocessing stage compatible with sklearn's TransformerMixin interface

        n_clusters : int
            Number of k-means clusters to use as implicit class labels
        """
        self.base_estimator = base_estimator
        self.clusterer = KMeans(n_clusters=n_clusters, n_jobs=-1)
        self.estimator_ = None
        self.n_augment_cols = n_augment_cols
        self.preprocess = preprocess
        self.augments_only = augments_only

    def fit(self, X, y=None, weights=None, **kwargs):
        """Fit regressor

        Note
        ----
        Provided weights are unused.  Keyword arguments to support early stopping are currently required.

        Parameters
        ----------

        X : ndarray
            N samples x M dimensions ndarray containing the data to fit

        y : ndarray
            N element ndarray containing the target values

        weights : None
            Unused

        kwargs : dict
            Required keys: eval_set, eval_metric, early_stopping_rounds.  Values as specified by xgboost docs.

        """
        if self.preprocess is not None:
            X = np.hstack([
                self.preprocess.transform(X[:, :-self.n_augment_cols]),
                X[:, -self.n_augment_cols:].reshape((-1, self.n_augment_cols))
            ])
            eval_X = np.hstack([
                self.preprocess.transform(
                    kwargs["eval_set"][0][:, :-self.n_augment_cols]),
                kwargs["eval_set"][0][:, -self.n_augment_cols:].reshape(
                    (-1, self.n_augment_cols))
            ])
        else:
            eval_X = kwargs["eval_set"][0]

        X_cats = self.clusterer.fit_predict(X[:, :-self.n_augment_cols])
        eval_cats = self.clusterer.predict(eval_X[:, :-self.n_augment_cols])

        # NOTE: left end of clip should be unnecessary, right end should be max_gain parameter
        weight_map = {
            c: np.clip(X_cats.size / X_cats[X_cats == c].size, 1.0, 1000.0)
            for c in np.unique(X_cats)
        }
        print(weight_map)

        W = np.asarray([weight_map.get(c, 1.0) for c in X_cats])
        eval_W = np.asarray([weight_map.get(c, 1.0) for c in eval_cats])

        # TODO: do something with the augments_only option
        eset = (eval_X, kwargs["eval_set"][1])

        reg = clone(self.base_estimator)
        reg.base_score = np.mean(y)
        reg.fit(X,
                y,
                W,
                eval_set=[eset],
                eval_metric=kwargs["eval_metric"],
                sample_weight_eval_set=[eval_W],
                early_stopping_rounds=kwargs["early_stopping_rounds"],
                verbose=kwargs.get("verbose", False))
        self.estimator_ = reg

    def predict(self, X):
        """Predict values for new samples"""
        assert self.estimator_ is not None, "Cannot Predict: Model has not been trained"
        if self.preprocess is not None:
            X = np.hstack([
                self.preprocess.transform(X[:, :-self.n_augment_cols]),
                X[:, -self.n_augment_cols:].reshape((-1, self.n_augment_cols))
            ])

        return self.estimator_.predict(X)
Example #48
# -*- coding:UTF-8 -*-

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# load the data set
dataMat = []
fr = open("data/10.KMeans/testSet.txt") # note: this is a relative path, so run it from the MachineLearning directory
for line in fr.readlines():
    curLine = line.strip().split('\t')
    fltLine = list(map(float,curLine))    # cast every element to float
    dataMat.append(fltLine)

# train the model
km = KMeans(n_clusters=4) # initialize
km.fit(dataMat) # fit
km_pred = km.predict(dataMat) # predict
centers = km.cluster_centers_ # centroids

# visualize the results
plt.scatter(np.array(dataMat)[:, 1], np.array(dataMat)[:, 0], c=km_pred)
plt.scatter(centers[:, 1], centers[:, 0], c="r")
plt.show()
user_unique_id = pd.merge(user_unique_id,
                          recent_purchase[['customerid', 'Recency']],
                          on='customerid')

# print(user_unique_id)
#how many clusters are required
sse = {}
tx_recency = user_unique_id[['Recency']]
for k in range(1, 8):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency)
    #tx_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

kmeans = KMeans(n_clusters=4)
kmeans.fit(user_unique_id[['Recency']])
user_unique_id['RecencyCluster'] = kmeans.predict(user_unique_id[['Recency']])

#print(user_unique_id['RecencyCluster'].describe())

#function for ordering cluster numbers

#print(user_unique_id.groupby('RecencyCluster')['Recency'].describe())


def order_cluster(cluster_field_name, target_field_name, df, ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(
        cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,
                                ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
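
# The excerpt cuts order_cluster() off after the line above. A complete version of the
# helper, reconstructed under the usual "rank clusters by their mean value" recipe
# (assumption, not necessarily the author's exact code):
import pandas as pd

def order_cluster(cluster_field_name, target_field_name, df, ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,
                                ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    # relabel each row with its cluster's rank
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={'index': cluster_field_name})
    return df_final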
Example #50
    #Save within-cluster sums of squares to the list
    wcss.append(kmeans.inertia_)

#Display the graph
print(wcss)
plt.plot(range(1, 10), wcss)
plt.title('the elbow method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

#From the map, at k=3 seem like data slowly unchange => choose k=3
#Silhouette score
km = KMeans(n_clusters=3)
km.fit(x)
y_cluster_kmeans = km.predict(x)
score = metrics.silhouette_score(x, y_cluster_kmeans)
print()
print('Silhouette score for', 3, 'clusters', score)

###########################################################################
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(x)
X_scaled_array = scaler.transform(x)
X_scaled = pd.DataFrame(X_scaled_array, columns=x.columns)

km = KMeans(n_clusters=3)
km.fit(X_scaled)
y_cluster_kmeans = km.predict(X_scaled)
Example #51
def dataprep(data_in, depvar='default_time', splitvar='time', threshold=26):
    
    df=data_in.dropna(subset=['time', 'default_time','LTV_time', 'FICO_orig_time']).copy()
    
    # Economic features
    df.loc[:, 'annuity'] = ((df.loc[:,'interest_rate_time']/(100*4))*df.loc[:,'balance_orig_time'])/(1-(1+df.loc[:,'interest_rate_time']/(100*4))**(-(df.loc[:,'mat_time']-df.loc[:,'orig_time'])))
    df.loc[:,'balance_scheduled_time']  = df.loc[:,'balance_orig_time']*(1+df.loc[:,'interest_rate_time']/(100*4))**(df.loc[:,'time']-df.loc[:,'orig_time'])-df.loc[:,'annuity']*((1+df.loc[:,'interest_rate_time']/(100*4))**(df.loc[:,'time']-df.loc[:,'orig_time'])-1)/(df.loc[:,'interest_rate_time']/(100*4))
    df.loc[:,'property_orig_time'] = df.loc[:,'balance_orig_time']/(df.loc[:,'LTV_orig_time']/100)
    df.loc[:,'cep_time']= (df.loc[:,'balance_scheduled_time'] - df.loc[:,'balance_time'])/df.loc[:,'property_orig_time']

    df.loc[:,'equity_time'] = 1-(df.loc[:,'LTV_time']/100)

    df=df.dropna(subset=['time', 'cep_time', 'equity_time'])
    
    df.loc[:,'age'] = (df.loc[:,'time']-df.loc[:,'first_time']+1)
    df.loc[df['age'] >= 40, 'age'] = 40    
    df.loc[:,'age_1'] = df.loc[:,'time']-df.loc[:,'first_time']
    df.loc[df['age_1'] >= 39, 'age_1'] = 39
    df.loc[:,'age_1f'] = df.loc[:,'age_1']
    df.loc[df['age_1f'] <= 1, 'age_1f'] = 1
    df.loc[:,'age2'] = df.loc[:,'age']**2
    
    df['vintage'] = df.loc[:,'orig_time']
    df.loc[df['vintage'] < 0, 'vintage'] = 0
    df.loc[df['vintage'] >= 30, 'vintage'] = 30
    df.loc[:,'vintage2'] = df.loc[:,'vintage']**2
    
    df.loc[:,'state_orig_time'] = pd.Categorical(df.state_orig_time, ordered=False)

    if depvar=='default_time':
        df2 = df

        # exclude the listed states
        excluded_states = ['AL', 'AK', 'AR', 'ND', 'SD', 'MT', 'DE', 'WV', 'VT',
                           'ME', 'NE', 'NH', 'MS', 'VI', 'DC', 'PR', 'nan']
        df2 = df2.loc[~df2['state_orig_time'].isin(excluded_states), :].copy()
        
        # Splitting
        data_train = df2.loc[df2[splitvar] < threshold+1,:].copy()
        data_test = df2.loc[df2[splitvar] > threshold,:].copy()

        # PCA
        defaultrates_states_train = data_train.groupby(['time', 'state_orig_time'])['default_time'].mean().unstack(level=1).add_prefix('defaultrate_').fillna(0).reset_index(drop=False)
        defaultrates_states = df2.groupby(['time', 'state_orig_time'])['default_time'].mean().unstack(level=1).add_prefix('defaultrate_').fillna(0).reset_index(drop=False)
        
        scaler = StandardScaler().fit(defaultrates_states_train)
        defaultrates_states_train1 = scaler.transform(defaultrates_states_train)
        defaultrates_states1 = scaler.transform(defaultrates_states)

        pca = PCA()
        pca.fit(defaultrates_states_train1)  
        z_train = pca.transform(defaultrates_states_train1)
        z = pca.transform(defaultrates_states1)
        z_train = z_train[:,0:5]
        z = z[:,0:5]

        Z_train = pd.DataFrame(data=z_train, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5'])
        Z = pd.DataFrame(data=z, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5'])

        Z_train_1 = Z_train.shift(1).add_suffix('_1')
        Z_1 = Z.shift(1).add_suffix('_1')

        defaultrates_states_train2 = pd.concat([defaultrates_states_train['time'], Z_train_1], axis=1).dropna(subset=['PCA1_1']).copy()
        defaultrates_states2 = pd.concat([defaultrates_states['time'], Z_1], axis=1).dropna(subset=['PCA1_1']).copy() 

        data_train = pd.merge(data_train, defaultrates_states_train2, on='time')
        df3 = pd.merge(df2, defaultrates_states2, on='time')

        data_test = df3.loc[df3[splitvar] > threshold,:].copy()
        
        # Scaling
        X_train = data_train[['cep_time', 'equity_time', 'interest_rate_time', 'FICO_orig_time',  'gdp_time', 'PCA1_1','PCA2_1', 'PCA3_1','PCA4_1','PCA5_1']].dropna()
        X_test = data_test[['cep_time', 'equity_time', 'interest_rate_time', 'FICO_orig_time',  'gdp_time',  'PCA1_1','PCA2_1', 'PCA3_1','PCA4_1','PCA5_1']].dropna()
        
        
        scaler = StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        y_train = data_train['default_time'].values.reshape(-1,)
        y_test = data_test['default_time'].values.reshape(-1,)

        # Clustering
        n_clusters = 2
        kmeans = KMeans(n_clusters=n_clusters, random_state=2, verbose=0)
        kmeans.fit(X_train_scaled)

        clusters_train = kmeans.predict(X_train_scaled)
        clusters_test = kmeans.predict(X_test_scaled)

        dummies_train = pd.get_dummies(clusters_train, drop_first=True, prefix='cluster')
        dummies_test = pd.get_dummies(clusters_test, drop_first=True, prefix='cluster')
        
        X_train_scaled = np.append(X_train_scaled, dummies_train, axis=1)
        X_test_scaled = np.append(X_test_scaled, dummies_test, axis=1)

        dummies = pd.concat([dummies_train, dummies_test], axis=0, ignore_index=True)
        dummies = dummies.reindex(df3.index)  # align with df3 before concatenating (assumed; 'data' is not defined in this scope)

        df3 = pd.concat([df3, dummies], axis=1).dropna(subset=['id'])
        data_train = pd.concat([data_train, dummies_train], axis=1)
        dummies_test = dummies_test.reindex(data_test.index)
        data_test  = pd.concat([data_test,  dummies_test],  axis=1)
        
    if depvar=='lgd_time':
        
        # LGD dataprep
        df2 = df.query('default_time == 1').copy()    
        df3 = resolutionbias(df2,'lgd_time','res_time','time')
        
        df3.loc[df3['lgd_time'] <= 0, 'lgd_time'] = 0.0001
        df3.loc[df3['lgd_time'] >= 1, 'lgd_time'] = 0.9999

        # Splitting
        data_train = df3.loc[df3[splitvar] < threshold+1,:].copy()
        data_test = df3.loc[df3[splitvar] > threshold,:].copy()
        
        X_train = data_train[['cep_time', 'equity_time', 'interest_rate_time', 'FICO_orig_time', 'REtype_CO_orig_time', 'REtype_PU_orig_time', 'gdp_time']]
        X_test = data_test[['cep_time', 'equity_time', 'interest_rate_time', 'FICO_orig_time', 'REtype_CO_orig_time', 'REtype_PU_orig_time', 'gdp_time']]
    
        y_train = data_train['lgd_time'].values.reshape(-1,)
        y_test = data_test['lgd_time'].values.reshape(-1,)
        
        # Scaling
        scaler = StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        dummies_train = pd.get_dummies(data_train.state_orig_time, drop_first=True, prefix='state_orig_time')
        dummies_test = pd.get_dummies(data_test.state_orig_time, drop_first=True, prefix='state_orig_time')

        X_train_scaled = np.append(X_train_scaled, dummies_train, axis=1)
        X_test_scaled = np.append(X_test_scaled, dummies_test, axis=1)     
    
    return df3, data_train, data_test, X_train_scaled, X_test_scaled, y_train, y_test
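# A hypothetical call of the data-preparation routine above (a sketch only: the
# function name, argument order, and the splitvar/threshold values are assumptions,
# not taken from the original code).
df3, data_train, data_test, X_train_scaled, X_test_scaled, y_train, y_test = \
    prepare_data(df, depvar='default_time', splitvar='time', threshold=40)
print(X_train_scaled.shape, X_test_scaled.shape)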
df = pd.DataFrame({'labels': labels, 'companies': companies})

print(df.sort_values('labels'))

# In[81]:

# PCA Analysis using Singular value decomposition
from sklearn.decomposition import PCA

reduced_data = PCA(n_components=2).fit_transform(new)

#running K-Means on reduced data

kmeans = KMeans(n_clusters=10, max_iter=1000)
kmeans.fit(reduced_data)
labels = kmeans.predict(reduced_data)

df = pd.DataFrame({'labels': labels, 'companies': companies})

print(kmeans.inertia_)
print(df.sort_values('labels'))

# In[97]:

h = 0.01

# plotting the decision boundary: build a mesh grid covering the reduced data

x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
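# The snippet stops after building the mesh grid. A minimal sketch of the plotting
# step it presumably leads to (assumed, in the style of the scikit-learn decision
# boundary example):
import matplotlib.pyplot as plt

Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.figure()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired, aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            marker='x', s=169, linewidths=3, color='w')
plt.title('K-means clustering on the PCA-reduced data')
plt.show()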
finance_features = finance_features  + [200000.,1000000.]


from sklearn.cluster import KMeans
import numpy as np

kmeans = KMeans(n_clusters=2, random_state=0).fit(X_train_std)
kmeans.labels_




label = kmeans.labels_

pred = kmeans.predict(X_train_std)

predict1 = kmeans.predict([[200000., 1000000.]])  # predict() expects a 2-D array of samples

print(predict1)

centers = kmeans.cluster_centers_




### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
except NameError:
    print("no predictions object named pred found, no clusters to plot")
Beispiel #54
0
df = pd.DataFrame({
    'x': [
        12, 20, 28, 18, 29, 33, 24, 45, 45, 52, 51, 52, 55, 53, 55, 61, 64, 69,
        72
    ],
    'y':
    [39, 36, 30, 52, 54, 46, 55, 59, 63, 70, 66, 63, 58, 23, 14, 8, 19, 7, 24]
})

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
kmeans.fit(df)

labels = kmeans.predict(df)
centroids = kmeans.cluster_centers_

fig = plt.figure(figsize=(5, 5))

colmap = {1: 'r', 2: 'g', 3: 'b'}  # cluster -> color mapping (assumed; colmap is not defined in this snippet)
colors = [colmap[x + 1] for x in labels]  # a list, not a map object, so matplotlib accepts it

plt.scatter(df['x'], df['y'], color=colors, alpha=0.5, edgecolor='k')
for idx, centroid in enumerate(centroids):
    plt.scatter(*centroid, color=colmap[idx + 1])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()
# kmeans
plt.figure()


for k in range(3,20,1):
    clf = KMeans(n_clusters=k)
    s = clf.fit(x_train)

    print(s) 
#    print clf.cluster_centers_
#    print clf.labels_
    k_labels=clf.labels_
    print (clf.inertia_)  
    k_inertia=clf.inertia_
    # print clf.predict(x_test)  
    k_pred=clf.predict(x_test)
    plt.plot(k,k_inertia,c='g',marker='x')

plt.show()
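# An alternative sketch (not in the original): collect the inertias first and plot
# them as a single connected curve, which makes the elbow easier to read than the
# isolated markers drawn inside the loop above.
ks = list(range(3, 20))
inertias = [KMeans(n_clusters=k).fit(x_train).inertia_ for k in ks]
plt.figure()
plt.plot(ks, inertias, 'gx-')
plt.xlabel('number of clusters k')
plt.ylabel('inertia')
plt.show()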


for k in range(3,9):
    clf = KMeans(n_clusters=k) 
    s = clf.fit(x_train) 
    numSamples = len(x_train)
    cluster_labels = clf.labels_  # per-sample cluster assignments
    # print(cluster_labels, type(cluster_labels))
    print (clf.inertia_) 
    # k_inertia = clf.inertia_
    # k_pred = clf.predict(x_test)
    # plt.plot(k, k_inertia, c='g', marker='x')
Beispiel #56
0
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import mglearn
import numpy as np
##  Super basic k-means clusters
X, y = make_blobs(random_state=1)

kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

print('Cluster Membership:\n{}'.format(kmeans.labels_))
print(kmeans.predict(X))

mglearn.discrete_scatter(X[:, 0], X[:, 1], kmeans.labels_, markers='o')
mglearn.discrete_scatter(kmeans.cluster_centers_[:, 0],
                         kmeans.cluster_centers_[:, 1], [0, 1, 2],
                         markers='^',
                         markeredgewidth=2)
##  Changing number of categories to show lack of apriori meaning
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
assignments = kmeans.labels_
mglearn.discrete_scatter(X[:, 0], X[:, 1], assignments, ax=axes[0])

kmeans = KMeans(n_clusters=5)
kmeans.fit(X)
assignments = kmeans.labels_
mglearn.discrete_scatter(X[:, 0], X[:, 1], assignments, ax=axes[1])
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs, load_digits

sns.set()
'''
    K-Means
'''

# generate random sample points
X, y = make_blobs(n_samples=300, centers=4, random_state=0, cluster_std=0.60)
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.show()

# cluster the sample data above with 4 centers
est = KMeans(4)
est.fit(X)
y_kmeans = est.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='rainbow')
plt.show()
'''
    Handwritten-digit application
'''
# load the data
digits = load_digits()

# fit the clustering model
est = KMeans(n_clusters=10)
clusters = est.fit_predict(digits.data)
print(est.cluster_centers_.shape)  # (10, 64)

# display the 10 cluster-center "digits"
fig = plt.figure(figsize=(8, 3))
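# The display code is missing from the snippet; a sketch of the usual approach
# (assumed): reshape each of the 10 cluster centers back to 8x8 pixels and draw it.
for i, center in enumerate(est.cluster_centers_):
    ax = fig.add_subplot(2, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(center.reshape(8, 8), cmap=plt.cm.binary)
plt.show()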
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

Deaths_df = pd.read_csv(
    r"C:\Users\NehaS\Desktop\CS418\MEASURESOFBIRTHANDDEATH.csv")
Deaths_IL = Deaths_df[Deaths_df['CHSI_State_Name'] == 'Illinois']
Death_Infant = Deaths_IL[[
    'CHSI_County_Name', 'LBW', 'VLBW', 'Premature', 'Under_18', 'Over_40',
    'Unmarried', 'Late_Care', 'Infant_Mortality'
]]
Death_Infant = Death_Infant.replace(-1111.1, np.NaN)

Death_Infant = Death_Infant[Death_Infant['Infant_Mortality'].notnull()]

X = np.array(Death_Infant[['Infant_Mortality', 'LBW', 'VLBW']])
model = KMeans(n_clusters=3, random_state=1)
model.fit(X)
pred = model.predict(X)
levels = ['2', '0', '1']
pred_val = [levels[x] for x in pred]

Death_Infant['Death_Level'] = pred_val
X1 = Death_Infant[[
    'LBW', 'VLBW', 'Premature', 'Under_18', 'Over_40', 'Unmarried', 'Late_Care'
]].to_numpy()
y = Death_Infant['Death_Level'].to_numpy()
X_opt = X1[:, [0, 1]]
OLS_res = sm.OLS(endog=Death_Infant['Infant_Mortality'], exog=X_opt).fit()
#print(OLS_res.summary())

X2 = Death_Infant[['LBW', 'VLBW']].to_numpy()
y2 = Death_Infant['Death_Level'].to_numpy()
X2_train, X2_test, y2_train, y2_test = train_test_split(X1, y2, test_size=0.3, random_state=1)  # arguments after X1 are assumed; the snippet is cut off here
#Print the output (describe)
print(clustervar.describe())

clus_train, clus_test = train_test_split(clustervar,
                                         test_size=.3,
                                         random_state=123)

# Run the k-means cluster analysis for k = 1 through 10 and record the mean distance to the nearest centroid
from scipy.spatial.distance import cdist
clusters = range(1, 11)
meandist = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    clusassign = model.predict(clus_train)
    meandist.append(
        sum(
            np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'),
                   axis=1)) / clus_train.shape[0])

## Plot the average distance from each observation to its cluster centroid and use the Elbow
## method to pick the number of clusters. The assignment specified 4 clusters, and the elbow
## in the curve supports that choice as well.

plt.figure()
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
plt.show()
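# Follow-up sketch (not part of the original snippet): refit with the k suggested by
# the elbow (4 here) and look at how the training observations are distributed.
model4 = KMeans(n_clusters=4)
assignments = model4.fit_predict(clus_train)
print(np.bincount(assignments))  # number of observations per cluster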
Beispiel #60
0
# min-max transformation
maxX, maxY = df.max()
minX, minY = df.min()

pointsTransformed = []
for x, y in points:
    pointsTransformed.append([(x - minX) / (maxX - minX),
                              (y - minY) / (maxY - minY)])
dfTransformed = pd.DataFrame(pointsTransformed, columns=["x", "y"])
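# The same min-max scaling can be done with scikit-learn (alternative sketch, not
# part of the original snippet; assumes df holds only the raw x/y columns).
from sklearn.preprocessing import MinMaxScaler

dfTransformed = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=["x", "y"])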

from sklearn.cluster import KMeans
# the cluster centers printed in the loop below look reasonable
print("Centers")

import matplotlib.pyplot as plt
dfTransformed.plot(x="x", y="y", kind="scatter")
plt.savefig("transformedData.png")

for n in [5, 10, 20, 50]:
    dfTransformed = pd.DataFrame(pointsTransformed, columns=["x", "y"])
    kmeans = KMeans(n_clusters=n).fit(dfTransformed)
    print(kmeans.cluster_centers_)
    dfTransformed['cluster'] = kmeans.predict(dfTransformed)
    dfTransformed.plot(x="x",
                       y="y",
                       c="cluster",
                       kind="scatter",
                       colormap='summer')
    plt.savefig("withClusters-" + str(n) + ".png")