Example #1
class NanGroupedModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimator):
        self.estimator = estimator
        self.cluster = KMeans(n_clusters=2)
        self.models = {}

    def fit(self, X, y, **kwargs):
        nans = X.isnull()
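        # Cluster rows by their missing-value pattern and fit one estimator per group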
        clusters = self.cluster.fit_transform(nans).argmin(axis=1)
        for name in np.unique(clusters):
            filt = clusters == name
            self.models[name] = clone(self.estimator).fit(
                X[filt], y[filt], **kwargs)
        return self

    def predict(self, X, **kwargs):
        preds = np.zeros(X.shape[0])
        nans = X.isnull()
        clusters = self.cluster.transform(nans).argmin(axis=1)
        for name in np.unique(clusters):
            filt = clusters == name
            preds[filt] = self.models[name].predict(X[filt], **kwargs)
        return preds

    def predict_proba(self, X, **kwargs):
        preds = None
        nans = X.isnull()
        clusters = self.cluster.transform(nans).argmin(axis=1)
        for name in np.unique(clusters):
            filt = clusters == name
            proba = self.models[name].predict_proba(X[filt], **kwargs)
            if preds is None:
                preds = np.zeros((X.shape[0], proba.shape[1]))
            preds[filt] = proba
        return preds
Example #2
    def train_and_test(self, training_data, test_data, break_point):
        if not self.number_of_clusters:
            self.number_of_clusters = K_Means.find_optimal_K(
                training_data,
                min_number_of_clusters=2,
                max_number_of_clusters=11)

        # Apply KMeans to the training data
        kmeans = KMeans(n_clusters=self.number_of_clusters,
                        random_state=0).fit(training_data)

        # self.plot_clusters(kmeans, training_data)

        # Find the smallest center distance for each training sample
        train_dist = kmeans.transform(training_data)
        train_min_dist = np.zeros(len(training_data))
        for i in range(len(train_dist)):
            train_min_dist[i] = min(train_dist[i])
        train_min_dist.sort()

        # Find the smallest center distance for each test sample
        test_dist = kmeans.transform(test_data)
        test_min_dist = np.zeros(len(test_dist))
        for i in range(len(test_dist)):
            test_min_dist[i] = min(test_dist[i])

        self.DIs = test_min_dist
        self._train_DIs = train_min_dist
        self._data_break_point = break_point

        self._set_resulting_parameters()
def fit_prompt_type_model(model, n_types, random_state=None, max_dist=0.9, verbosity=0):
	""" 
	Standalone function that fits a prompt type model given paired prompt and response inputs. See docstring of the `PromptTypes` class for details.

	:param model: prompt embedding model (from `fit_prompt_embedding_model()`)
	:param n_types: number of prompt types to infer
	:return: prompt type model
	"""

	if verbosity > 0:
		print('fitting %d prompt types' % n_types)
	km = KMeans(n_clusters=n_types, random_state=random_state)
	km.fit(model['U_prompt'])
	prompt_dists = km.transform(model['U_prompt'])
	prompt_clusters = km.predict(model['U_prompt'])
	prompt_clusters[prompt_dists.min(axis=1) >= max_dist] = -1
	reference_dists = km.transform(model['U_reference'])
	reference_clusters = km.predict(model['U_reference'])
	reference_clusters[reference_dists.min(axis=1) >= max_dist] = -1
	
	prompt_df = pd.DataFrame(index=model['prompt_tfidf_model'].get_feature_names(),
						  data=np.hstack([prompt_dists, prompt_clusters[:,np.newaxis]]),
						  columns=list(range(n_types)) + ['type_id'])
	reference_df = pd.DataFrame(index=model['reference_tfidf_model'].get_feature_names(),
						  data=np.hstack([reference_dists, reference_clusters[:,np.newaxis]]),
						  columns=list(range(n_types)) + ['type_id'])
	return {'km_model': km, 
		   'prompt_df': prompt_df, 'reference_df': reference_df}
Example #4
	def gapstat(self, ref_size=10, max_iter=300, n_init=3):


		Wkestrand = np.zeros(len(self.range))
		Wk = np.zeros(len(self.range))
		sk = np.zeros(len(self.range))
		
		sample = self.randomData(ref_size)
		
		

		for indk, k in enumerate(self.range):
			km = KMeans(n_clusters=k, init='k-means++', max_iter=max_iter, n_init=n_init)
			Wkrand = []
			for i in range(ref_size):
				km.fit(sample[i])
				SS = km.transform(sample[i])
				Wkrand.append((self.intraDist(km.labels_.tolist(), k, km.cluster_centers_)))

			Wkestrand[indk] = (1/ref_size)*sum(Wkrand)

			km.fit(self.X)
			XX = km.transform(self.X)
			clusters = km.labels_.tolist()
			Wk[indk] = self.intraDist(clusters, k, km.cluster_centers_)
			sk[indk] = np.sqrt((1/ref_size)*sum([(Wkrand[i]-Wkestrand[indk])**2 for i in range(ref_size)]))

		sk *= np.sqrt(1+1/ref_size)

		Gapk = [(1/ref_size)*Wkestrand[i]-Wk[i] for i in range(len(self.range))]


		#return min([k for k, j in enumerate([Gapk[g]-Gapk[g+1]+sk[g+1] for g in self.range[:,-1]]) if j>0 ])
		return [(k, Gapk[j], Gapk[j]-Gapk[j+1]+sk[j+1])for j, k in enumerate(self.range[:-1])]
    def transform(self, X):
        """
        Computes the predictions.

        @param      X       features.
        @return             prediction
        """
        if self.weights_ is None:
            if self.balanced_predictions:
                labels, distances, __ = constraint_predictions(
                    X, self.cluster_centers_, strategy=self.strategy)
                # Due to the constraint, a row may have distances smaller than
                # the one to its chosen cluster; we overwrite those with max*2.
                mx = distances.max() * 2
                for i, l in enumerate(labels):
                    mi = distances[i, l]
                    mmi = distances[i, :].min()
                    if mi > mmi:
                        # numpy.nan would be best
                        distances[i, distances[i, :] < mi] = mx
                return distances
            return KMeans.transform(self, X)
        else:
            if self.balanced_predictions:
                raise RuntimeError(  # pragma: no cover
                    "balanced_predictions and weights_ cannot be used together."
                )
            res = KMeans.transform(self, X)
            res *= self.weights_.reshape((1, -1))
            return res
Example #6
    def __cluster_items(item_embed):
        # index = np.arange(len(item_embed))
        # data = item_embed[index]
        kmeans = KMeans(n_clusters=2, random_state=2020).fit(item_embed)
        labels = kmeans.labels_
        left_index = np.where(labels == 0)[0]
        right_index = np.where(labels == 1)[0]
        # left_index = index[l_i]
        # right_index = index[r_i]
        if len(right_index) - len(left_index) > 1:
            distances = kmeans.transform(item_embed[right_index])[:, 1]
            rank = np.argsort(distances)[::-1]
            idx = np.concatenate((left_index, right_index[rank]))
            mid = len(idx) // 2
            left_index = idx[:mid]
            right_index = idx[mid:]

            # left_index, right_index = Tree.rebalance(
            #     left_index, right_index, distances[:, 1])
        elif len(left_index) - len(right_index) > 1:
            distances = kmeans.transform(item_embed[left_index])[:, 0]
            rank = np.argsort(distances)
            idx = np.concatenate((left_index[rank], right_index))
            mid = len(idx) // 2
            left_index = idx[:mid]
            right_index = idx[mid:]
            # left_index, right_index = Tree.rebalance(
            #     right_index, left_index, distances[:, 0])

        return left_index, right_index, kmeans.cluster_centers_[
            0], kmeans.cluster_centers_[1]
Example #7
def get_mmd(perc_value, y_scores, train_1, test_1, y_true, train_ind,
            test_path, bandwidth):

    abn_idx = np.where(y_scores < np.percentile(y_scores, perc_value))
    abn_tst_latent = test_1[abn_idx]
    kmeans = KMeans(n_clusters=1, random_state=0).fit(abn_tst_latent)
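    # Append each sample's distance to the abnormal-cluster centroid as an
    # extra feature for the one-class SVM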
    train_1_prime = np.concatenate((train_1, kmeans.transform(train_1)),
                                   axis=1)
    test_1_prime = np.concatenate((test_1, kmeans.transform(test_1)), axis=1)
    cf = svm.OneClassSVM(gamma='scale', nu=0.1)
    cf.fit(train_1_prime[train_ind, :])
    y_scores_tmp_grid = cf.score_samples(test_1_prime)
    y_scores_tmp_grid = (y_scores_tmp_grid - min(y_scores_tmp_grid)) / (
        max(y_scores_tmp_grid) - min(y_scores_tmp_grid))
    auroc = metrics.roc_auc_score(y_true, y_scores_tmp_grid)
    auprc = metrics.average_precision_score(y_true, y_scores_tmp_grid)
    np.save(test_path + '/svm_aucroc1_grid_' + str(perc_value) + '.npy', auroc)
    np.save(test_path + '/svm_aucprc1_grid_' + str(perc_value) + '.npy', auprc)
    abn_idx_left = np.where(y_scores < np.percentile(y_scores, 5))
    abn_idx_right = np.where(y_scores >= np.percentile(y_scores, 80))
    abn_idx_current = np.where(
        (y_scores >= np.percentile(y_scores, perc_value))
        & (y_scores < np.percentile(y_scores, perc_value + 5)))
    mmd_ind_left = np.random.choice(np.squeeze(np.array(abn_idx_left)),
                                    500,
                                    replace=False)
    X_mmd_left = test_1[mmd_ind_left]
    np.save(
        test_path + '/y_true_x_mmd_left_ind_grid_' + str(perc_value) + '.npy',
        y_true[mmd_ind_left])
    mmd_ind_right = np.random.choice(np.squeeze(np.array(abn_idx_right)),
                                     500,
                                     replace=False)
    X_mmd_right = test_1[mmd_ind_right]
    np.save(
        test_path + '/y_true_x_mmd_right_ind_grid_' + str(perc_value) + '.npy',
        y_true[mmd_ind_right])
    mmd_ind_current = np.random.choice(np.squeeze(np.array(abn_idx_current)),
                                       500,
                                       replace=False)
    X_mmd_current = test_1[mmd_ind_current]
    np.save(
        test_path + '/y_true_x_mmd_current_ind_grid_' + str(perc_value) +
        '.npy', y_true[mmd_ind_current])
    mmd_output_left = rbf_mmd2(X_mmd_left,
                               X_mmd_current,
                               sigma=bandwidth,
                               biased=True)
    mmd_output_right = rbf_mmd2(X_mmd_right,
                                X_mmd_current,
                                sigma=bandwidth,
                                biased=True)

    np.save(test_path + '/mmd_grid_left_' + str(perc_value) + '.npy',
            mmd_output_left)
    np.save(test_path + '/mmd_grid_right_' + str(perc_value) + '.npy',
            mmd_output_right)

    return (mmd_output_left, mmd_output_right)
Example #8
def getCluster(train_df,test_df,k):
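    # Fit KMeans on the combined train+test coordinates, then add the predicted
    # cluster id and the distance to the nearest center as new features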
    train_test = pd.concat([train_df.drop('interest_level',axis=1),test_df])
    #processMap(train_test)
    cluster = KMeans(k,random_state = 2333)
    cluster.fit(train_test[['latitude', 'longitude']].dropna())
    train_df['cluster_id_'+str(k)]=cluster.predict(train_df[['latitude', 'longitude']].fillna(-1))
    test_df['cluster_id_'+str(k)]=cluster.predict(test_df[['latitude', 'longitude']].fillna(-1))
    train_df['cluster_id_'+str(k)+'_d']=np.amin(cluster.transform(train_df[['latitude', 'longitude']]),axis=1)
    test_df['cluster_id_'+str(k)+'_d']=np.amin(cluster.transform(test_df[['latitude', 'longitude']]),axis=1)
    def best_lda_cluster_spam(self):
        dh = data_helper()
        X_train, X_test, y_train, y_test = dh.get_spam_data_lda_best()

        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)
        X_test_scl = scl.transform(X_test)

        ##
        ## K-Means
        ##
        km = KMeans(n_clusters=4, algorithm='full')
        X_train_transformed = km.fit_transform(X_train_scl)
        X_test_transformed = km.transform(X_test_scl)

        # save
        filename = './' + self.save_dir + '/spam_kmeans_lda_x_train.txt'
        pd.DataFrame(X_train_transformed).to_csv(filename,
                                                 header=False,
                                                 index=False)

        filename = './' + self.save_dir + '/spam_kmeans_lda_x_test.txt'
        pd.DataFrame(X_test_transformed).to_csv(filename,
                                                header=False,
                                                index=False)

        filename = './' + self.save_dir + '/spam_kmeans_lda_y_train.txt'
        pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

        filename = './' + self.save_dir + '/spam_kmeans_lda_y_test.txt'
        pd.DataFrame(y_test).to_csv(filename, header=False, index=False)

        ##
        ## GMM
        ##
        gmm = GaussianMixture(n_components=4, covariance_type='full')
        gmm.fit(X_train_scl)
        # GaussianMixture has no transform(); use posterior probabilities
        # instead of re-using the K-Means distances here
        X_train_transformed = gmm.predict_proba(X_train_scl)
        X_test_transformed = gmm.predict_proba(X_test_scl)

        # save
        filename = './' + self.save_dir + '/spam_gmm_lda_x_train.txt'
        pd.DataFrame(X_train_transformed).to_csv(filename,
                                                 header=False,
                                                 index=False)

        filename = './' + self.save_dir + '/spam_gmm_lda_x_test.txt'
        pd.DataFrame(X_test_transformed).to_csv(filename,
                                                header=False,
                                                index=False)

        filename = './' + self.save_dir + '/spam_gmm_lda_y_train.txt'
        pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

        filename = './' + self.save_dir + '/spam_gmm_lda_y_test.txt'
        pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #10
def cluster_driver(a_driver):
    
#    print a_driver['DStats']
#    print "#############################DStats Above#################################ValueError: zero-size array to reduction operation minimum which has no identity#################"

#    sys.stdout = open('a_projpath' +'output.txt','w')
#    print a_driver['DStats']
    
    X = StandardScaler().fit_transform(a_driver['DStats'])
    

    
#    print X
#    print "DStats are.....::" , a_driver['DStats']
#    print "X is...........::" ,['AvgDistDel', 'AvgACosDel', 'SDevDistDel', 'SDevACosDel','TotalTime','SkewDistDel','SkewACosDel'] X
#    print "############################Scaled X Above###################################################"
    
    pca = PCA(n_components=5)
    Xpca = pca.fit(X).transform(X)
    
    if plotflag == True:
        
        fig = scatterplot_matrix(np.transpose(Xpca)
                                                , ['PC1'
                                                , 'PC2'
                                                , 'PC3'
                                                , 'PC4'
#                                                ,'PC5'
                                                ]
                                                ,linestyle='none', marker='o', color='black', mfc='none')
        fig.suptitle('Simple Scatterplot Matrix')
        plt.show()
        

    db = KMeans(n_clusters=1,n_jobs = -1).fit(Xpca)
    
#    db = DBSCAN(eps=0.5).fit(Xpca)
    
#    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
#    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    
#    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print "###############################################################################"
#    print('Estimated number of clusters: %d' % n_clusters_)
#    print 'Count of Predicts::', len(X)
#    print("Silhouette Coefficient: %0.3f"    % metrics.silhouette_score(Xpca, labels))
  
    
    print "% Variance Explaned: %0.3f" , sum(pca.explained_variance_ratio_)
#    print "##############################DBSCAN  X Below#################################################"
#    print X    G:/Continuing Education/Research & Presentations/Self - Machine Learning/Kaggle/DriverTelemetricAnalysis-AXA/'
#    try:
    
    return (1- (db.transform(Xpca)/max(db.transform(Xpca))))
Example #11
def AnalyzeGraphs(filename):
    """
    Try to generate a graph adjacency matrix using the edges in each path. Then
    use the matrix to obtain its eigenvectors and calculate their sum vector as
    an encoding of the graph. Fill an array with the resulting vectors.
    """

    pd_database = pd.read_csv(filename)
    nodes_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

    eigenvectors = []

    for path in pd_database['Steps']:
        path = ast.literal_eval(path)

        eig_vec = GetVectorID(path, nodes_list)
        eig_vec_flat = np.array(eig_vec).flatten()
        #if len(eig_vec) < 10:
        #print(eig_vec_flat)

        eigenvectors.append(eig_vec_flat)

    print('*** Clustering %d graphs by K-Means ***' % len(eigenvectors))

    np_eigenvectors = np.array(eigenvectors)
    #print(np_eigenvectors.shape())
    #print('Size:', np_eigenvectors.size())
    kmeans_2 = KMeans(n_clusters=2, random_state=0).fit(np_eigenvectors)
    kmeans_3 = KMeans(n_clusters=3, random_state=0).fit(np_eigenvectors)
    kmeans_4 = KMeans(n_clusters=4, random_state=0).fit(np_eigenvectors)
    kmeans_5 = KMeans(n_clusters=5, random_state=0).fit(np_eigenvectors)

    data_new_2 = kmeans_2.transform(np_eigenvectors)
    data_new_3 = kmeans_3.transform(np_eigenvectors)
    data_new_4 = kmeans_4.transform(np_eigenvectors)
    data_new_5 = kmeans_5.transform(np_eigenvectors)

    with open('k-means2_out.txt', 'w') as outfile2:
        np.savetxt(outfile2, data_new_2, fmt='%4.1f')
    with open('k-means3_out.txt', 'w') as outfile3:
        np.savetxt(outfile3, data_new_3, fmt='%4.1f')
    with open('k-means4_out.txt', 'w') as outfile4:
        np.savetxt(outfile4, data_new_4, fmt='%4.1f')
    with open('k-means5_out.txt', 'w') as outfile5:
        np.savetxt(outfile5, data_new_5, fmt='%4.1f')


    PlotDistanceAngle(np_eigenvectors, kmeans_2.cluster_centers_, data_new_2)
Example #12
def preprocess():
    train = pd.read_csv(BASE_PATH + "train.csv").drop(
        ['id'], axis=1)  # ["description"]
    test = pd.read_csv(BASE_PATH + "test.csv").drop(["id"],
                                                    axis=1)  # ["description"]

    sentences = pd.concat([train["description"], test["description"]])

    tokenizer = Tokenizer(
        num_words=2000,
        lower=True,
    )  # use only the top `num_words` most frequent words
    tokenizer.fit_on_texts(sentences)

    train_X, test_X = np.split(tokenizer.texts_to_matrix(sentences,
                                                         mode='binary'),
                               [len(train)],
                               axis=0)

    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        analyzer="char",
        stop_words="english",
        ngram_range=(2, 6),
        max_features=1000,
    )
    word_vectorizer.fit(sentences)
    # print(train_X)
    # print((word_vectorizer.transform(train["description"])).toarray())

    train_X = np.concatenate(
        [train_X,
         (word_vectorizer.transform(train["description"])).toarray()], 1)
    test_X = np.concatenate(
        [test_X,
         (word_vectorizer.transform(test["description"])).toarray()], 1)

    text_svd = TruncatedSVD(n_components=100,
                            algorithm="arpack",
                            random_state=1234)
    text_svd.fit(train_X)
    train_X = text_svd.transform(train_X)
    test_X = text_svd.transform(test_X)

    kmeans = KMeans(n_clusters=100,
                    random_state=10).fit(np.concatenate([train_X, test_X]))
    train_X = np.concatenate([train_X, (kmeans.transform(train_X))], 1)
    test_X = np.concatenate([test_X, (kmeans.transform(test_X))], 1)

    train_y = train['jobflag'].values - 1  # maps {1, 2, 3 ,4} -> {0, 1, 2, 3}
    return train_X, train_y, test_X
Example #13
def cluster_feature_nn(df, X, y, clusters=2):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        shuffle=True)
    kmeans_model = KMeans(n_clusters=clusters, random_state=100).fit(X_train)
    features_train = kmeans_model.transform(X_train)
    features_test = kmeans_model.transform(X_test)
    nn_model = MLPClassifier(hidden_layer_sizes=(20, 20),
                             activation='relu',
                             max_iter=700)
    nn_model.fit(features_train, y_train)
    train_score = nn_model.score(features_train, y_train)
    test_score = nn_model.score(features_test, y_test)
    return train_score, test_score
Example #14
 def best_lda_cluster_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## K-Means
     ##
     km = KMeans(n_clusters=4, algorithm='full')
     X_train_transformed = km.fit_transform(X_train_scl)
     X_test_transformed = km.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
     
     ##
     ## GMM
     ##
     gmm = GaussianMixture(n_components=4, covariance_type='full')
     gmm.fit(X_train_scl)
     # GaussianMixture has no transform(); use posterior probabilities
     # instead of re-using the K-Means distances here
     X_train_transformed = gmm.predict_proba(X_train_scl)
     X_test_transformed = gmm.predict_proba(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #15
def kmeans(data, model_id, x_col, n_clusters):

    # |Create model, fit data, and return prediction of cluster for each row
    model = KMeans(n_clusters)
    model.fit(data.x)

    # |Add distance to each cluster for each row to summary data
    headers = []
    for i in range(n_clusters):
        headers.append('dist_%s' % str(i))
    dist = pd.DataFrame(model.transform(data.x), columns=headers)
    data.current_df = data.current_df.join(dist)

    data.df['kmeans']['data'] = data.df['kmeans']['data'].append(data.current_df, ignore_index=True)

    # |Create DataFrame with each cluster and the mean value for each input column
    df = pd.DataFrame()
    for i in range(n_clusters):
        clus = {'cluster':i}
        for j in range(len(x_col)):
            clus['%s_mean' % x_col[j]] = model.cluster_centers_[i][j]
        df = df.append(clus, ignore_index=True)
    df['model_id'] = model_id
    data.df['kmeans']['clusters'] = data.df['kmeans']['clusters'].append(df, ignore_index=True)

    return data, model
class KMeansClustering(Transform):
    """KMeans clustering

    Uses the KMeans implementation of sklearn.
    """
    base_name = "kmeans"

    def __init__(self, config):
        """Kmeans-clustering constructor"""
        Transform.__init__(self, config)
        self.transformer = KMeans(self.dimension)
        self.process_func_train = self.fit
        self.process_func_test = self.do_transform

    def fit(self, data):
        self.transformer = self.transformer.fit(data)
        return self.do_transform(data)

    def do_transform(self, data):
        res = self.transformer.transform(data)
        return res

    def get_term_representations(self):
        """Return term-based, rather than document-based representations
        """
        return self.transformer.cluster_centers_
Example #17
def components(K):
    Sum_of_squared_distances = []
    k = []
    accuracy_train = []
    accuracy_test = []
    score = []
    for i in range(1, K):
        print(i)
        agglo = KMeans(n_clusters=i)
        #X_new_train,y_new_train=transformer.fit(X_train,y_train)
        #X_new_test,y_new_test = transformer.transform(X_test,y_test)
        agglo.fit(X)
        X_reduced = agglo.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_reduced,
                                                            y,
                                                            test_size=0.20)
        km = MLPClassifier(solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=[8, 8, 8, 8, 8],
                           random_state=1)
        km.fit(X_train, y_train)
        #transformer1 = GaussianRandomProjection(n_components=i,eps=0.5)
        #transformer2 = GaussianRandomProjection(n_compo
        label_train = km.predict(X_train)
        label_test = km.predict(X_test)
        accu_train = km.score(X_train, y_train)
        accu_test = km.score(X_test, y_test)
        #score_train1=metrics.silhouette_score(X_new,label, metric='euclidean')
        #Sum_of_squared_distances.append(km.inenents=i,eps=0.6)
        #label=transformer.predicn)rtia_)
        k.append(i)
        accuracy_train.append(accu_train)
        accuracy_test.append(accu_test)
        #score.append(score_train1)
        #print(accuracy)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    accuracy_train = np.array(accuracy_train)
    accuracy_test = np.asarray(accuracy_test)
    #line1,=plt.plot(k, Sum_of_squared_distances, 'bx-',marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    line3, = plt.plot(k,
                      accuracy_train,
                      color='r',
                      marker='o',
                      label='train_accuracy')
    line4, = plt.plot(k,
                      accuracy_test,
                      color='g',
                      marker='o',
                      label='test_accuracy')
    #plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.xlabel('k')
    plt.legend()
    plt.ylabel('accuracy')
    #plt.ylim(0,1)
    plt.show()
    return None
Example #18
    def test_basic(self, Xl_blobs_easy):
        X, _ = Xl_blobs_easy

        # make it super easy to cluster
        a = DKKMeans(n_clusters=3, random_state=0)
        b = SKKMeans(n_clusters=3, random_state=0)
        a.fit(X)
        b.fit(X)
        assert_estimator_equal(
            a,
            b,
            exclude=["n_iter_", "inertia_", "cluster_centers_", "labels_"])
        assert abs(a.inertia_ - b.inertia_) < 0.01
        # order is arbitrary, so align first
        a_order = np.argsort(a.cluster_centers_, 0)[:, 0]
        b_order = np.argsort(b.cluster_centers_, 0)[:, 0]
        a_centers = a.cluster_centers_[a_order]
        b_centers = b.cluster_centers_[b_order]
        np.testing.assert_allclose(a_centers, b_centers, rtol=1e-3)
        b_labels = replace(b.labels_, [0, 1, 2],
                           a_order[b_order]).astype(b.labels_.dtype)
        assert_eq(a.labels_.compute(), b_labels)
        assert a.n_iter_
        # this is hacky
        b.cluster_centers_ = b_centers
        a.cluster_centers_ = a_centers
        assert_eq(a.transform(X), b.transform(X), rtol=1e-3)

        yhat_a = a.predict(X)
        yhat_b = b.predict(X)
        assert_eq(yhat_a.compute(), yhat_b)
Example #19
def kmean_data(tune_path=None, test_path=None, cluster=3, isPCA=True):
    '''
    :param tune_path: src of a tuning data set
    :param test_path: src of a testing data set
    :return: tuning data after clustering, in the form of [indep val,
    depen val]
    '''

    def find_min(a):
        return a.min()

    if not tune_path:
        tune_path = "./data/ant/ant-1.4.csv"
    if not test_path:
        test_path = "./data/ant/ant-1.5.csv"
    df_tune = get_data(tune_path, "tune")
    df_test = get_data(test_path, "test")
    if isPCA:
        tune_x, tune_y = pca_analysis(df_tune)
        test_x, test_y = pca_analysis(df_test)
    else:
        tune_x, tune_y = get_xy(df_tune, normalize=True)
        test_x, test_y = get_xy(df_test, normalize=True)
    # tune_x, tune_y = get_xy(df_tune, normalize=True)
    # test_x, test_y = get_xy(df_test, normalize=True)
    kmean = KMeans(n_clusters=cluster).fit(
        test_x)  ## use testing data to do clustering
    avg_distance = kmean.inertia_ / float(len(test_x))
    tune_distance = kmean.transform(tune_x)
    min_distance = np.apply_along_axis(find_min, 1, tune_distance)
    pick_index = min_distance < avg_distance * 2  # keep tuning rows whose
    # distance to the nearest cluster center is below twice the average distance
    normal_tune_x, normal_tune_y = get_xy(df_tune, normalize=False)
    _tune_x, _tune_y = normal_tune_x[pick_index], normal_tune_y[pick_index]
    return [_tune_x, _tune_y]
def recommend(X, x, user, threshold=5):
    model = KMeans(n_clusters=2, random_state=0)
    print("X   ", X)
    model.fit(X[:, 1:])
    frame = pd.DataFrame(X)
    frame['cluster'] = model.predict(X[:, 1:])
    k = model.predict(user.reshape(1, -1)[:, 1:])
    print("K", k)
    frame.columns = [
        'User_id', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
        'cluster'
    ]
    print(frame)
    data = frame[frame["cluster"] == k[0]].iloc[:, 1:11]
    print(" ", data)
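    # model.transform(data) gives each user's distance to every center; the
    # rows with the largest distance to center k are selected below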
    distances = model.transform(data)
    required_distances = distances[:, k].reshape(distances.shape[0])
    idx = (-required_distances).argsort()[:threshold]
    print(required_distances)
    # return user_name with the help of idx
    print(len(data.iloc[idx, :].index))
    print("X", x)
    list_recommended_users = []
    list_index = []
    for i in range(len(data.iloc[idx, :].index)):
        index = data.iloc[idx, :].index[i]
        list_index.append(index)
        print("series", frame.iloc[index, 0])
        list_recommended_users.append(int(x[index, 0]))
    print(list_recommended_users)
    return list_recommended_users, list_index


# recommend(X,y,model_kmeans,0,2)
def init_cluster(word_vectors):
    print(word_vectors.vectors)
    model = KMeans(n_clusters=3, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)
    labels = model.labels_
    silhouette_score = metrics.silhouette_score(word_vectors.vectors, labels, metric='euclidean')
    print("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
    print(model.score(word_vectors.vectors))
    print("Silhouette_score: ")
    print(silhouette_score)
    print(word_vectors.similar_by_vector(model.cluster_centers_[0], topn=50, restrict_vocab=None))
    print(word_vectors.similar_by_vector(model.cluster_centers_[1], topn=50, restrict_vocab=None))
    print(word_vectors.similar_by_vector(model.cluster_centers_[2], topn=50, restrict_vocab=None))
    y_kmeans = model.predict(word_vectors.vectors)
    plt.scatter(word_vectors.vectors[:, 0], word_vectors.vectors[:, 1], c=y_kmeans, s=50, cmap='viridis')
    centers = model.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()
    words = pd.DataFrame(word_vectors.vocab.keys())
    words.columns = ['words']
    words = words[words['words'].str.len() > 3].reset_index(drop=True)
    words['vectors'] = words['words'].apply(lambda x: word_vectors.wv[f'{x}'])
    words['cluster'] = words['vectors'].apply(lambda x: model.predict([np.array(x)]))
    words['cluster'] = words['cluster'].apply(lambda x: x[0])
    words['cluster_value'] = [1 if i == 0 else -1 if i == 1 else 0 for i in words['cluster']]
    words['closeness_score'] = words.apply(lambda x: 1 / (model.transform([x['vectors']]).min()), axis=1)
    words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']
    words.to_csv('metrics_results\\predictive_scores_{}.csv'.format(time_stamp), index=False)
    return words
Example #22
def get_features(nb_peaks, X, Y):
    data = np.empty((1, 10))
    kmeans = KMeans(n_clusters=nb_peaks, random_state=1).fit(X)
    sample_silhouette_values = silhouette_samples(X, kmeans.labels_)
    a = kmeans.transform(X)
    a = np.take_along_axis(a, kmeans.labels_.reshape(-1, 1), axis=1)
    indexes = np.sort(np.unique(kmeans.labels_, return_index=True)[1])
    for unique, center in zip(kmeans.labels_[indexes],
                              np.sort(kmeans.cluster_centers_,
                                      axis=0).ravel()):
        group = a[kmeans.labels_ == unique]
        group_y = Y[kmeans.labels_ == unique]
        group_x = X[kmeans.labels_ == unique]
        sil = sample_silhouette_values[kmeans.labels_ == unique].mean()
        sample = np.array([
            group.std(), group.shape[0], center,
            group_x.mean(), sil,
            group.sum(),
            group_x.std(),
            group.mean(),
            group_y.mean(),
            group_y.std()
        ]).reshape(1, 10)
        data = np.concatenate((data, sample), axis=0)
    data = data[1:, :]
    centers = np.sort(kmeans.cluster_centers_.ravel()).reshape(1, -1)
    return data, centers
Example #23
    def sample_tag_with_kmeans(self, batch, topk_tag=1000, n_cluster=3):
        src_inputs = batch.src[0]
        src_lengths = batch.src[1].tolist()
        context, enc_states = self.model.encode(src_inputs, src_lengths)
        sampler_output = self.model.sample_tag(src_inputs, src_lengths)
        sampler_output.data[0] = -1e20
        tag_log_probs = sampler_output

        selected_tag_score, selected_tag_pos = tag_log_probs.data.topk(
            topk_tag, dim=-1)

        # selected_tag = tag_inputs[selected_tag_pos[-1]].unsqueeze(-1)
        tag_hidden = self.model.tag_encode(
            Variable(selected_tag_pos).unsqueeze(0)).squeeze(0)
        tag_c = self.model.seq2seq.decoder.cal_tag_atten(tag_hidden, context)
        tag_c = tag_c.data.cpu().numpy()
        clf = KMeans(n_clusters=n_cluster, init='k-means++', max_iter=300)
        clf.fit(tag_c)
        distance = clf.transform(tag_c)
        np_topk_tag_idx = selected_tag_pos.tolist()
        np_topk_tag_score = selected_tag_score.tolist()
        clusters = [[] for _ in range(n_cluster)]
        for i, (data, log_prob,
                c) in enumerate(zip(np_topk_tag_idx, np_topk_tag_score,
                                    tag_c)):

            clusters[clf.labels_[i]].append(
                (data, log_prob, i, distance[i][clf.labels_[i]]))

        for idx, cluster in enumerate(clusters):
            clusters[idx] = sorted(cluster, key=lambda x: -(x[3]))
        return clusters
Example #24
def initiateBeta(num_of_variables, num_of_clusters, initialData):
    """
    initiate beta
    """
    #####simplex lattice design points
    weights = np.loadtxt("SLD5.txt", delimiter=" ")
    center = np.array(
        [[1.0 / num_of_variables for i in range(num_of_variables)]])
    weights = np.concatenate((weights, center), axis=0)

    weights *= num_of_variables
    sqrtWeights = np.sqrt(weights)
    ys = []
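    # For each candidate weight vector, run KMeans on the weighted data and
    # record the mean squared distance of points to their nearest center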
    for weight in sqrtWeights:
        estimator = KMeans(num_of_clusters)
        data = initialData * weight
        estimator.fit_predict(data)
        minDistance = np.min(estimator.transform(data), 1)
        square = np.power(minDistance, 2)
        sum_of_squares = np.sum(square)
        meanSquareDistance = sum_of_squares / (len(square) - 1)
        ys.append(meanSquareDistance)

    npys = np.array(ys)

    beta = lstsq(weights, npys)[0]

    return beta
Example #25
 def refine_centers(self, resultlist):
     '''
     In this step each cluster is examined carefully: the algorithm finds centers
     that are close to each other and combines them into a single center.
     The recommended input is the output of find_states_kmeans_step2().
     :param resultlist: a list of dicts; each dict describes a cluster with the
             keys {"index", "value", "members", "average_inertia"}
     :return: the KMeans object containing the refined cluster centers
     '''
     threshold = 0.065
     tocluster = [i["value"] for i in resultlist]
     maxclusters = len(resultlist)
     minstates = 2
     if (maxclusters == 1):
         # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
         minstates = 1
     for n_clusters in range(minstates, maxclusters + 1):
         clusterer = KMeans(n_clusters=n_clusters, random_state=10)
         if (maxclusters == 1):
             clusterer = KMeans(n_clusters=1, random_state=10)
             clusterer.fit(tocluster)
             return clusterer
         clusterer.fit(tocluster)
         loss = clusterer.transform(tocluster)
         well_separated = True
         for i in loss:
             i.sort()
             if (i[0] > threshold * i[1]):
                 # the n_clusters should be updated, try more clusters
                 well_separated = False
                 break
         if well_separated:
             return clusterer
     warnings.warn('zhai: all cluster centers are close to each other')
     return clusterer
Example #26
def runKmeans(X, cluster_range):
    centroids = []
    cld = []
    clparms = []
    basedist = 0
    clustdist = []
    interias = []
    distortions = []
    silhout = []
    labelss = []
    #
    # Run clustering for K=1 to K=9, save results
    #
    #k = range(2,9)
    for i in cluster_range:
        # kmeans is an instance of the KMeans class; below, KMeans(...) is the constructor call.
        kmeans = KMeans(n_clusters=i, n_init=30)
        #Compute k-means clustering

        kmeans.fit(X)
        label = kmeans.labels_
        labelss.append(label)
        #distortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_,'euclidean'),axis=1)) / X.shape[0])

        #      distortions.append(np.average(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)))
        pdist = []
        # Transform X to a cluster-distance space.
        alldist = kmeans.transform(X)
        clustdist.append(alldist)
        #Coordinates of cluster centers
        centroid = kmeans.cluster_centers_
        centroids.append(centroid)
        #Predict the closest cluster each sample in X belongs to.
        labels = kmeans.predict(X)
        #label = kmeans.labels_
        #silhout.append(silhouette_score(X,label,metric='euclidean'))
        # distortions.append(np.sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'),axis=1)) / X.shape[0])
        # Sum of squared distances of samples to their closest cluster center.
        interia = kmeans.inertia_
        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters

        # This condition to avoid error when # of clusters is 1
        # It assumes that the silhouette value when k=1 is 1
        if i == 1:
            silhouette_avg = 1
        else:
            silhouette_avg = silhouette_score(X, labels, metric='euclidean')
        silhout.append(silhouette_avg)
        #cross = pd.crosstab(X,labels)

        interias.append(interia)
        if basedist == 0:
            basedist = interia
        cld.append(labels)
        clparms.append(interia / basedist)

    return (cld, clustdist, clparms, interias, distortions, silhout, centroids,
            labelss)
Example #27
    def KnnClassify(self,candi):
        words = self.extracAllword(candi)
        word_dict = {w:idx for idx, w in enumerate(words)}
        x = [[0 for _ in xrange(len(words))] for _ in xrange(len(candi))]
        if len(x) < 3:
            return candi
        for id, s in enumerate(candi):
            tmp = self.text_to_vector(s)
            for k,v in tmp.items():
                x[id][word_dict[k]] = float(v)

        km = KMeans(n_clusters=3)
        km.fit(x)
        samples = {}
        X_new = km.transform(x)
        # try:
        #     X_new = km.transform(x)
        # except:
        #     print 'mooo'
        for idx, l in enumerate(km.labels_):
            try:
                samples[l][idx] = X_new[idx][l]
            except:
                samples[l] ={}
                samples[l][idx] = X_new[idx][l]
        ret = []
        for k, v in samples.items():
            sortedv = sorted(v.items(), key=operator.itemgetter(1), reverse=True)
            for it in sortedv:
                ret.append(candi[it[0]])
        return ret
Example #28
    def __init__(self, X, n_clusters, n_init=3, method="kmcuda"):
        if method == "kmcuda":
            self.inertia = np.inf
            for _ in range(n_init):
                centers, y_pred = kmeans_cuda(X.astype(np.float32), n_clusters)
                full_idx = np.arange(len(X))
                centroids_idxs = []
                inertia = 0
                for i in range(n_clusters):
                    idx = full_idx[y_pred == i]
                    if len(idx) != 0:
                        X_sub = X[idx]
                        norm = la.norm(X_sub - centers[i], axis=1)
                        min_idx = norm.argmin()
                        centroids_idxs.append(idx[min_idx])
                        inertia += np.sum(norm)
                    else:
                        centroids_idxs.append(0)
                centroids_idxs = np.array(centroids_idxs)

                if inertia < self.inertia:
                    self.centers = centers
                    self.y_pred = y_pred
                    self.centroids_idxs = centroids_idxs
        elif method == "sklearn":
            km = KMeans(n_clusters, n_init=n_init)
            self.y_pred = km.fit_predict(X)
            self.centers = km.cluster_centers_
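            # transform() returns sample-to-center distances; argmin over axis 0
            # gives, for each cluster, the index of its nearest sample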
            self.centroids_idxs = km.transform(X).argmin(axis=0)
        else:
            raise NotImplementedError
Example #29
def inertia_clustering_analysis(ds, max_clusters=13):

    inertia_val = np.array([])

    #max_clusters = 13#+2 = 15
    for i in np.arange(max_clusters) + 2:
        kmeans = KMeans(init='k-means++', n_clusters=i, n_init=10)
        kmeans.fit_transform(ds.samples)
        inertia_val = np.append(inertia_val, kmeans.inertia_)

    f = plt.figure()
    a = f.add_subplot(111)
    a.plot(inertia_val)
    plt.show()

    return inertia_val
Example #30
def kmeanFinal(arr, K, rand_state):
	kmeans = KMeans(n_clusters=K, random_state=rand_state).fit(arr)
	kmeans_transform = kmeans.transform(arr)
# 	km = KMeans(n_clusters=20, random_state=1)
# 	distances = kmeans.fit_transform(arr)
# 	print(distances)
	
	
	# iVal = total counts of vector points
    # label = centroid index
	centroids = kmeans.cluster_centers_
	labels = kmeans.labels_
	inertia = kmeans.inertia_
	
	iVal = 0
	varianceVal = 0
	retVal = [0] * K
	retCount = [0] * K
	for label in kmeans.labels_:
		print(label)
		varianceVal = varianceVal + kmeans_transform[iVal][label]*kmeans_transform[iVal][label]
		retVal[label] += kmeans_transform[iVal][label]*kmeans_transform[iVal][label] 
		retCount[label] += 1
		iVal = iVal + 1
		
	return centroids , labels , inertia, kmeans_transform, iVal, varianceVal , retVal , retCount
Example #31
    def calculate_silhouette_score(self,
                                   best_score: int = -1,
                                   k_range: Tuple[int, int] = (2, 20)) -> int:
        """
        Calculate the best number of clusters via Silhouette score method
        @param best_score: best score to start from
        @param k_range: possible range of cluster numbers
        @return: best quantity of clusters
        """
        if k_range[0] < 2:
            _temp_k_range = list(k_range)
            _temp_k_range[0] = 2
            k_range = tuple(_temp_k_range)
        for k in range(*k_range):
            model = KMeans(
                n_clusters=k,
                init="k-means++",
                max_iter=500,
                n_init=100,
                n_jobs=-1,
                algorithm="full",
            )
            model.fit(self.matrix)
            labels = model.predict(self.matrix)
            score = silhouette_score(model.transform(self.matrix), labels)

            if score > best_score:
                self.best_k = k
                best_score = score
            print(
                f"Current cluster: {k}, silhouette score: {score} (current best K: {self.best_k})"
            )
        print(f"The best K number is: {self.best_k}")
        return self.best_k
Example #32
def get_dist_graph(all_points, num_anchor=300):
    """
    get the cluster center as anchor by K-means++
    and calculate distance graph (n data points vs m anchors),
    :param all_points: n data points
    :param num_anchor:  m anchors, default = 300
    :return: distance graph n X m
    """
    # kmeans = KMeans (n_clusters=num_anchor, random_state=0, n_jobs=16, max_iter=50).fit_transform(all_points)
    # print ('dist graph done!')
    # return np.asarray(kmeans)
    ## sample

    num_data = np.size(all_points, 0)
    sample_rate = 3000
    # sample_rate = num_data
    ind = random.sample(range(num_data), sample_rate)
    sample_points = all_points[ind, :]
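    # Fit KMeans on the random subsample only, then compute every point's
    # distance to the resulting anchors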
    kmeans = KMeans(n_clusters=num_anchor,
                    random_state=0,
                    n_jobs=16,
                    max_iter=50).fit(sample_points)
    km = kmeans.transform(all_points)
    print('dist graph done!')
    return np.asarray(km)
Example #33
def post_cluster(url, id, tfidf_vec):
    from sklearn.cluster import KMeans
    kmean = KMeans(n_clusters=300)
    print("kmeans")
    kmean.fit(tfidf_vec)
    pred = kmean.transform(tfidf_vec)

    count1 = 0
    count2 = 0
    pred_str = []

    for item in pred:
        count1 += 1
        vec = ""
        for tmp in item:
            vec += str(tmp)[0:7] + "\t"
        pred_str.append(vec)

    print(len(pred_str))
    print(len(id))

    pred = kmean.predict(tfidf_vec)
    fo = open(url + "/cluster.txt", "a+")
    for i in range(len(pred)):
        count2 += 1
        fo.write(id[i] + "\t" + str(pred[i]) + "\n")
    fo.close()
    print("%d+%d" % (count1, count2))
Example #34
    def clustering(self, k):
        word_vectors = self.__model_p__.wv
        KM_model = KMeans(n_clusters=k,
                          max_iter=1000,
                          random_state=True,
                          n_init=50).fit(X=word_vectors.vectors)

        center_closest = []
        for i in range(k):
            center_closest.append([
                el[0] for el in word_vectors.similar_by_vector(
                    KM_model.cluster_centers_[i], topn=15, restrict_vocab=None)
            ])

        metric_str = 'euclidean'
        score = silhouette_score(word_vectors.vectors,
                                 KM_model.predict(word_vectors.vectors),
                                 metric=metric_str)
        print("silhouette_score:", score)

        SVmodel = SilhouetteVisualizer(KM_model, is_fitted=True)
        SVmodel.fit(word_vectors.vectors)
        SVmodel.show()
        words = pd.DataFrame(word_vectors.vocab.keys(), columns=['words'])
        words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
        words['cluster'] = words.vectors.apply(
            lambda x: KM_model.predict([np.array(x)]))
        words.cluster = words.cluster.apply(lambda x: x[0])
        words['closeness_score'] = words.apply(
            lambda x: 1 / (KM_model.transform([x.vectors]).min()), axis=1)

        return KM_model, center_closest, score, words
Example #35
def do_kmeans(X=None, n_clusters=None, articles_df=None, features=None):
    kmeans = KMeans(n_clusters=n_clusters, verbose=True)
    kmeans.fit(X)

    assigned_cluster = kmeans.transform(X).argmin(axis=1)
    print assigned_cluster
    print 'kmeans_class dist:', Counter(assigned_cluster)

    articles_df['kmeans.text_' + str(n_clusters).zfill(3)] = assigned_cluster
    top_centroids = kmeans.cluster_centers_.argsort()[:, -1:-20:-1]
    print 'top centroids:\n', top_centroids

    cl = []
    for num, centroid in enumerate(top_centroids):
        cl.append([num, [", ".join(features[i] for i in centroid)]])

    l = pd.DataFrame(cl)
    l.columns = ['kmeans.text_' + str(n_clusters).zfill(3), 'features']
    print l
    articles_df = pd.merge(l, articles_df)
    print 'n,inertia', n_clusters, kmeans.inertia_
    writecols = ['symbol', 'gics8', 'kmeans.text_' + str(n_clusters).zfill(3)]
    articles_df[writecols].to_csv('../data/kmeans_text.' +
                                  str(n_clusters).zfill(3) + '.csv',
                                  index=False)
    for i in range(kmeans.n_clusters):
        cluster = np.arange(0, X.shape[0])[assigned_cluster == i]
        #print cluster
        ss = articles_df.loc[articles_df['kmeans.text_' +
                                         str(n_clusters).zfill(3)] == i]
        ss.sort('mc', ascending=False, inplace=True)
        print "cluster {}:".format(i)
        print ss[['name', 'mc', 'gics8']][0:10]
        print ss.iloc[0][['features']].values.tolist()
    print articles_df.info(verbose=True, null_counts=True)
Example #36
def inertia_clustering_analysis(ds, max_clusters=13):

    inertia_val = np.array([])

    #max_clusters = 13#+2 = 15
    for i in np.arange(max_clusters)+2:
        kmeans = KMeans(init='k-means++', n_clusters=i, n_init=10)
        kmeans.fit_transform(ds.samples)
        inertia_val = np.append(inertia_val, kmeans.inertia_)

    f = plt.figure()
    a = f.add_subplot(111)
    a.plot(inertia_val)
    plt.show()

    return inertia_val
Example #37
def k_means(fname, dim=3, cluster_num=5, show_img=False):
    '''
    Function to cluster the data into cluster_num groups and visualize them in a 3D space
    :param fname: the path of the pca_table file
    :param dim: the number of dimensions we want to use
    :param cluster_num: the number of clusters we want to cluster them
    :param show_img: show k-means image or not
    :return: (names, labels, distance) for the clustered samples
    '''
    assert isinstance(fname, str)
    data, names = pca_processing(fname, dim)
    X = np.array(data)
    k_means = KMeans(n_clusters=cluster_num).fit(X)
    labels = k_means.labels_
    fig = plt.figure(1, figsize=(4, 3))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    ax.scatter(X[:, 1],
               X[:, 0],
               X[:, 2],
               c=labels.astype(np.float),
               edgecolor='k')
    if show_img is True:
        fig.show()
    distance = k_means.transform(X)
    return names, labels, distance
Example #38
def kmeans_betacv(data, num_cluster, batch_kmeans=False, n_runs = 10,
                  confidence = 0.90):
    '''
    Computes the BetaCV for running Kmeans on the dataset. This method
    returns the BetaCV value and half of the size of the confidence interval
    for that value (the BetaCV is an average over the number of runs given).
    
    Arguments
    ---------
    data: matrix
        A matrix of observations. If this is sparse, `batch_kmeans` must 
        be True
    num_cluster: int 
        number of clusters to run k-means for
    batch_kmeans: bool (defaults to False)
        if `sklearn.cluster.MiniBatchKMeans` should be used. This is faster
        and suitable for sparse datasets, but less accurate.
    n_runs: int (default = 10)
        Number of runs to compute the BetaCV
    confidence: double [0, 1) (default = 0.9)
        The confidence used to compute half the confidence interval size
    
    Returns
    -------
    The betacv and half of the confidence interval size
    '''
    algorithm = None
    if not batch_kmeans:
        algorithm = KMeans(num_cluster)
    else:
        algorithm = MiniBatchKMeans(num_cluster)
    
    inter_array = np.zeros(n_runs)
    intra_array = np.zeros(n_runs)
    for i in xrange(n_runs):
        #Run K-Means
        algorithm.fit(data)
        
        centers = algorithm.cluster_centers_
        labels = algorithm.labels_
        
        #KMeans in sklearn uses euclidean
        dist_centers = pairwise.euclidean_distances(centers)
        
        #Inter distance
        mean_dist_between_centers = np.mean(dist_centers)
        inter_array[i] = mean_dist_between_centers

        #Intra distance
        dist_all_centers = algorithm.transform(data)
        intra_dists = []
        for doc_id, cluster in enumerate(labels):
            dist = dist_all_centers[doc_id, cluster]
            intra_dists.append(dist)
        intra_array[i] = np.mean(intra_dists)
    
    betacv = intra_array / inter_array
    cinterval = half_confidence_interval_size(betacv, confidence)
    return np.mean(betacv), cinterval
def cluster_encode(X_train, X_test, codebook='kmeans', k=25):
    if codebook == 'kmeans':
        cb = KMeans(k, n_init=1, init='random')
    elif codebook == 'gmm':
        cb = GMM(n_components=k)
    X = np.vstack((X_train, X_test))
    X = StandardScaler().fit_transform(X)
    print('_' * 80)
    print('fitting codebook')
    print
    print cb
    print
    cb.fit(X)
    print 'fin.'
    X_train = cb.transform(X_train)
    X_test = cb.transform(X_test)
    return X_train, X_test
Example #40
def compute_clusters(topics, match):
    recipe_topics = topics['W'][match, :]
    cluster = KMeans(n_clusters=4)
    # cluster = AffinityPropagation()
    cluster.fit(recipe_topics)
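    # Cluster-distance space: one column of distances per cluster center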
    distances = cluster.transform(recipe_topics)

    return cluster, distances
Example #41
    def _cluster(self, index):
        data = self.data[index]
        kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
        labels = kmeans.labels_
        l_i = np.where(labels == 0)[0]
        r_i = np.where(labels == 1)[0]
        left_index = index[l_i]
        right_index = index[r_i]
        if len(right_index) - len(left_index) > 1:
            distances = kmeans.transform(data[r_i])
            left_index, right_index = self._rebalance(
                left_index, right_index, distances[:, 1])
        elif len(left_index) - len(right_index) > 1:
            distances = kmeans.transform(data[l_i])
            left_index, right_index = self._rebalance(
                right_index, left_index, distances[:, 0])

        return left_index, right_index
def test_transform():
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)

    for c in range(n_clusters):
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                assert_greater(X_new[c, c2], 0)
Example #43
def test_transform():
    k_means = KMeans(k=n_clusters)
    k_means.fit(X)
    X_new = k_means.transform(k_means.cluster_centers_)

    for c in range(n_clusters):
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                assert_true(X_new[c, c2] > 0)
Example #44
def run_k_means(df, numberclusters, geoidlabel ='geoid10', plot_silouette = True):
	'''Uses sklearn to run kmeans. 
	
	ARGUMENTS:
	1) df: A dataframe with a geoid column
	2) geoidlabel: the label of the geoid column. 
	3) plot_silouette: whether or not to plot the silhouettes of each cluster
	
	OUTPUT: Returns a three part tuple:
	1) the kmeans sklearn model 
	2) a dictionary with geoids as the key, and the cluster as the value
	3) a dictionary with clusters as the key, and a list of related geoids as the value'''

	#Use K means to cluster the dataset. 
	x = df[['wkday_0','wkday_1','hrbin_morning',
	        'hrbin_afternoon','hrbin_evening',
	         'hrbin_latenight','hrbin_dawn']].values
	kmeans = KMeans(n_clusters = numberclusters)
	kmeans.fit(X = x )
	features = df.columns.tolist()[1:]
	geoids = df[geoidlabel]

	#store values in a dictionary
	geoid_dict = defaultdict(int)
	cluster_dict = defaultdict(list)

	#Transforms x into a cluster-distance space. 
	#In this array, each column is a cluster with the value of the distance from 
	#a given neighborhood block (geoid) in each row. 
	#This function returns the cluster belonging to each neighborhood block:
		#the cluster with the smallest distance value 
	assigned_cluster = kmeans.transform(x).argmin(axis=1)

	for i in range(kmeans.n_clusters):
	    cluster = np.arange(0, x.shape[0])[assigned_cluster==i]
	    #look up the geoid of every row assigned to this cluster
	    cluster_geoids = [df.ix[geoindx][geoidlabel] for geoindx in cluster]
	    print len(cluster_geoids), 'cluster #', i
	    #make a dictionary with cluster as the key, and geoids as the list
	    cluster_dict[i] = cluster_geoids
	    #second dictionary to quickly look up what cluster each geoid belongs to
	    for geo in cluster_geoids:
	        geoid_dict[geo] = i
	if plot_silouette == True:
	    plot_cluster_silouette_values(x, assigned_cluster, numberclusters)

	#save the dictionaries as CSVs
	save_dictionary_as_csv(cluster_dict, 'data/intermediate_data/kmeans/kmeans_clusterdict.csv')
	save_dictionary_as_csv(geoid_dict, 'data/intermediate_data/kmeans/kmeans_geoiddict.csv')

	return kmeans, geoid_dict, cluster_dict
def cluster_documents(n_clusters, doc_term_matrix):
    kmeans = KMeans(n_clusters=n_clusters)

    kmeans = kmeans.fit(doc_term_matrix)

    distances = kmeans.transform(doc_term_matrix)

    results = distances.argmin(axis=1)

    clusters = defaultdict(list)

    for document_index, cluster in enumerate(results):
        clusters[cluster].append((document_index, distances[document_index, cluster]))
        
    return clusters
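# Editor's note (not part of the original example): a hedged usage sketch for
# cluster_documents above. The tiny corpus and the default TfidfVectorizer
# settings are illustrative assumptions; the imports below cover what the
# function itself needs (defaultdict, KMeans) plus the vectorizer.
from collections import defaultdict

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "cats and dogs make good pets",
        "stocks fell sharply on monday",
        "markets rallied after the report"]
doc_term_matrix = TfidfVectorizer().fit_transform(docs)
by_cluster = cluster_documents(n_clusters=2, doc_term_matrix=doc_term_matrix)
for cluster_id, members in by_cluster.items():
    # each member is a (document_index, distance to this cluster's centroid) pair
    print(cluster_id, sorted(members, key=lambda m: m[1]))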
def get_kmeans_features(tr_all,ts_all,n_clusters=3,normz=None,axis=0):
    tr_ids,tr_x_orig,tr_y = tr_all
    ts_ids,ts_x_orig = ts_all

    tr_x = np.copy(tr_x_orig)
    ts_x = np.copy(ts_x_orig)

    kmeans = KMeans(n_clusters)
    kmeans.fit(np.append(tr_x,ts_x,axis=0))

    tf_tr_x = kmeans.transform(tr_x)
    tf_ts_x = kmeans.transform(ts_x)

    tf_tr_x,tf_ts_x = normalize_data(tf_tr_x,tf_ts_x,normz,axis)

    return (tr_ids,tf_tr_x,tr_y),(ts_ids,tf_ts_x)
  def fit(self, X, Y=None):
    if self.method == 'random':
      N = len(X)
      idx = np.random.randint(N, size=self.M)
      self.samples = X[idx]
    elif self.method == 'normal':
      # just sample from N(0,1)
      D = X.shape[1]
      self.samples = np.random.randn(self.M, D) / np.sqrt(D)
    elif self.method == 'kmeans':
      X, Y = self._subsample_data(X, Y)

      print("Fitting kmeans...")
      t0 = datetime.now()
      kmeans = KMeans(n_clusters=len(set(Y)))
      kmeans.fit(X)
      print("Finished fitting kmeans, duration:", datetime.now() - t0)

      # calculate the most ambiguous points
      # we will do this by finding the distance between each point
      # and all cluster centers
      # and return which points have the smallest variance
      dists = kmeans.transform(X) # returns an N x K matrix
      variances = dists.var(axis=1)
      idx = np.argsort(variances) # smallest to largest
      idx = idx[:self.M]
      self.samples = X[idx]
    elif self.method == 'gmm':
      X, Y = self._subsample_data(X, Y)

      print("Fitting GMM")
      t0 = datetime.now()
      gmm = GaussianMixture(
        n_components=len(set(Y)),
        covariance_type='spherical',
        reg_covar=1e-6)
      gmm.fit(X)
      print("Finished fitting GMM, duration:", datetime.now() - t0)

      # calculate the most ambiguous points
      probs = gmm.predict_proba(X)
      ent = stats.entropy(probs.T) # N-length vector of entropies
      idx = np.argsort(-ent) # negate since we want biggest first
      idx = idx[:self.M]
      self.samples = X[idx]
    return self
Beispiel #48
0
 def clusterize_kmeans(self, *args, **kwargs):
     """
     Cluster hosts using KMeans algorithm
     n_clusters : the number of clusters to form
     Update self._clusters attribute
     """
     # Launch KMeans algorithm with all available CPUs (n_jobs=-1).
     kwargs.setdefault('n_jobs', -1)
     classifier = KMeans(*args, **kwargs)
     classifier.fit(self._matrix)
     theorical_centers = classifier.cluster_centers_
     center_hosts_idx = []
     dist = classifier.transform(self._matrix)
     for i_center in xrange(len(theorical_centers)):
         center_hosts_idx.append(np.argsort(dist[:, i_center])[0])
     cluster_labels = classifier.predict(self._matrix)
     self._clusters = KmeansClusters(kwargs['n_clusters'], cluster_labels,
                                     theorical_centers, center_hosts_idx)
Beispiel #49
0
def run_clustering(tags):
    # vector of tags obtained from other models
    # tags = ['muslim','holy']
    print "Reading word2vec model"
    model = utils_word2vec.read_word2vec()
    word_vectors = model.syn0
    num_clusters = 2*len(tags) - 1

    print  model.most_similar('iraq')

    # Initialize a k-means object and use it to extract centroids
    print "Running K means"
    kmeans_clustering = KMeans( n_clusters = num_clusters)
    kmeanFit = kmeans_clustering.fit( word_vectors )
    idx = kmeanFit.labels_
    centers = kmeanFit.cluster_centers_

    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number
    word_centroid_map = dict(zip( model.index2word, idx ))


    clusterDist = kmeans_clustering.transform( word_vectors )
    print clusterDist.shape
    cluster_tags = []
    for i in range(0,num_clusters - 1):
        cluster_tags.append(model.index2word[np.argmax(clusterDist[:,i])])
        mymax = np.argmax(clusterDist[:,i])
#        print np.argmax(clusterDist[:,i])
#        print "word" + model.index2word[mymax]

    # cluster centers are numpy arrays (unhashable), so key the map by tuples
    centroid_word_map = dict(zip(map(tuple, centers), cluster_tags))

    # Return relevant cluster tags
    top_clusters = create_bag_of_centroids( tags, word_centroid_map, centroid_word_map ).bag_of_centers

    print "Bag of centroids tags: \n"
    print top_clusters

    return top_clusters
            o2o[ ind_dic[pair[1]], ind_dic[pair[0]], idx] = o2o[ind_dic[pair[0]], ind_dic[pair[1]], idx]

    # local prediction and blacklist generation part - this dictionary holds each contributor's local blacklist
   
    print 'Computing local predictions...'
    l_blacklists = dict()
    l_blacklists = ts.local_prediction(top_targets, train_set, i)      
    
    # clustering part 
    for n_clusters in clusters_values:
        
        print 'Kvalue: ', n_clusters
        estimator = KMeans(n_clusters=n_clusters)
        X_train = o2o.sum(axis=2)
        labels = estimator.fit( X_train ).labels_ 
        distances = estimator.transform(X_train)
        
        # initial clusters
        clusters = [ np.where(labels == f)[0] for f in range(n_clusters)]
                
        # keep in the cluster only those contributors that satisfy the distance threshold
        c = []
        for id1, cluster in enumerate(clusters):
            b = [distances[id][id1] for id in cluster]
            threshold = np.percentile(b, 40)
            c.append([idx for idx in cluster if distances[idx][id1] <= threshold])   
        
        clusters = [ top_targets[idy] for idy in c]
        
        conts_in_clusters = set()
        for m in clusters:
    def cluster(self, page_dict_list):
        
        pd = PagesDataset(page_dict_list)
        self.pd = pd

        X_scaled = pd.features_lists
        
        print X_scaled
        
#         X_reduced = PCA(n_components=2).fit_transform(X_scaled)
#         print X_reduced
        
        kmeans = KMeans(k=self._K, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1)
        kmeans.fit(X=X_scaled, y=None)
        
#         print( '% 9s   %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
#           % ("Example", kmeans.inertia_,
#              metrics.homogeneity_score(labels, kmeans.labels_),
#              metrics.completeness_score(labels, kmeans.labels_),
#              metrics.v_measure_score(labels, kmeans.labels_),
#              metrics.adjusted_rand_score(labels, kmeans.labels_),
#              metrics.adjusted_mutual_info_score(labels,  kmeans.labels_),
#              metrics.silhouette_score(X_scaled, kmeans.labels_,
#                                       metric='euclidean',
#                                       sample_size=3)))
        
        Y = kmeans.predict(X_scaled)
        X_new = kmeans.transform(X_scaled, y=None)
        print Y
        print X_new
        print kmeans.cluster_centers_
        
        pages_dict_list_clusters = [[],[]]
        
        for i,(page_dict,cluster_no,distance_pair) in enumerate((zip(page_dict_list,Y,X_new))):
            page_dict['ecom_kmeans_dist'] = distance_pair[cluster_no]
            pages_dict_list_clusters[cluster_no].append(page_dict)
            
        for i,pages_dict_list_cluster in enumerate(pages_dict_list_clusters):
            pages_dict_list_clusters[i] = sorted(pages_dict_list_cluster,key=lambda dict: dict['ecom_kmeans_dist'])
        
        product_pages_indices_list = []
        category_pages_indices_list = []
        
        cluster0_label = self.__label_cluster0(kmeans.cluster_centers_)
        if cluster0_label == "cat":
            cat_no = 0
            prod_no = 1
            retval = True
        elif cluster0_label == "prod":
            cat_no = 1
            prod_no = 0
            retval = True
        else:
            retval = False
        
        if retval == True:
            self.product_pages_dict_list = pages_dict_list_clusters[prod_no]
            self.category_pages_dict_list = pages_dict_list_clusters[cat_no]
            
            for i,cluster_no in enumerate(Y):
                if cluster_no == prod_no:
                    product_pages_indices_list.append(i)
                elif cluster_no == cat_no:
                    category_pages_indices_list.append(i)
            
            for dic in self.product_pages_dict_list:
                dic['category'] = 'ecom_product'
            for dic in self.category_pages_dict_list:
                dic['category'] = 'ecom_category'
            
            self.product_cluster_center = kmeans.cluster_centers_[prod_no]
            self.category_cluster_center = kmeans.cluster_centers_[cat_no]
            self.product_cluster_50pc_dist, self.product_cluster_80pc_dist = self.__get_50pc_80pc_distance(self.product_pages_dict_list)
            self.category_cluster_50pc_dist, self.category_cluster_80pc_dist = self.__get_50pc_80pc_distance(self.category_pages_dict_list)
            self.product_pages_indices_list = product_pages_indices_list
            self.category_pages_indices_list = category_pages_indices_list
        
        return retval
Beispiel #52
0
def run_stack(SEED):

	model = ""

	print "Running GB, RF, ET stack."

	trainBase = csv_io.read_data("../train.csv", skipFirstLine = True, split = ",")
	test = csv_io.read_data("../test.csv", skipFirstLine = True, split = ",")


	
	avg = 0
	NumFolds = 5 # 5 is good, but 10 yields a better mean since outliers are less significant. (Note: predictions are less reliable when using 10.)


	predicted_list = []
	bootstrapLists = []

	# use this for quick runs.
	# note RF with 150 crashes on 30 features
	# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	# GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
	# RandomForestRegressor(n_estimators=100, n_jobs=1),
	#RandomForestRegressor(n_estimators=75, n_jobs=1),
	# clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1),
		# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False)
		# ]	
	#knn 5 at 3.45
	#knn 15 at 3.31
	#knn 25 at 3.30
	#knn 40 at 3.31
	# KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# LinearRegression at 3.77
	# Ridge at 3.77
	# SGD 4.23
	#Gauss at 13
	# LinearRegression(fit_intercept=True, normalize=False, copy_X=True),
	# Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001),
	# SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False),
	# GaussianNB()
	# clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
		 # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)	
		# ]
		
	# GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ******************
	# clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166)
			# ]	
			
	# about 1 hour run time, and 3.10 score.		
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.05
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166)
	# about 4 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166)	
	#SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
	
	# http://stackoverflow.com/questions/15150339/python-memory-error-sklearn-huge-input-data
	#For high dimensional sparse data and many samples, LinearSVC, LogisticRegression, 
	# PassiveAggressiveClassifier or SGDClassifier can be much faster to train for comparable predictive accuracy.
	
	# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)
	# LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None)
	# PassiveAggressiveClassifier(C=1.0, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, loss='hinge', n_jobs=1, random_state=None, warm_start=False)
	# SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
	
	clfs = [RandomForestClassifier(n_estimators=500, n_jobs=1, criterion='gini')
	] 	

	# best SVC(C=1000000.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1),
	# best LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1000.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None),

	
	#SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,gamma=0.0, kernel='rbf', max_iter=-1, probability=False, shrinking=True,tol=0.001, verbose=False)
		# use this for quick runs.
	# clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5)]	
	
	
	
	# use this for quick runs.  reduced estimators to 50
	# clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
        # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True,
        # tol=0.001, verbose=False)
			# ]	
			
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	#ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1)
	
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7)]
			
			
	# full algorithm stack.
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]
	

	
	print "Data size: ", len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	dataset_blend_test = np.zeros((len(test), len(clfs)))
	

	trainNew = []
	trainTestNew = []
	testNew = []
	trainNewSelect = []
	trainTestNewSelect = []
	testNewSelect = []
	
	print "Scaling"
	#targetPre = [x[0] for x in trainBase]
	#trainPre = [x[1:] for x in trainBase]
	#trainPreTemp = [x[1:] for x in trainBase]
	#testPre = [x[1:] for x in test]

	targetPre = [int(x[0]) for x in trainBase]
	trainPre = [[int(i) for i in x[1:]] for x in trainBase]
	trainPreTemp = [[int(i) for i in x[1:]] for x in trainBase]
	testPre = [[int(i) for i in x[1:]] for x in test]
	
	print "unique: ", len(list(set([x[1] for x in trainBase])))
	
	#enc = OneHotEncoder()
	#print len(trainPreTemp)
	#trainPreTemp.extend(testPre)
	#print len(trainPreTemp)
	#enc.fit(trainPreTemp)
	#print enc.n_values_
	#print enc.feature_indices_
	
	#out = enc.transform(trainPre)
	#trainPre = out#.toarray()
	#print out.shape # len(out), len(out[0])
	
	#out = enc.transform(testPre)
	#testPre = out#.toarray()
	#print out.shape
	
	km = KMeans(n_clusters=10, init='k-means++', n_init=100, max_iter=300, tol=0.0001, precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1).fit(trainPre)
	
	
	
	#return
	
	
	#print trainPre[0]
	#scaler = preprocessing.Scaler().fit(trainPre)
	#trainScaled = scaler.transform(trainPre)
	#testScaled = scaler.transform(testPre)	

	#print scaler.mean_
	#print scaler.std_
	print "Begin Training"
	
	
	for ExecutionIndex, clf in enumerate(clfs):
		print clf
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((len(test), NumFolds))

		
		foldCount = 0

		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		#Folds = cross_validation.StratifiedKFold(targetPre, n_folds=NumFolds, indices=True)
		Folds = cross_validation.KFold(len(trainBase), n_folds=NumFolds, indices=True)
		for train_index, test_index in Folds:

			#trainBaseTemp = [trainBase[i] for i in train_index]
			#target = [x[0] for x in trainBaseTemp]
			#train = [x[1:] for x in trainBaseTemp]
	
			#testBaseTemp = [trainBase[i] for i in test_index]
			#targetTest = [x[0] for x in testBaseTemp]
			#trainTest = [x[1:] for x in testBaseTemp]
		
			#test = [x[0:] for x in test]
	
			target = [targetPre[i] for i in train_index]
			train = [trainPre[i] for i in train_index]
			
			#train = trainPre.tocsr()[train_index,:]
			
			targetTest = [targetPre[i] for i in test_index]	
			trainTest = [trainPre[i] for i in test_index]	
			
			#trainTest = trainPre.tocsr()[test_index,:]
	
	
			print
			print "Iteration: ", foldCount
			#print "LEN: ", len(train), len(target)
		
			train = km.transform(train)
			trainTest = km.transform(trainTest)
		
			clf.fit(train, target)
			print "Predict"
			prob = clf.predict_proba(trainTest) 
			print "Score"
			dataset_blend_train[test_index, ExecutionIndex] = prob[:,1]



	
			probSum = 0
			weightSum = 0
			# totalOffByHalf = 0
			# totalPositive = 0
			# totalPositiveOffByHalf = 0
			# totalPositivePredictions = 0
			
			fpr, tpr, thresholds = metrics.roc_curve(targetTest, prob[:,1], pos_label=1)
			auc = metrics.auc(fpr,tpr)
			print "Score: ", auc
			
			#for i in range(0, len(prob)):
				#print prob
				#probX = prob[i]

				#probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
				#weightSum += weights[test_index[i]][0] 
				#print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX
				
				# log loss cal
				#probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				# if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					# totalOffByHalf = totalOffByHalf + 1		
			
				# if ( int(targetTest[i]) == 1 ):
					# totalPositive = totalPositive + 1
				# if ( int(targetTest[i]) == 1 and probX < 0.5):
					# totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				# if (probX > 0.5):
					# totalPositivePredictions = totalPositivePredictions + 1			
			
			# print
			# print "Stats:"
			# print "Total Off By > 0.5 ", totalOffByHalf
			# print "Total Positive ", totalPositive
			# print "Total Positive Off By Half ", totalPositiveOffByHalf
			# print "Total Positive Predictions ", totalPositivePredictions
			#print -probSum/len(prob)
			#print "Score: ", probSum/weightSum
 
			avg += 	auc/NumFolds

			predicted_probs = clf.predict_proba(testPre) 	
			#predicted_list.append([x[1] for x in predicted_probs])	
			dataset_blend_test_set[:, foldCount] = predicted_probs[:,1] #[0]
		
				
			foldCount = foldCount + 1
		
			break
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		
		#print "Saving NP"
		#np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
		#np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
		#np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
		#print "Done Saving NP"
		
		now = datetime.datetime.now()
		#print dataset_blend_test_set.mean(1) 
		csv_io.write_delimited_file_single_plus_index("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		
		csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",")
		
		
		print "------------------------Average: ", avg

		#np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

	return dataset_blend_train, dataset_blend_test
def _find_accuracy(home, appliance, feature="Monthly", num_homes=5):
    np.random.seed(42)
    appliance_df = df.ix[all_homes[appliance]]
    if appliance=="hvac":
        start, stop=5, 11
    else:
        start, stop=1, 13

    test_homes = [home]
    train_d = appliance_df[~appliance_df.index.isin([home])]
    train_d_index = train_d[['%s_%d' %(appliance, i) for i in range(start, stop)]].dropna().index
    train_d_feature = train_d.ix[train_d_index][feature_map[feature]].dropna()

    from sklearn.cluster import KMeans
    c = KMeans(n_clusters=num_homes)
    c.fit(train_d_feature)
    to_use = []
    for i in range(num_homes):
        d = c.transform(train_d_feature)[:, i]
        ind = np.argsort(d)[::-1][:num_homes]
        flag=False
        start_index = 0
        while flag is False:

            if train_d_feature.index.values[ind[start_index]] not in to_use:
                to_use.append(train_d_feature.index.values[ind[start_index]])
                flag=True
            else:
                start_index = start_index+1

    train_homes = np.array(to_use)
    all_home_appliance = deepcopy(all_homes)
    all_home_appliance[appliance] = train_homes

    # Cross validation on inner loop to find best feature, K
    train_size = len(train_homes)
    l = LeaveOneOut(train_size)
    out = OrderedDict()
    for cv_train, cv_test in l:

        cv_train_home=appliance_df.ix[train_homes[cv_train]]
        cv_train_index = cv_train_home[['%s_%d' %(appliance, i) for i in range(start, stop)]].dropna().index
        cv_train_home = cv_train_home.ix[cv_train_index]
        cv_test_home = appliance_df.ix[train_homes[cv_test]]
        test_home_name = cv_test_home.index.values[0]
        #print cv_test_home
        out[test_home_name]={}


        # Summing up energy across start to stop to get Y to learn optimum feature on
        Y = cv_train_home[['%s_%d' %(appliance, i) for i in range(start, stop)]].sum(axis=1).values
        forest = ExtraTreesRegressor(n_estimators=250,
                              random_state=0)
        forest.fit(cv_train_home[feature_map[feature]], Y)
        importances = forest.feature_importances_
        indices = np.argsort(importances)[::-1]

        # Now varying K and top-N features

        for K in range(K_min, K_max):
            out[test_home_name][K]={}
            for top_n in range(F_min,F_max):
                out[test_home_name][K][top_n]=[]
                top_n_features = cv_train_home[feature_map[feature]].columns[indices][:top_n]

                # Now fitting KNN on this
                for month in range(start, stop):
                    clf = KNeighborsRegressor(n_neighbors=K)
                    clf.fit(cv_train_home[top_n_features], cv_train_home['%s_%d' %(appliance, month)])
                    #print clf.predict(cv_test_home[top_n_features]), month
                    out[test_home_name][K][top_n].append(clf.predict(cv_test_home[top_n_features]))

        # Now, finding the (K, top_n) combination that gave us best accuracy on CV test homes
    accur = {}

    for K in range(K_min, K_max):
        accur[K] = {}
        for top_n in range(F_min, F_max):
            temp = {}
            for h in out.iterkeys():
                pred = pd.DataFrame(out[h][K][top_n]).T
                #all_but_h = [x for x in out.keys() if x!=h]
                pred.index = [h]
                pred.columns = [['%s_%d' %(appliance, i) for i in range(start, stop)]]
                gt = appliance_df.ix[h][['%s_%d' %(appliance, i) for i in range(start, stop)]]
                error = (pred-gt).abs().div(gt).mul(100)
                mean_error = error.mean().mean()
                a = 100-mean_error
                if a<0:
                    a=0
                temp[h]=a
            ac = pd.Series(temp).mean()

            accur[K][top_n] = ac

    accur_df = pd.DataFrame(accur)
    accur_max = accur_df.max().max()
    max_ac_df = accur_df[accur_df==accur_max]
    F_best = cv_train_home[feature_map[feature]].columns[indices][:max_ac_df.mean(axis=1).dropna().index.values[0]].tolist()
    K_best = max_ac_df.mean().dropna().index.values[0]

    # Now predicting for test home
    train_overall = appliance_df.ix[appliance_df[~appliance_df.index.isin([home])].index]
    test_overall = appliance_df[appliance_df.index.isin([home])]
    pred_test = {}
    gt_test = {}
    for month in range(start, stop):
        clf = KNeighborsRegressor(n_neighbors=K_best)
        clf.fit(train_overall[F_best], train_overall['%s_%d' %(appliance, month)])
        pred_test[month] = clf.predict(test_overall[F_best])
        gt_test[month] = test_overall['%s_%d' %(appliance, month)]


    json.dump({'f':F_best, 'k':K_best,'accuracy':accur_max},open(os.path.expanduser("~/main-out-new-larger-num-homes/%d_%s_%s_%d.json" %(num_homes, appliance,feature, home)),"w") )

    pred_df = pd.DataFrame(pred_test)
    pred_mean = df.ix[train_homes][['%s_%d' %(appliance, month) for month in range(start, stop)]].mean()
    pred_df.index = [home]
    #gt_df = pd.DataFrame(gt_test)
    #print pred_df, gt_df
    #error = (gt_df-pred_df).abs().div(gt_df).mul(100)
    #print error
    #accuracy_test = 100-error
    #accuracy_test[accuracy_test<0]=0
    gt_df =df.ix[home][['%s_%d' %(appliance, month) for month in range(start, stop)]]
    gt_df.index = pred_df.columns
    pred_mean.index = pred_df.columns
    #return accuracy_test.squeeze()
    return pred_df, pred_mean,gt_df
Beispiel #54
0
    rf = RFC(n_estimators=100, criterion='entropy')
    svm = SVC(kernel='rbf', probability=True)
    lr = LR()
    bl = [rf, lr, svm] #set of base learners
    for b in bl:
        b.fit(train_fd, train_label) #train each base classifier
        #print b

    test_fn = fn
    label = test_label
    class_ = np.unique(train_label)
    n_class = 32/2
    c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
    c.fit(test_fn)
    dist = np.sort(c.transform(test_fn))
    ex = dd(list) #example id, distance to centroid
    ex_id = dd(list) #example id for each C
    for i,j,k in zip(c.labels_, xrange(len(test_fn)), dist):
        ex[i].append([j,k[0]])
        ex_id[i].append(int(j))
    for i,j in ex.items():
        ex[i] = sorted(j, key=lambda x: x[-1]) #sort ex in each C by dist to centroid
    nb_c = dd()
    for exx in ex_id.values():
        exx = np.asarray(exx)
        for e in exx:
            nb_c[e] = exx[exx!=e] #create a dict of nb by C for each ex
    nb_f = [dd(), dd(), dd()]
    for b,n in zip(bl, nb_f):
        preds = b.predict(test_fd)
# In[25]:

data = df_clean_d.values
data


# #  D)   Plot and analysis of LifeMale and LifeFemale

# ## Application of KMeans on data

# In[177]:

clust = KMeans(n_clusters=3, n_init=10, init='k-means++', verbose=0)
Ckm = clust.fit_predict(data[:,3:5]) # Compute cluster centers and predict cluster index for each sample.
data_d = clust.transform(data[:,3:5]) # distance of each sample to every cluster center


# In[179]:

color=('g','b','r')
label = ('First cluster', 'Second cluster','Third cluster')
country0 = data[Ckm==0,0]
country1 = data[Ckm==1,0]
country2 = data[Ckm==2,0]



# ## Visualisation of the KMeans result on lifeMale and lifeFemale

# In[339]:
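# Editor's note: the body of this visualisation cell is missing from the
# excerpt. A hedged sketch of what it might contain, reusing the variables
# defined above (data, Ckm, color, label) and assuming columns 3 and 4 of
# `data` hold lifeMale and lifeFemale, as in the clustering cell.
import matplotlib.pyplot as plt

plt.figure()
for k in range(3):
    plt.scatter(data[Ckm == k, 3].astype(float), data[Ckm == k, 4].astype(float),
                c=color[k], label=label[k])
plt.xlabel('lifeMale')
plt.ylabel('lifeFemale')
plt.title('KMeans clusters (k=3)')
plt.legend()
plt.show()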
class ContentRecommend(object):
    create_date = datetime.utcnow()
    days = 15
    training_end = datetime.utcnow()
    db = None
    n_components = 20  # Number of dimension for TruncatedSVD
    account = ''
    svd = None
    normalizer = None
    svdX = None
    vectorizer = None
    training_docs = None
    threshold = 0.25
    k_means = None
    sil_score = -1.0
    cluster_count = 0
    range_n_clusters = [3, 4, 5, 6, 7, 8]
    missionId = ''

    def __init__(self, mission_id, db_name='plover_development', db_port=27017, db_host='localhost'):
        self.missionId = mission_id
        config.LOGGER.info('Instantiation recommender')
        self.connect(db_name, self.missionId, db_port=db_port, db_host=db_host)
        config.LOGGER.debug("Loading NLTK stopword list for English")

    def connect(self, db_name="plover_development", mission_id="", db_port=27017, db_host='localhost'):
        config.LOGGER.info('Instantiating recommender object for mission %s', mission_id)
        config.LOGGER.debug('Using database %s, host %s and port %s', db_name, db_host, db_port)

        try:
            client = MongoClient(db_host, db_port)
            self.db = client[db_name]
            profile = self.db.socialProfile.find_one({'mission': ObjectId(self.missionId)})
            self.account = self.db.linkedAccount.find_one({'_id': profile['account']})
            if self.account is None:
                config.LOGGER.debug('No such account id')
            self.setup_training(days=30)
        except Exception as ex:
            config.LOGGER.error("Error %s opening mission _id=%s", ex.message, self.missionId)


    def get_updates(self, maximum=100, conditions={}):
        documents = []
        config.LOGGER.info('Getting timeline updates for mission %s', self.missionId)
        config.LOGGER.debug(' query condition: %s', json.dumps(conditions, default=json_util.default))
        try:
            if self.account is None:
                config.LOGGER.debug('No account id')
            else:
                projection = {'keywords': 1, 'text': 1, 'externalID': 1, 'postTime': 1, 'sender': 1,
                              'quotedStatus': 1}
                updates = self.db.statusUpdate.find(conditions, projection).sort('postTime', pymongo.DESCENDING).limit(maximum)
                for tw in updates:
                    if 'quotedStatus' in tw:
                        tw['text'] += " QT " + tw['quotedStatus']['text']
                        for keyword in tw['quotedStatus']['keywords']:
                            tw['keywords'].append(keyword)
                    smu = self.db.socialMediaUser.find_one({'_id': tw['sender']}, {'screenNameLC': 1})
                    if smu is not None:
                        tw['keywords'].append(smu['screenNameLC'])
                    documents.append(tw)


        except Exception as ex:
            config.LOGGER.error("Error %s getting updates from timeline for mission %s", ex.message, self.missionId)

        config.LOGGER.debug('Found %d updates in timeline', len(documents))
        return documents



    def topics(self, n_components, n_out=7, n_weight=5, topic=None):
        config.LOGGER.info('Get topics from timeline for %s', self.account['profile']['preferredUsername'])
        results = []
        terms = self.vectorizer.get_feature_names()
        if topic is None:
            for k in range(n_components):
                idx = {i: abs(j) for i, j in enumerate(self.svd.components_[k])}
                sorted_idx = sorted(idx.items(), key=operator.itemgetter(1), reverse=True)
                weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])

                for item in sorted_idx[0:n_out - 1]:
                    results.append({'term': terms[item[0]], 'weight': item[1]})
        else:
            m = max(self.svd.components_[topic])
            idx = {i: abs(j) for i, j in enumerate(self.svd.components_[topic])}
            sorted_idx = sorted(idx.items(), key=operator.itemgetter(1), reverse=True)
            weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])

            for item in sorted_idx[0:n_out - 1]:
                results.append({'term': terms[item[0]], 'weight': item[1]})
        return results

    def get_componentCount(self, min=.05):
        count = 0
        for k in range(len(self.svd.components_)):
            idx = {i: abs(j) for i, j in enumerate(self.svd.components_[k])}
            sorted_idx = sorted(idx.items(), key=operator.itemgetter(1), reverse=True)
            kcount = 0
            for entry in (sorted_idx):
                if entry[1] > min:
                    kcount += 1
                else:
                    break
            if kcount > count:
                count = kcount
        return count

    def setup_training(self, end_time=datetime.utcnow(), days=15, maximum=1000):
        try:
            start = end_time - timedelta(minutes=days*24*60)
            condition = {'missions': ObjectId(self.missionId), '$or': [{'favorited': True}, {'sentByMe': True}],
                        'postTime': {'$gt': start, '$lte': end_time},
                         '$nor':[{'keywords':{'$exists':False}},{'keywords':{'$size':1}},{'keywords':{'$size':2}}]}
            self.training_docs = self.get_updates(conditions=condition, maximum=10000)
            config.LOGGER.info('Train model for %s', self.account['profile']['preferredUsername'])
            if len(self.training_docs) > 50:
                config.LOGGER.debug('Found %d updates for training from %s', len(self.training_docs),
                                    self.account['profile']['preferredUsername'])
                self.training_end = end_time
                self.days = days

                trainingRaw = [' '.join(doc['keywords']) for doc in self.training_docs]
                #trainingRaw = [tw['text'] for tw in self.training_docs]
                self.vectorizer = TfidfVectorizer(max_df=0.6, min_df=2, max_features=500, use_idf=True,
                                                  strip_accents='ascii', )
                X = self.vectorizer.fit_transform(trainingRaw)
                if X.shape[1] <= self.n_components:
                    self.n_components = X.shape[1] - 1
                config.LOGGER.debug('%d components found for  SVD', self.n_components)
                self.svd = TruncatedSVD(self.n_components, algorithm='arpack')
                self.svdX = self.svd.fit_transform(X)
                # self.n_components = self.get_componentCount(self.threshold)
                # self.svd = TruncatedSVD(self.n_components, random_state=10)
                # self.svdX = self.svd.fit_transform(X)
                self.normalizer = Normalizer().fit(self.svdX)
                self.svdX = self.normalizer.transform(self.svdX)

                # Clustering
                config.LOGGER.debug('Determining cluster count ')
                for n_clusters in self.range_n_clusters:
                    self.k_means = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10,
                                          verbose=False, random_state=10)
                    self.k_means.fit(self.svdX)
                    score = metrics.silhouette_score(self.svdX, self.k_means.labels_)
                    if score > self.sil_score:
                        self.sil_score = score
                        self.cluster_count = n_clusters

                config.LOGGER.debug('Cluster count is %d, Silhouette Coefficient is %0.3f  ', self.cluster_count,
                                    self.sil_score)
                self.k_means = KMeans(n_clusters=self.cluster_count, init='k-means++', max_iter=100, n_init=4,
                                      verbose=False, random_state=10)
                self.k_means.fit(self.svdX)

                # now get the top tweets for each cluster
                x_transform = self.k_means.transform(self.svdX)
                x_predict = self.k_means.predict(self.svdX)

                self.all_cluster_dist = []
                for i in range(self.cluster_count):
                    cluster_distance = []
                    for j in range(len(x_predict)):
                        if x_predict[j] == i and sum(self.svdX[j]) != 0.0:
                            cluster_distance.append(
                                {'index': j, 'cluster': i, 'dist': np.sqrt(sum([y * y for y in x_transform[j]]))})
                    newlist = sorted(cluster_distance, key=operator.itemgetter('dist'), reverse=False)
                    self.all_cluster_dist.append(newlist)

                #now verify this
                self.self_test()

            else:
                config.LOGGER.info('Too few training updates from user timeline')
                self.svd = None
        except Exception as ex:
            config.LOGGER.exception("Error %s computing SVD and kmeans from user history for mission %s", ex.message,
                                self.missionId)


    def self_test(self):
        try:
            config.LOGGER.info("Beginning self test. Better if it were cross validation but not enough data for that")
            results = self.find_recommendations(self.training_docs, top=10, quality=.001, min_examples=1)
            config.LOGGER.info("Self test found %d recommendations", len(results))
            for rec in results:
                if rec['text'] != rec['samples_svd'][0]:
                    config.LOGGER.error("Error training SVD for mission %s in tweet %s", self.missionId, rec['text'])
        except Exception as ex:
            config.LOGGER.error("Error %s in self test building training for mission %s", ex.message, self.missionId)


    def find_recommendations(self, tweets=[], top=10, quality=.1, min_examples=1):

        working_list = []
        result_list = []
        try:
            config.LOGGER.info('Generating content recommendations for user %s',
                               self.account['profile']['preferredUsername'])
            if self.svd is not None:
                if len(tweets) < top:
                    config.LOGGER.debug("Too few tweets passed for recommendation")
                    return []

                #tokenized_tweets = [' '.join(doc['newKeys']) for doc in tweets]
                #tweetText = [tw['text'] for tw in tweets]
                tweetText = [' '.join(tw['keywords']) for tw in tweets]
                Y = self.vectorizer.transform(tweetText)
                svdY = self.svd.transform(Y)
                svdY = self.normalizer.transform(svdY)
                y_transform = self.k_means.transform(svdY)
                # terms = self.vectorizer.get_feature_names()

                selected_updates = []
                y_predict = self.k_means.predict(svdY)

                for i in range(self.cluster_count):
                    cluster_distance = []
                    for j in range(len(y_predict)):
                        if y_predict[j] == i and sum(svdY[j]) != 0.0:
                            cluster_distance.append(
                                {'index': j, 'cluster': i, 'dist': np.sqrt(sum([y * y for y in y_transform[j]]))})
                    newlist = sorted(cluster_distance, key=operator.itemgetter('dist'), reverse=False)
                    selected_updates.append(newlist)

                temp = [entry for entry in it.izip_longest(*selected_updates)]
                clean_list = filter(lambda x: x is not None, [entry for tuple in temp for entry in tuple])[0:top]
                clean_list_svdY = [svdY[entry['index']] for entry in clean_list]
                config.LOGGER.debug("Found %i possible matches in topic clusters " % len(clean_list_svdY))

                neigh = NearestNeighbors()
                neigh.fit(self.svdX)
                if len(clean_list_svdY) > 0:
                    distances, svd_neighbors = neigh.radius_neighbors(X=clean_list_svdY, radius=quality)
                else:
                    svd_neighbors =[]

                examples=[]
                for idx, entry in enumerate(svd_neighbors):
                    if len(entry) >= min_examples:
                        config.LOGGER.debug("Suggested tweet has %d examples" % len(entry))
                        original = tweets[clean_list[idx]['index']]['text']
                        for jdx, neighbor in enumerate(entry):
                            examples.append({'text':self.training_docs[neighbor]['text'], 'dist':distances[idx][jdx]})
                        sorted_examples = sorted(examples, key=operator.itemgetter('dist'), reverse=False)
                        # collect the example texts under a new name so the min_examples parameter is not overwritten
                        example_texts = [item['text'] for item in sorted_examples][:min_examples]
                        t1 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][0]['index']]['text']
                        t2 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][1]['index']]['text']
                        working_list.append({"dist": sorted_examples[0]['dist'], "text": original,
                                                     "id": str(tweets[clean_list[idx]['index']]['_id']),
                                                     "sender": str(tweets[clean_list[idx]['index']]['sender']),
                                                     'samples_svd': example_texts, 'samples_cluster':[t1,t2]})

                result_list = sorted(working_list, key=operator.itemgetter('dist'), reverse=False)
            return result_list[:top]

        except Exception as ex:
            config.LOGGER.error("Error %s computing recommendations for mission %s", ex.message, self.missionId)
            return []

    def recommend_from_timeline(self, end_time=datetime.utcnow(), minutes_prior=15, top=10, quality=.1, min_examples=1):

        try:
            config.LOGGER.info("generating content recommendation from timeline for %s" % self.account['profile']['preferredUsername'])
            results = []
            if self.svd is not None:
                start = end_time - timedelta(minutes=minutes_prior)
                condition = {'missions': ObjectId(self.missionId), '$or': [{'favorited': False}, {'sentByMe': False}, {'mentionsMe' : False},{'retweetOfMe':False}],
                            'postTime': {'$gt': start, '$lte': end_time},
                              '$nor':[{'keywords':{'$exists':False}},{'keywords':{'$size':1}},{'keywords':{'$size':2}}]}
                tweets = self.get_updates(maximum=10000, conditions=condition)
                config.LOGGER.debug('%d updates from account timeline read from database', len(tweets))
                results = self.find_recommendations(tweets, top=top, quality=quality, min_examples=min_examples)
                config.LOGGER.debug('%d recommendations found for mission %s', len(results), self.missionId)
            return results[:top]

        except Exception as ex:
            config.LOGGER.error("Error %s computing recommendations for mission %s", ex.message, self.missionId)
            return []

# # Reduce dimensions and apply clustering

# In[6]:

pca = PCA(n_components=50)
X = pca.fit_transform(array(image_features))


# In[7]:

kmeans = KMeans(n_clusters = num_clusters, n_init = 100, n_jobs=1)
kmeans.fit(X)
clusters = kmeans.predict(X)
clusters_space = kmeans.transform(X)


# In[8]:

image_paths_hack = list(hog_features["image_paths"])
image_paths_rel = [image_path_hack.split("/")[-2] + "/" + image_path_hack.split("/")[-1] for image_path_hack in image_paths_hack]


# In[9]:

res_df = pd.DataFrame({'file_names' : image_paths_rel})


# In[10]:
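# Editor's note: the body of this cell is not included in the excerpt. A
# hedged guess at a natural next step (an assumption, not the original code):
# attach each image's cluster label and its distance to that cluster's
# centroid (taken from kmeans.transform above) to res_df.
import numpy as np

res_df['cluster'] = clusters
res_df['dist_to_centroid'] = clusters_space[np.arange(len(clusters)), clusters]
res_df = res_df.sort_values(['cluster', 'dist_to_centroid'])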
Beispiel #58
0
def cluster_driver(a_driver):
    
#    print a_driver['DStats']
#    print "#############################DStats Above#################################ValueError: zero-size array to reduction operation minimum which has no identity#################"

#    sys.stdout = open('a_projpath' +'output.txt','w')
#    print a_driver['DStats']
    
    X = StandardScaler().fit_transform(a_driver['DStats'])
    

    
#    print X
#    print "DStats are.....::" , a_driver['DStats']
#    print "X is...........::" ,['AvgDistDel', 'AvgACosDel', 'SDevDistDel', 'SDevACosDel','TotalTime','SkewDistDel','SkewACosDel'] X
#    print "############################Scaled X Above###################################################"
    
    pca = PCA(n_components=3)
    Xpca = pca.fit(X).transform(X)
    
    if plotflag == True:
        
        fig = scatterplot_matrix(np.transpose(Xpca)
                                                , ['PC1'
                                                , 'PC2'
                                                , 'PC3'
#                                                , 'PC4'
#                                                ,'PC5'
                                                ]
                                                ,linestyle='none', marker='o', color='black', mfc='none')
        fig.suptitle('Simple Scatterplot Matrix')
        plt.show()
        

    # Cluster the projected trips and take each trip's distance to its nearest centroid.
    db = KMeans(n_clusters=3, n_jobs=-1).fit(Xpca)
    minDist = db.transform(Xpca).min(axis=1)

    # Normalize the distances to z-scores.
    zMinDist = sp.stats.mstats.zscore(minDist)

    # Fit a normal to the z-scores and use its scale parameter to score each trip under an
    # exponential density: the larger the distance, the lower the probability.
    # (An exponential fit and Kolmogorov-Smirnov checks were also tried here but are disabled.)
    tef = sp.stats.norm.fit(zMinDist)
    probZMinDist = sp.stats.expon.pdf(zMinDist, scale=1 / tef[1])
    
    # Disabled alternatives kept for reference: scoring by each trip's distance from the
    # PCA mean, and clustering with DBSCAN or AgglomerativeClustering instead of KMeans.

    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Variance explained: %0.3f' % sum(pca.explained_variance_ratio_))

    return probZMinDist  # per-trip probabilities derived from the KMeans distance z-scores
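
The probabilities returned by `cluster_driver` are only useful once thresholded. A minimal sketch of one way a caller might flag unusual trips, assuming `a_driver` is a dict whose 'DStats' entry is the per-trip feature matrix the function expects; the 0.05 cutoff is arbitrary.

import numpy as np

plotflag = False  # module-level flag the function checks before plotting
trip_probs = cluster_driver(a_driver)
outlier_trips = np.where(trip_probs < 0.05)[0]  # arbitrary cutoff
print('Flagged %d of %d trips as possible outliers' % (len(outlier_trips), len(trip_probs)))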
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)



#number of clusters
clusters = 2


km = KMeans(n_clusters=clusters, init='k-means++', max_iter=100, n_init=10, verbose=opts.verbose)

result = csr_matrix(result)
km.fit(result)
if not os.path.exists('../../data/result'):
	os.mkdir('../../data/result')
dest = '../../data/result/raw_result'
if not os.path.exists(dest):
	os.mkdir(dest)
else:
	shutil.rmtree(dest)
	os.mkdir(dest)
#order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(clusters):
	print("Cluster %d:" % i, end='')
	d = km.transform(result)[:, i]
	# Take the 10 documents closest to centroid i (smallest distances).
	ind = np.argsort(d)[:10]
	with open(dest + '/cluster' + str(i), 'w') as f:
		for index in ind:
			print(' %s' % index, end='\n')
			f.write(dataset.filenames[index] + '\n')
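
The commented-out `order_centroids` line above hints at the usual top-terms-per-cluster report. A minimal sketch of that report, assuming `result` was produced by the fitted `tf_vectorizer` used for the LDA model above (if it came from a different vectorizer, substitute that one).

# Print the 10 highest-weighted terms of each cluster centroid.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = tf_vectorizer.get_feature_names()
for i in range(clusters):
	print("Cluster %d terms: %s" % (i, ' '.join(terms[t] for t in order_centroids[i, :10])))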
Beispiel #60
0
            "cc_team_score_frame": cc_team_score_frame,
            "to_team_score_frame": to_team_score_frame,
        }
    )
    return df_result
    # return (cc_score_frame,to_score_frame,to_team_score_frame)


if __name__ == "__main__":
    t = main(50000)
    #    t =  main()
    t = t.fillna(0)

    kclust = KMeans(n_clusters=4)
    kclust.fit(t)
    clustered = kclust.transform(t)

    X = t.values  # the positional indexing below needs a plain array, not a DataFrame
    fignum = 1
    fig = plt.figure(fignum, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)

    plt.cla()
    labels = kclust.labels_

    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))

    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_zticklabels([])